Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25,420 changes: 25,420 additions & 0 deletions sgl-kernel/3rdparty/nlohmann/json.hpp

Large diffs are not rendered by default.

187 changes: 187 additions & 0 deletions sgl-kernel/3rdparty/nlohmann/json_fwd.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
// __ _____ _____ _____
// __| | __| | | | JSON for Modern C++
// | | |__ | | | | | | version 3.11.3
// |_____|_____|_____|_|___| https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT

#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_
#define INCLUDE_NLOHMANN_JSON_FWD_HPP_

#include <cstdint> // int64_t, uint64_t
#include <map> // map
#include <memory> // allocator
#include <string> // string
#include <vector> // vector

// #include <nlohmann/detail/abi_macros.hpp>
// __ _____ _____ _____
// __| | __| | | | JSON for Modern C++
// | | |__ | | | | | | version 3.11.3
// |_____|_____|_____|_|___| https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT



// This file contains all macro definitions affecting or depending on the ABI

// Emit a #warning when a different version of the library was already
// included in this translation unit (detected via its version macros),
// unless the check is explicitly disabled.
#ifndef JSON_SKIP_LIBRARY_VERSION_CHECK
#if defined(NLOHMANN_JSON_VERSION_MAJOR) && defined(NLOHMANN_JSON_VERSION_MINOR) && defined(NLOHMANN_JSON_VERSION_PATCH)
#if NLOHMANN_JSON_VERSION_MAJOR != 3 || NLOHMANN_JSON_VERSION_MINOR != 11 || NLOHMANN_JSON_VERSION_PATCH != 3
#warning "Already included a different version of the library!"
#endif
#endif
#endif

#define NLOHMANN_JSON_VERSION_MAJOR 3 // NOLINT(modernize-macro-to-enum)
#define NLOHMANN_JSON_VERSION_MINOR 11 // NOLINT(modernize-macro-to-enum)
#define NLOHMANN_JSON_VERSION_PATCH 3 // NOLINT(modernize-macro-to-enum)

// Feature toggles default to off; users may pre-define them before inclusion.
#ifndef JSON_DIAGNOSTICS
#define JSON_DIAGNOSTICS 0
#endif

#ifndef JSON_DIAGNOSTIC_POSITIONS
#define JSON_DIAGNOSTIC_POSITIONS 0
#endif

#ifndef JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON
#define JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON 0
#endif

// Each enabled feature contributes a suffix to the inline namespace name
// constructed below, so differently-configured copies of the library
// produce distinct symbol names.
#if JSON_DIAGNOSTICS
#define NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS _diag
#else
#define NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS
#endif

#if JSON_DIAGNOSTIC_POSITIONS
#define NLOHMANN_JSON_ABI_TAG_DIAGNOSTIC_POSITIONS _dp
#else
#define NLOHMANN_JSON_ABI_TAG_DIAGNOSTIC_POSITIONS
#endif

#if JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON
#define NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON _ldvcmp
#else
#define NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON
#endif

#ifndef NLOHMANN_JSON_NAMESPACE_NO_VERSION
#define NLOHMANN_JSON_NAMESPACE_NO_VERSION 0
#endif

// Construct the namespace ABI tags component
#define NLOHMANN_JSON_ABI_TAGS_CONCAT_EX(a, b, c) json_abi ## a ## b ## c
#define NLOHMANN_JSON_ABI_TAGS_CONCAT(a, b, c) \
NLOHMANN_JSON_ABI_TAGS_CONCAT_EX(a, b, c)

#define NLOHMANN_JSON_ABI_TAGS \
NLOHMANN_JSON_ABI_TAGS_CONCAT( \
NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS, \
NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON, \
NLOHMANN_JSON_ABI_TAG_DIAGNOSTIC_POSITIONS)

// Construct the namespace version component
#define NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT_EX(major, minor, patch) \
_v ## major ## _ ## minor ## _ ## patch
#define NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT(major, minor, patch) \
NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT_EX(major, minor, patch)

#if NLOHMANN_JSON_NAMESPACE_NO_VERSION
#define NLOHMANN_JSON_NAMESPACE_VERSION
#else
#define NLOHMANN_JSON_NAMESPACE_VERSION \
NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT(NLOHMANN_JSON_VERSION_MAJOR, \
NLOHMANN_JSON_VERSION_MINOR, \
NLOHMANN_JSON_VERSION_PATCH)
#endif

// Combine namespace components
#define NLOHMANN_JSON_NAMESPACE_CONCAT_EX(a, b) a ## b
#define NLOHMANN_JSON_NAMESPACE_CONCAT(a, b) \
NLOHMANN_JSON_NAMESPACE_CONCAT_EX(a, b)

// Fully-qualified namespace; with default settings this expands to
// nlohmann::json_abi_v3_11_3.
#ifndef NLOHMANN_JSON_NAMESPACE
#define NLOHMANN_JSON_NAMESPACE \
nlohmann::NLOHMANN_JSON_NAMESPACE_CONCAT( \
NLOHMANN_JSON_ABI_TAGS, \
NLOHMANN_JSON_NAMESPACE_VERSION)
#endif

// Open/close an ABI-tagged, versioned inline namespace inside ::nlohmann.
// Because the inner namespace is inline, its contents remain reachable
// simply as nlohmann::*.
#ifndef NLOHMANN_JSON_NAMESPACE_BEGIN
#define NLOHMANN_JSON_NAMESPACE_BEGIN \
namespace nlohmann \
{ \
inline namespace NLOHMANN_JSON_NAMESPACE_CONCAT( \
NLOHMANN_JSON_ABI_TAGS, \
NLOHMANN_JSON_NAMESPACE_VERSION) \
{
#endif

#ifndef NLOHMANN_JSON_NAMESPACE_END
#define NLOHMANN_JSON_NAMESPACE_END \
} /* namespace (inline namespace) NOLINT(readability/namespace) */ \
} // namespace nlohmann
#endif


/*!
@brief namespace for Niels Lohmann
@see https://github.com/nlohmann
@since version 1.0.0
*/
NLOHMANN_JSON_NAMESPACE_BEGIN

// Everything below is only declared, not defined, in this forward header.

/*!
@brief default JSONSerializer template argument

This serializer ignores the template arguments and uses ADL
([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl))
for serialization.
*/
template<typename T = void, typename SFINAE = void>
struct adl_serializer;

/// a class to store JSON values
/// @sa https://json.nlohmann.me/api/basic_json/
template<template<typename U, typename V, typename... Args> class ObjectType =
std::map,
template<typename U, typename... Args> class ArrayType = std::vector,
class StringType = std::string, class BooleanType = bool,
class NumberIntegerType = std::int64_t,
class NumberUnsignedType = std::uint64_t,
class NumberFloatType = double,
template<typename U> class AllocatorType = std::allocator,
template<typename T, typename SFINAE = void> class JSONSerializer =
adl_serializer,
class BinaryType = std::vector<std::uint8_t>, // cppcheck-suppress syntaxError
class CustomBaseClass = void>
class basic_json;

/// @brief JSON Pointer defines a string syntax for identifying a specific value within a JSON document
/// @sa https://json.nlohmann.me/api/json_pointer/
template<typename RefStringType>
class json_pointer;

/*!
@brief default specialization
@sa https://json.nlohmann.me/api/json/
*/
using json = basic_json<>;

/// @brief a minimal map-like container that preserves insertion order
/// @sa https://json.nlohmann.me/api/ordered_map/
template<class Key, class T, class IgnoredLess, class Allocator>
struct ordered_map;

/// @brief specialization that maintains the insertion order of object keys
/// @sa https://json.nlohmann.me/api/ordered_json/
using ordered_json = basic_json<nlohmann::ordered_map>;

NLOHMANN_JSON_NAMESPACE_END

#endif // INCLUDE_NLOHMANN_JSON_FWD_HPP_
1 change: 1 addition & 0 deletions sgl-kernel/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ add_library(_kernels SHARED
src/sgl-kernel/csrc/trt_reduce_kernel.cu
src/sgl-kernel/csrc/moe_align_kernel.cu
src/sgl-kernel/csrc/int8_gemm_kernel.cu
src/sgl-kernel/csrc/fp8_gemm_kernel.cu
src/sgl-kernel/csrc/sgl_kernel_ops.cu
)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"M=1": 24,
"M=1024": 61,
"M=128": 13,
"M=16": 24,
"M=2048": 67,
"M=256": 13,
"M=4096": 66,
"M=512": 61,
"M=64": 17
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"M=1": 24,
"M=1024": 61,
"M=128": 13,
"M=16": 24,
"M=2048": 64,
"M=256": 13,
"M=4096": 64,
"M=512": 61,
"M=64": 17
}
3 changes: 3 additions & 0 deletions sgl-kernel/bench_fp8_res/results.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<html><body>
<img src="int8 scaled matmul.png" />
</body></html>
10 changes: 10 additions & 0 deletions sgl-kernel/benchmark/89_fp8_bf16.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"M=1,N=4096,K=8192": 1,
"M=1024,N=4096,K=8192": 5,
"M=128,N=4096,K=8192": 3,
"M=16,N=4096,K=8192": 1,
"M=2048,N=4096,K=8192": 5,
"M=256,N=4096,K=8192": 4,
"M=512,N=4096,K=8192": 5,
"M=64,N=4096,K=8192": 1
}
10 changes: 10 additions & 0 deletions sgl-kernel/benchmark/89_fp8_bf16_256解决.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"M=1,N=4096,K=8192": 75,
"M=1024,N=4096,K=8192": 33,
"M=128,N=4096,K=8192": 15,
"M=16,N=4096,K=8192": 88,
"M=2048,N=4096,K=8192": 32,
"M=256,N=4096,K=8192": 12,
"M=512,N=4096,K=8192": 31,
"M=64,N=4096,K=8192": 90
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"M=1": 24,
"M=1024": 61,
"M=128": 14,
"M=16": 24,
"M=2048": 71,
"M=256": 38,
"M=4096": 66,
"M=512": 61,
"M=64": 13
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"M=1": 17,
"M=1024": 61,
"M=128": 13,
"M=16": 17,
"M=2048": 64,
"M=256": 13,
"M=4096": 66,
"M=512": 61,
"M=64": 17
}
118 changes: 118 additions & 0 deletions sgl-kernel/benchmark/bench_fp8_gemm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import torch
import torch.nn.functional as F
import triton

from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
from sgl_kernel import fp8_scaled_mm as sgl_scaled_mm
from sgl_kernel import fp8_scaled_mm_profile as sgl_scaled_mm_profile
import time
import vllm
import triton

def get_sm_version():
    """Return the current CUDA device's compute capability as an int (major*10 + minor)."""
    capability = torch.cuda.get_device_capability(torch.cuda.current_device())
    return capability[0] * 10 + capability[1]


def get_device_name():
    """Return the human-readable name of the active CUDA device."""
    current = torch.cuda.current_device()
    return torch.cuda.get_device_name(current)

def get_config_filename(dtype="bf16"):
    """Build the per-GPU tuning-config filename, e.g. 'sm90_fp8_bf16.json'."""
    return "sm{}_fp8_{}.json".format(get_sm_version(), dtype)

def do_profile(dtype="bf16", n=4096, k=8192):
    """Run the sgl profiling kernel for a sweep of M sizes at fixed (n, k).

    Quantizes fp32 inputs to fp8 via vllm's quantizer, then invokes
    sgl_scaled_mm_profile for each M so the kernel can record timings.
    """
    for m in (1, 16, 64, 128, 256, 512, 1024, 2048, 4096):
        lhs = torch.ones((m, k), device="cuda") * 5.0
        rhs = torch.ones((n, k), device="cuda") * 5.0
        lhs_scale = torch.randn((m,), device="cuda", dtype=torch.float32)
        rhs_scale = torch.randn((n,), device="cuda", dtype=torch.float32)
        lhs_fp8, lhs_scale_fp8 = vllm_scaled_fp8_quant(lhs, lhs_scale)
        rhs_fp8, rhs_scale_fp8 = vllm_scaled_fp8_quant(rhs, rhs_scale)
        # B is quantized as (n, k) and handed over transposed, matching benchmark().
        sgl_scaled_mm_profile(lhs_fp8, rhs_fp8.t(), lhs_scale_fp8, rhs_scale_fp8, dtype, bias=None)

@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048],
        x_log=False,
        line_arg="provider",
        line_vals=["vllm-fp8-fp16", "vllm-fp8-bf16", "sglang-fp8-fp16", "sglang-fp8-bf16",
                   "sglang-fp8-profile-fp16", "sglang-fp8-profile-bf16", "torch-fp8"],
        line_names=["vllm-fp8-fp16", "vllm-fp8-bf16", "sglang-fp8-fp16", "sglang-fp8-bf16",
                    "sglang-fp8-profile-fp16", "sglang-fp8-profile-bf16", "torch-fp8"],
        styles=[("green", "-"), ("green", "--"), ("blue", "-"), ("blue", "--"),
                ("red", "-"), ("red", "--"), ("purple", "-")],
        ylabel="GB/s",
        # Fix: this benchmark measures fp8 GEMMs; "int8 scaled matmul" was
        # copy-pasted from the int8 benchmark. plot_name also determines the
        # saved PNG's filename under save_path.
        plot_name="fp8 scaled matmul",
        args={},
    )
)
def benchmark(batch_size, provider):
    """Benchmark one fp8 scaled-matmul provider at M=batch_size, N=4096, K=8192.

    Args:
        batch_size: the M dimension of the GEMM.
        provider: one of the line_vals above; selects backend and output dtype.

    Returns:
        (median, max, min) throughput derived from triton.testing.do_bench
        timings via the gbps formula below.
    """
    M, N, K = batch_size, 4096, 8192
    a = torch.ones((M, K), device="cuda") * 5.0
    b = torch.ones((N, K), device="cuda") * 5.0
    scale_a = torch.randn((M,), device="cuda", dtype=torch.float32)
    scale_b = torch.randn((N,), device="cuda", dtype=torch.float32)
    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
    b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
    # B is quantized as (N, K); transpose to (K, N) for the matmul.
    b_fp8 = b_fp8.t()
    quantiles = [0.5, 0.2, 0.8]

    dtype = torch.float16 if "fp16" in provider else torch.bfloat16
    bias = torch.randn((N,), device="cuda", dtype=dtype)
    if "vllm-fp8" in provider:
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: vllm_scaled_mm(
                a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype, bias=bias
            ),
            quantiles=quantiles,
        )
    elif "sglang-fp8-profile" in provider:
        # Warm the profile data for all M sizes before timing the profiled path.
        do_profile(dtype, N, K)
        try:
            ms, min_ms, max_ms = triton.testing.do_bench(
                lambda: sgl_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype, bias=bias, is_profile=True),
                quantiles=quantiles,
            )
        except RuntimeError as e:
            # Keep the sweep running if the profiled kernel fails; report
            # 1 ms sentinels so the failure is visible in the plot.
            print("Error details:", e)
            ms, min_ms, max_ms = 1, 1, 1
    elif "sglang-fp8" in provider:
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: sgl_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype, bias=bias, is_profile=False),
            quantiles=quantiles,
        )
    elif provider == "torch-fp8":
        # Expand per-row/per-column scales to 2-D for torch._scaled_mm.
        scale_a_2d = scale_a_fp8.float().unsqueeze(1)  # [M, 1]
        scale_b_2d = scale_b_fp8.float().unsqueeze(0)  # [1, N]
        try:
            # Fix: b_fp8 is already transposed to (K, N), so the output's
            # second dimension is b_fp8.shape[1] (= N). The original used
            # shape[0] (= K), allocating a (M, K) buffer for a (M, N) result.
            out = torch.empty(
                (a_fp8.shape[0], b_fp8.shape[1]), device="cuda", dtype=torch.bfloat16
            )
            ms, min_ms, max_ms = triton.testing.do_bench(
                lambda: torch._scaled_mm(
                    a_fp8,
                    b_fp8,
                    out=out,
                    out_dtype=torch.bfloat16,
                    scale_a=scale_a_2d,
                    scale_b=scale_b_2d,
                    use_fast_accum=True,
                    bias=bias,
                ),
                quantiles=quantiles,
            )
        except RuntimeError as e:
            print("Error details:", e)
            raise
    # NOTE(review): formula inherited from the int8 benchmark — it scales the
    # GEMM op count (2*M*N*K + M*N) by a.element_size() (the 4-byte fp32
    # staging tensor), so the "GB/s" axis is a relative figure; confirm the
    # intended units.
    gbps = lambda ms: (2 * M * N * K + M * N) * a.element_size() * 1e-9 / (ms * 1e-3)
    return gbps(ms), gbps(max_ms), gbps(min_ms)


benchmark.run(print_data=True, show_plots=True, save_path="bench_fp8_res")
3 changes: 3 additions & 0 deletions sgl-kernel/benchmark/bench_fp8_res/results.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<html><body>
<img src="int8 scaled matmul.png" />
</body></html>
3 changes: 3 additions & 0 deletions sgl-kernel/benchmark/bench_int8_res/results.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<html><body>
<img src="int8 scaled matmul.png" />
</body></html>
Empty file added sgl-kernel/outp
Empty file.
Loading