Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25,420 changes: 25,420 additions & 0 deletions sgl-kernel/3rdparty/nlohmann/json.hpp

Large diffs are not rendered by default.

187 changes: 187 additions & 0 deletions sgl-kernel/3rdparty/nlohmann/json_fwd.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
// __ _____ _____ _____
// __| | __| | | | JSON for Modern C++
// | | |__ | | | | | | version 3.11.3
// |_____|_____|_____|_|___| https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT

#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_
#define INCLUDE_NLOHMANN_JSON_FWD_HPP_

#include <cstdint> // int64_t, uint64_t
#include <map> // map
#include <memory> // allocator
#include <string> // string
#include <vector> // vector

// #include <nlohmann/detail/abi_macros.hpp>
// __ _____ _____ _____
// __| | __| | | | JSON for Modern C++
// | | |__ | | | | | | version 3.11.3
// |_____|_____|_____|_|___| https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013 - 2024 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT



// This file contains all macro definitions affecting or depending on the ABI

// Emit a #warning when a different version of the library was already
// included in this translation unit (detected via its version macros),
// unless the check is explicitly disabled.
#ifndef JSON_SKIP_LIBRARY_VERSION_CHECK
#if defined(NLOHMANN_JSON_VERSION_MAJOR) && defined(NLOHMANN_JSON_VERSION_MINOR) && defined(NLOHMANN_JSON_VERSION_PATCH)
#if NLOHMANN_JSON_VERSION_MAJOR != 3 || NLOHMANN_JSON_VERSION_MINOR != 11 || NLOHMANN_JSON_VERSION_PATCH != 3
#warning "Already included a different version of the library!"
#endif
#endif
#endif

#define NLOHMANN_JSON_VERSION_MAJOR 3 // NOLINT(modernize-macro-to-enum)
#define NLOHMANN_JSON_VERSION_MINOR 11 // NOLINT(modernize-macro-to-enum)
#define NLOHMANN_JSON_VERSION_PATCH 3 // NOLINT(modernize-macro-to-enum)

// Feature toggles default to off; users may pre-define them before inclusion.
#ifndef JSON_DIAGNOSTICS
#define JSON_DIAGNOSTICS 0
#endif

#ifndef JSON_DIAGNOSTIC_POSITIONS
#define JSON_DIAGNOSTIC_POSITIONS 0
#endif

#ifndef JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON
#define JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON 0
#endif

// Each enabled feature contributes a suffix to the inline namespace name
// constructed below, so differently-configured copies of the library
// produce distinct symbol names.
#if JSON_DIAGNOSTICS
#define NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS _diag
#else
#define NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS
#endif

#if JSON_DIAGNOSTIC_POSITIONS
#define NLOHMANN_JSON_ABI_TAG_DIAGNOSTIC_POSITIONS _dp
#else
#define NLOHMANN_JSON_ABI_TAG_DIAGNOSTIC_POSITIONS
#endif

#if JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON
#define NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON _ldvcmp
#else
#define NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON
#endif

#ifndef NLOHMANN_JSON_NAMESPACE_NO_VERSION
#define NLOHMANN_JSON_NAMESPACE_NO_VERSION 0
#endif

// Construct the namespace ABI tags component
#define NLOHMANN_JSON_ABI_TAGS_CONCAT_EX(a, b, c) json_abi ## a ## b ## c
#define NLOHMANN_JSON_ABI_TAGS_CONCAT(a, b, c) \
NLOHMANN_JSON_ABI_TAGS_CONCAT_EX(a, b, c)

#define NLOHMANN_JSON_ABI_TAGS \
NLOHMANN_JSON_ABI_TAGS_CONCAT( \
NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS, \
NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON, \
NLOHMANN_JSON_ABI_TAG_DIAGNOSTIC_POSITIONS)

// Construct the namespace version component
#define NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT_EX(major, minor, patch) \
_v ## major ## _ ## minor ## _ ## patch
#define NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT(major, minor, patch) \
NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT_EX(major, minor, patch)

#if NLOHMANN_JSON_NAMESPACE_NO_VERSION
#define NLOHMANN_JSON_NAMESPACE_VERSION
#else
#define NLOHMANN_JSON_NAMESPACE_VERSION \
NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT(NLOHMANN_JSON_VERSION_MAJOR, \
NLOHMANN_JSON_VERSION_MINOR, \
NLOHMANN_JSON_VERSION_PATCH)
#endif

// Combine namespace components
#define NLOHMANN_JSON_NAMESPACE_CONCAT_EX(a, b) a ## b
#define NLOHMANN_JSON_NAMESPACE_CONCAT(a, b) \
NLOHMANN_JSON_NAMESPACE_CONCAT_EX(a, b)

// Fully-qualified namespace; with default settings this expands to
// nlohmann::json_abi_v3_11_3.
#ifndef NLOHMANN_JSON_NAMESPACE
#define NLOHMANN_JSON_NAMESPACE \
nlohmann::NLOHMANN_JSON_NAMESPACE_CONCAT( \
NLOHMANN_JSON_ABI_TAGS, \
NLOHMANN_JSON_NAMESPACE_VERSION)
#endif

// Open/close an ABI-tagged, versioned inline namespace inside ::nlohmann.
// Because the inner namespace is inline, its contents remain reachable
// simply as nlohmann::*.
#ifndef NLOHMANN_JSON_NAMESPACE_BEGIN
#define NLOHMANN_JSON_NAMESPACE_BEGIN \
namespace nlohmann \
{ \
inline namespace NLOHMANN_JSON_NAMESPACE_CONCAT( \
NLOHMANN_JSON_ABI_TAGS, \
NLOHMANN_JSON_NAMESPACE_VERSION) \
{
#endif

#ifndef NLOHMANN_JSON_NAMESPACE_END
#define NLOHMANN_JSON_NAMESPACE_END \
} /* namespace (inline namespace) NOLINT(readability/namespace) */ \
} // namespace nlohmann
#endif


/*!
@brief namespace for Niels Lohmann
@see https://github.com/nlohmann
@since version 1.0.0
*/
NLOHMANN_JSON_NAMESPACE_BEGIN

// Everything below is only declared, not defined, in this forward header.

/*!
@brief default JSONSerializer template argument

This serializer ignores the template arguments and uses ADL
([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl))
for serialization.
*/
template<typename T = void, typename SFINAE = void>
struct adl_serializer;

/// a class to store JSON values
/// @sa https://json.nlohmann.me/api/basic_json/
template<template<typename U, typename V, typename... Args> class ObjectType =
std::map,
template<typename U, typename... Args> class ArrayType = std::vector,
class StringType = std::string, class BooleanType = bool,
class NumberIntegerType = std::int64_t,
class NumberUnsignedType = std::uint64_t,
class NumberFloatType = double,
template<typename U> class AllocatorType = std::allocator,
template<typename T, typename SFINAE = void> class JSONSerializer =
adl_serializer,
class BinaryType = std::vector<std::uint8_t>, // cppcheck-suppress syntaxError
class CustomBaseClass = void>
class basic_json;

/// @brief JSON Pointer defines a string syntax for identifying a specific value within a JSON document
/// @sa https://json.nlohmann.me/api/json_pointer/
template<typename RefStringType>
class json_pointer;

/*!
@brief default specialization
@sa https://json.nlohmann.me/api/json/
*/
using json = basic_json<>;

/// @brief a minimal map-like container that preserves insertion order
/// @sa https://json.nlohmann.me/api/ordered_map/
template<class Key, class T, class IgnoredLess, class Allocator>
struct ordered_map;

/// @brief specialization that maintains the insertion order of object keys
/// @sa https://json.nlohmann.me/api/ordered_json/
using ordered_json = basic_json<nlohmann::ordered_map>;

NLOHMANN_JSON_NAMESPACE_END

#endif // INCLUDE_NLOHMANN_JSON_FWD_HPP_
1 change: 1 addition & 0 deletions sgl-kernel/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ add_library(_kernels SHARED
src/sgl-kernel/csrc/trt_reduce_kernel.cu
src/sgl-kernel/csrc/moe_align_kernel.cu
src/sgl-kernel/csrc/int8_gemm_kernel.cu
src/sgl-kernel/csrc/fp8_gemm_kernel.cu
src/sgl-kernel/csrc/sgl_kernel_ops.cu
)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"M=1": 24,
"M=1024": 61,
"M=128": 13,
"M=16": 24,
"M=2048": 67,
"M=256": 13,
"M=4096": 66,
"M=512": 61,
"M=64": 17
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"M=1": 24,
"M=1024": 61,
"M=128": 13,
"M=16": 24,
"M=2048": 64,
"M=256": 13,
"M=4096": 64,
"M=512": 61,
"M=64": 17
}
3 changes: 3 additions & 0 deletions sgl-kernel/bench_fp8_res/results.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<html><body>
<img src="int8 scaled matmul.png" />
</body></html>
10 changes: 10 additions & 0 deletions sgl-kernel/benchmark/89_fp8_bf16.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"M=1,N=4096,K=8192": 1,
"M=1024,N=4096,K=8192": 5,
"M=128,N=4096,K=8192": 3,
"M=16,N=4096,K=8192": 1,
"M=2048,N=4096,K=8192": 5,
"M=256,N=4096,K=8192": 4,
"M=512,N=4096,K=8192": 5,
"M=64,N=4096,K=8192": 1
}
10 changes: 10 additions & 0 deletions sgl-kernel/benchmark/89_fp8_bf16_256解决.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"M=1,N=4096,K=8192": 75,
"M=1024,N=4096,K=8192": 33,
"M=128,N=4096,K=8192": 15,
"M=16,N=4096,K=8192": 88,
"M=2048,N=4096,K=8192": 32,
"M=256,N=4096,K=8192": 12,
"M=512,N=4096,K=8192": 31,
"M=64,N=4096,K=8192": 90
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"M=1": 24,
"M=1024": 61,
"M=128": 14,
"M=16": 24,
"M=2048": 71,
"M=256": 38,
"M=4096": 66,
"M=512": 61,
"M=64": 13
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"M=1": 17,
"M=1024": 61,
"M=128": 13,
"M=16": 17,
"M=2048": 64,
"M=256": 13,
"M=4096": 66,
"M=512": 61,
"M=64": 17
}
118 changes: 118 additions & 0 deletions sgl-kernel/benchmark/bench_fp8_gemm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import torch
import torch.nn.functional as F
import triton

from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
from sgl_kernel import fp8_scaled_mm as sgl_scaled_mm
from sgl_kernel import fp8_scaled_mm_profile as sgl_scaled_mm_profile
import time
import vllm
import triton

def get_sm_version():
    """Return the current CUDA device's compute capability as an int (major*10 + minor)."""
    capability = torch.cuda.get_device_capability(torch.cuda.current_device())
    return capability[0] * 10 + capability[1]


def get_device_name():
    """Return the human-readable name of the active CUDA device."""
    current = torch.cuda.current_device()
    return torch.cuda.get_device_name(current)

def get_config_filename(dtype="bf16"):
    """Build the per-GPU tuning-config filename, e.g. 'sm90_fp8_bf16.json'."""
    return "sm{}_fp8_{}.json".format(get_sm_version(), dtype)

def do_profile(dtype="bf16", n=4096, k=8192):
    """Run the sgl profiling kernel for a sweep of M sizes at fixed (n, k).

    Quantizes fp32 inputs to fp8 via vllm's quantizer, then invokes
    sgl_scaled_mm_profile for each M so the kernel can record timings.
    """
    for m in (1, 16, 64, 128, 256, 512, 1024, 2048, 4096):
        lhs = torch.ones((m, k), device="cuda") * 5.0
        rhs = torch.ones((n, k), device="cuda") * 5.0
        lhs_scale = torch.randn((m,), device="cuda", dtype=torch.float32)
        rhs_scale = torch.randn((n,), device="cuda", dtype=torch.float32)
        lhs_fp8, lhs_scale_fp8 = vllm_scaled_fp8_quant(lhs, lhs_scale)
        rhs_fp8, rhs_scale_fp8 = vllm_scaled_fp8_quant(rhs, rhs_scale)
        # B is quantized as (n, k) and handed over transposed, matching benchmark().
        sgl_scaled_mm_profile(lhs_fp8, rhs_fp8.t(), lhs_scale_fp8, rhs_scale_fp8, dtype, bias=None)

@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048],
        x_log=False,
        line_arg="provider",
        line_vals=["vllm-fp8-fp16", "vllm-fp8-bf16", "sglang-fp8-fp16", "sglang-fp8-bf16",
                   "sglang-fp8-profile-fp16", "sglang-fp8-profile-bf16", "torch-fp8"],
        line_names=["vllm-fp8-fp16", "vllm-fp8-bf16", "sglang-fp8-fp16", "sglang-fp8-bf16",
                    "sglang-fp8-profile-fp16", "sglang-fp8-profile-bf16", "torch-fp8"],
        styles=[("green", "-"), ("green", "--"), ("blue", "-"), ("blue", "--"),
                ("red", "-"), ("red", "--"), ("purple", "-")],
        ylabel="GB/s",
        # Fix: this benchmark measures fp8 GEMMs; "int8 scaled matmul" was
        # copy-pasted from the int8 benchmark. plot_name also determines the
        # saved PNG's filename under save_path.
        plot_name="fp8 scaled matmul",
        args={},
    )
)
def benchmark(batch_size, provider):
    """Benchmark one fp8 scaled-matmul provider at M=batch_size, N=4096, K=8192.

    Args:
        batch_size: the M dimension of the GEMM.
        provider: one of the line_vals above; selects backend and output dtype.

    Returns:
        (median, max, min) throughput derived from triton.testing.do_bench
        timings via the gbps formula below.
    """
    M, N, K = batch_size, 4096, 8192
    a = torch.ones((M, K), device="cuda") * 5.0
    b = torch.ones((N, K), device="cuda") * 5.0
    scale_a = torch.randn((M,), device="cuda", dtype=torch.float32)
    scale_b = torch.randn((N,), device="cuda", dtype=torch.float32)
    a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
    b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
    # B is quantized as (N, K); transpose to (K, N) for the matmul.
    b_fp8 = b_fp8.t()
    quantiles = [0.5, 0.2, 0.8]

    dtype = torch.float16 if "fp16" in provider else torch.bfloat16
    bias = torch.randn((N,), device="cuda", dtype=dtype)
    if "vllm-fp8" in provider:
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: vllm_scaled_mm(
                a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype, bias=bias
            ),
            quantiles=quantiles,
        )
    elif "sglang-fp8-profile" in provider:
        # Warm the profile data for all M sizes before timing the profiled path.
        do_profile(dtype, N, K)
        try:
            ms, min_ms, max_ms = triton.testing.do_bench(
                lambda: sgl_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype, bias=bias, is_profile=True),
                quantiles=quantiles,
            )
        except RuntimeError as e:
            # Keep the sweep running if the profiled kernel fails; report
            # 1 ms sentinels so the failure is visible in the plot.
            print("Error details:", e)
            ms, min_ms, max_ms = 1, 1, 1
    elif "sglang-fp8" in provider:
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: sgl_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype, bias=bias, is_profile=False),
            quantiles=quantiles,
        )
    elif provider == "torch-fp8":
        # Expand per-row/per-column scales to 2-D for torch._scaled_mm.
        scale_a_2d = scale_a_fp8.float().unsqueeze(1)  # [M, 1]
        scale_b_2d = scale_b_fp8.float().unsqueeze(0)  # [1, N]
        try:
            # Fix: b_fp8 is already transposed to (K, N), so the output's
            # second dimension is b_fp8.shape[1] (= N). The original used
            # shape[0] (= K), allocating a (M, K) buffer for a (M, N) result.
            out = torch.empty(
                (a_fp8.shape[0], b_fp8.shape[1]), device="cuda", dtype=torch.bfloat16
            )
            ms, min_ms, max_ms = triton.testing.do_bench(
                lambda: torch._scaled_mm(
                    a_fp8,
                    b_fp8,
                    out=out,
                    out_dtype=torch.bfloat16,
                    scale_a=scale_a_2d,
                    scale_b=scale_b_2d,
                    use_fast_accum=True,
                    bias=bias,
                ),
                quantiles=quantiles,
            )
        except RuntimeError as e:
            print("Error details:", e)
            raise
    # NOTE(review): formula inherited from the int8 benchmark — it scales the
    # GEMM op count (2*M*N*K + M*N) by a.element_size() (the 4-byte fp32
    # staging tensor), so the "GB/s" axis is a relative figure; confirm the
    # intended units.
    gbps = lambda ms: (2 * M * N * K + M * N) * a.element_size() * 1e-9 / (ms * 1e-3)
    return gbps(ms), gbps(max_ms), gbps(min_ms)


benchmark.run(print_data=True, show_plots=True, save_path="bench_fp8_res")
3 changes: 3 additions & 0 deletions sgl-kernel/benchmark/bench_fp8_res/results.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<html><body>
<img src="int8 scaled matmul.png" />
</body></html>
3 changes: 3 additions & 0 deletions sgl-kernel/benchmark/bench_int8_res/results.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<html><body>
<img src="int8 scaled matmul.png" />
</body></html>
Empty file added sgl-kernel/outp
Empty file.
Loading