From 217d49d0bd92160f5547bdc8bd1ac4080d98f481 Mon Sep 17 00:00:00 2001 From: qinyiqun Date: Wed, 14 Jan 2026 10:00:38 +0800 Subject: [PATCH 01/11] =?UTF-8?q?=E6=94=AF=E6=8C=81nv=20w8=201batch=201tp?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- csrc/layers/fused_linear.cpp | 125 ++++++++++++++++-- csrc/layers/fused_linear.hpp | 82 +++++++++++- csrc/models/llama/llama_attention.cpp | 28 ++-- csrc/models/llama/llama_config.hpp | 10 +- csrc/models/llama/llama_mlp.cpp | 23 +++- examples/jiuge.py | 6 +- python/infinilm/modeling_utils.py | 6 +- .../models/llama/configuration_llama.py | 12 +- python/infinilm/models/quant_config.py | 110 +++++++++++++++ 9 files changed, 360 insertions(+), 42 deletions(-) create mode 100644 python/infinilm/models/quant_config.py diff --git a/csrc/layers/fused_linear.cpp b/csrc/layers/fused_linear.cpp index 9b2c813d..7f5ec364 100644 --- a/csrc/layers/fused_linear.cpp +++ b/csrc/layers/fused_linear.cpp @@ -6,6 +6,57 @@ namespace infinilm::layers { // --------------------------------------------------------- // QKV Parallel Linear // --------------------------------------------------------- +// QKVParallelLinear::QKVParallelLinear(size_t hidden_size, +// size_t head_dim, +// size_t num_q_head, +// size_t num_kv_head, +// bool bias, +// const infinicore::DataType &dtype, +// const infinicore::Device &device, +// engine::distributed::RankInfo rank_info) +// : QKVParallelLinear(hidden_size, +// head_dim, head_dim, head_dim, +// num_q_head, num_kv_head, num_kv_head, +// bias, bias, bias, +// dtype, device, rank_info) {} + +// QKVParallelLinear::QKVParallelLinear(size_t hidden_size, +// size_t q_dim, size_t k_dim, size_t v_dim, +// size_t num_q_head, size_t num_k_head, size_t num_v_head, +// bool q_bias, bool k_bias, bool v_bias, +// const infinicore::DataType &dtype, +// const infinicore::Device &device, +// engine::distributed::RankInfo rank_info) +// : infinicore::nn::ColumnParallelLinear( +// hidden_size, +// num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim, +// (q_bias || k_bias || v_bias), +// dtype, +// device, +// rank_info.tp_rank, +// rank_info.tp_size), +// q_dim_(q_dim), +// k_dim_(k_dim), +// v_dim_(v_dim), +// num_q_head_(num_q_head), +// num_k_head_(num_k_head), +// num_v_head_(num_v_head), +// q_bias_(q_bias), +// k_bias_(k_bias), +// v_bias_(v_bias) { +// if (num_q_head % tp_size_ != 0 || num_k_head % tp_size_ != 0 || num_v_head % tp_size_ != 0) { +// throw std::runtime_error("QKVParallelLinear: num_[q|k|v]_head must be divisible by tp_size"); +// } + +// if ((q_bias_ != k_bias_) || (k_bias_ != v_bias_)) { +// throw std::runtime_error("q_bias, k_bias, v_bias must all match"); +// } + +// q_out_size_ = num_q_head_ * q_dim_ / tp_size_; +// k_out_size_ = num_k_head_ * k_dim_ / tp_size_; +// v_out_size_ = num_v_head_ * v_dim_ / tp_size_; +// } + QKVParallelLinear::QKVParallelLinear(size_t hidden_size, size_t head_dim, size_t num_q_head, @@ -13,12 +64,14 @@ QKVParallelLinear::QKVParallelLinear(size_t hidden_size, bool bias, const infinicore::DataType &dtype, const infinicore::Device &device, - engine::distributed::RankInfo rank_info) + engine::distributed::RankInfo rank_info, + std::optional quant_config) : QKVParallelLinear(hidden_size, head_dim, head_dim, head_dim, num_q_head, num_kv_head, num_kv_head, bias, bias, bias, - dtype, device, rank_info) {} + dtype, device, rank_info, + quant_config) {} QKVParallelLinear::QKVParallelLinear(size_t hidden_size, size_t q_dim, size_t k_dim, size_t v_dim, @@ 
-26,15 +79,17 @@ QKVParallelLinear::QKVParallelLinear(size_t hidden_size, bool q_bias, bool k_bias, bool v_bias, const infinicore::DataType &dtype, const infinicore::Device &device, - engine::distributed::RankInfo rank_info) + engine::distributed::RankInfo rank_info, + std::optional quant_config) : infinicore::nn::ColumnParallelLinear( - hidden_size, - num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim, - (q_bias || k_bias || v_bias), - dtype, - device, - rank_info.tp_rank, - rank_info.tp_size), + hidden_size, + num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim, + (q_bias || k_bias || v_bias), + dtype, + device, + rank_info.tp_rank, + rank_info.tp_size, + quant_config), q_dim_(q_dim), k_dim_(k_dim), v_dim_(v_dim), @@ -86,6 +141,23 @@ infinicore::nn::Parameter QKVParallelLinear::get_v_weight() const { 0, tp_rank_, tp_size_); } +infinicore::nn::Parameter QKVParallelLinear::get_q_weight_scale() const { + return infinicore::nn::Parameter( + weight_scale_->narrow({{0, 0, q_out_size_}}), 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter QKVParallelLinear::get_k_weight_scale() const { + return infinicore::nn::Parameter( + weight_scale_->narrow({{0, q_out_size_, k_out_size_}}), + 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter QKVParallelLinear::get_v_weight_scale() const { + return infinicore::nn::Parameter( + weight_scale_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}), + 0, tp_rank_, tp_size_); +} + infinicore::nn::Parameter QKVParallelLinear::get_q_bias() const { if (!q_bias_) { return infinicore::nn::Parameter(); @@ -120,16 +192,33 @@ bool QKVParallelLinear::has_v_bias() const { return v_bias_; } // --------------------------------------------------------- // Gate-Up Parallel Linear // --------------------------------------------------------- +// GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias, +// const infinicore::DataType &dtype, const infinicore::Device &device, +// engine::distributed::RankInfo rank_info) +// : GateUpParallelLinear(hidden_size, intermediate_size, bias, bias, dtype, device, rank_info) { +// } + +// GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, +// const infinicore::DataType &dtype, const infinicore::Device &device, +// engine::distributed::RankInfo rank_info) +// : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) { +// if (gate_bias_ != up_bias_) { +// throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time"); +// } +// } + GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias, const infinicore::DataType &dtype, const infinicore::Device &device, - engine::distributed::RankInfo rank_info) - : GateUpParallelLinear(hidden_size, intermediate_size, bias, bias, dtype, device, rank_info) { + engine::distributed::RankInfo rank_info, + std::optional quant_config) + : GateUpParallelLinear(hidden_size, intermediate_size, bias, bias, dtype, device, rank_info, quant_config) { } GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, const infinicore::DataType &dtype, const infinicore::Device &device, - engine::distributed::RankInfo rank_info) - : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, gate_bias || 
up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) { + engine::distributed::RankInfo rank_info, + std::optional quant_config) + : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size, quant_config), gate_bias_(gate_bias), up_bias_(up_bias) { if (gate_bias_ != up_bias_) { throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time"); } @@ -168,6 +257,14 @@ infinicore::nn::Parameter GateUpParallelLinear::get_up_bias() const { } } +infinicore::nn::Parameter GateUpParallelLinear::get_gate_weight_scale() const { + return infinicore::nn::Parameter(weight_scale_->narrow({{0, 0, weight_scale_->size(0) / 2}}), 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter GateUpParallelLinear::get_up_weight_scale() const { + return infinicore::nn::Parameter(weight_scale_->narrow({{0, weight_scale_->size(0) / 2, weight_scale_->size(0) / 2}}), 0, tp_rank_, tp_size_); +} + bool GateUpParallelLinear::has_gate_bias() const { return gate_bias_; } diff --git a/csrc/layers/fused_linear.hpp b/csrc/layers/fused_linear.hpp index 1e32ce50..8bde20d8 100644 --- a/csrc/layers/fused_linear.hpp +++ b/csrc/layers/fused_linear.hpp @@ -1,18 +1,37 @@ #pragma once #include "infinicore/nn/linear.hpp" +#include "infinicore/nn/quantization.hpp" #include "../engine/distributed/communication_group.hpp" namespace infinilm::layers { class QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { public: + // explicit QKVParallelLinear(size_t hidden_size, + // size_t q_dim, size_t k_dim, size_t v_dim, + // size_t num_q_head, size_t num_k_head, size_t num_v_head, + // bool q_bias, bool k_bias, bool v_bias, + // const infinicore::DataType &dtype = infinicore::DataType::F32, + // const infinicore::Device &device = infinicore::Device(), + // engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + + // // A more common case where all heads have the same dimension + // explicit QKVParallelLinear(size_t hidden_size, + // size_t head_dim, + // size_t num_q_head, size_t num_kv_head, + // bool bias = false, + // const infinicore::DataType &dtype = infinicore::DataType::F32, + // const infinicore::Device &device = infinicore::Device(), + // engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + explicit QKVParallelLinear(size_t hidden_size, size_t q_dim, size_t k_dim, size_t v_dim, size_t num_q_head, size_t num_k_head, size_t num_v_head, bool q_bias, bool k_bias, bool v_bias, const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + std::optional quant_config = std::nullopt); // A more common case where all heads have the same dimension explicit QKVParallelLinear(size_t hidden_size, @@ -21,7 +40,8 @@ class QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { bool bias = false, const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + std::optional quant_config = std::nullopt); std::tuple forward_split(infinicore::Tensor &input); @@ -30,6 +50,10 @@ class 
QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { infinicore::nn::Parameter get_k_weight() const; infinicore::nn::Parameter get_v_weight() const; + infinicore::nn::Parameter get_q_weight_scale() const; + infinicore::nn::Parameter get_k_weight_scale() const; + infinicore::nn::Parameter get_v_weight_scale() const; + infinicore::nn::Parameter get_q_bias() const; infinicore::nn::Parameter get_k_bias() const; infinicore::nn::Parameter get_v_bias() const; @@ -55,22 +79,37 @@ class QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { class GateUpParallelLinear : public infinicore::nn::ColumnParallelLinear { public: + // GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias = false, + // const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), + // engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + + // GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, + // const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), + // engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + + // Overload for quantization, old ones need to be purged GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias = false, const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + std::optional quant_config = std::nullopt); GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + std::optional quant_config = std::nullopt); std::tuple forward_split(infinicore::Tensor &input); infinicore::nn::Parameter get_gate_weight() const; + infinicore::nn::Parameter get_gate_weight_scale() const; + infinicore::nn::Parameter get_gate_bias() const; infinicore::nn::Parameter get_up_weight() const; + infinicore::nn::Parameter get_up_weight_scale() const; + infinicore::nn::Parameter get_up_bias() const; bool has_gate_bias() const; @@ -103,4 +142,39 @@ class GateUpParallelLinear : public infinicore::nn::ColumnParallelLinear { if (name##_->has_up_bias()) \ this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias()); +// ========================= QKV quantization ================================== +#define INFINILM_QKV_LINEAR_W8A8_INIT(name, q_name, k_name, v_name, ...) 
\ + name##_ = std::make_shared(__VA_ARGS__); \ + /* register the Q weight */ \ + this->register_parameter(std::string(q_name) + ".weight", name##_->get_q_weight()); \ + this->register_parameter(std::string(q_name) + ".weight_scale", name##_->get_q_weight_scale()); \ + /* register the K weight */ \ + this->register_parameter(std::string(k_name) + ".weight", name##_->get_k_weight()); \ + this->register_parameter(std::string(k_name) + ".weight_scale", name##_->get_k_weight_scale()); \ + /* register the V weight */ \ + this->register_parameter(std::string(v_name) + ".weight", name##_->get_v_weight()); \ + this->register_parameter(std::string(v_name) + ".weight_scale", name##_->get_v_weight_scale()); \ + /* bias is registered unchanged */ \ + if (name##_->has_q_bias()) \ + this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias()); \ + if (name##_->has_k_bias()) \ + this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias()); \ + if (name##_->has_v_bias()) \ + this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias()); + +// ========================= Gate-Up quantization ============================== +#define INFINILM_GATE_UP_LINEAR_W8A8_INIT(name, gate_name, up_name, ...) \ + name##_ = std::make_shared(__VA_ARGS__); \ + /* register the Gate weight */ \ + this->register_parameter(std::string(gate_name) + ".weight", name##_->get_gate_weight()); \ + this->register_parameter(std::string(gate_name) + ".weight_scale", name##_->get_gate_weight_scale()); \ + /* register the Up weight */ \ + this->register_parameter(std::string(up_name) + ".weight", name##_->get_up_weight()); \ + this->register_parameter(std::string(up_name) + ".weight_scale", name##_->get_up_weight_scale()); \ + /* bias is registered unchanged */ \ + if (name##_->has_gate_bias()) \ + this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias()); \ + if (name##_->has_up_bias()) \ + this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias()); + } // namespace infinilm::layers diff --git a/csrc/models/llama/llama_attention.cpp b/csrc/models/llama/llama_attention.cpp index ad42efb1..2701d3a2 100644 --- a/csrc/models/llama/llama_attention.cpp +++ b/csrc/models/llama/llama_attention.cpp @@ -48,16 +48,26 @@ LlamaAttention::LlamaAttention(const LlamaConfig &config, scaling_ = 1.0f / std::sqrt(static_cast(head_dim_)); // Initialize projection layers - INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, config.num_attention_heads, config.num_key_value_heads, use_bias_, - dtype, device, rank_info); - // Output projection uses attention_output_bias (can be different from qkv) - INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads * head_dim_, hidden_size_, use_output_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); + if (!config.quant_config.has_value()) { + INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, config.num_attention_heads, config.num_key_value_heads, use_bias_, + dtype, device, rank_info); + // Output projection uses attention_output_bias (can be different from qkv) + INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_output_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); - // Initialize qk RMSNorm - if (use_qk_norm_) { - INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, config.rms_norm_eps, dtype, device); - INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, config.rms_norm_eps, dtype, device); + } else { + switch (config.quant_config.value().get_quant_type()) { + case infinicore::nn::QuantType::COMPRESSED_TENSOR: { + 
INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, config.num_attention_heads, config.num_key_value_heads, use_bias_, + dtype, device, rank_info, config.quant_config.value()); + + INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_output_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm, config.quant_config.value()); + break; + } + default: { + } + } } } diff --git a/csrc/models/llama/llama_config.hpp b/csrc/models/llama/llama_config.hpp index 59108546..b0fb892e 100644 --- a/csrc/models/llama/llama_config.hpp +++ b/csrc/models/llama/llama_config.hpp @@ -6,6 +6,7 @@ #include #include "../infinilm_model.hpp" +#include "infinicore/nn/quantization.hpp" #include @@ -36,8 +37,6 @@ struct LlamaConfig : public InfinilmModel::Config { size_t max_position_embeddings = 2048; // Maximum sequence length double rope_theta = 10000.0; // RoPE base frequency - std::shared_ptr rope_scaling = nullptr; // RoPE scaling type - // Normalization double rms_norm_eps = 1e-6; // RMSNorm epsilon @@ -66,11 +65,16 @@ struct LlamaConfig : public InfinilmModel::Config { std::vector bos_token_id = {1}; // Beginning of sequence token ID(s) std::vector eos_token_id = {2}; // End of sequence token ID(s) + // Quant Config + // std::optional quant_config = std::nullopt; + std::optional quant_config = infinicore::nn::QuantConfig(infinicore::nn::QuantType::COMPRESSED_TENSOR); + /** * @brief Compute key-value dimension for Grouped Query Attention (GQA) * @return The dimension for key/value projections */ - size_t kv_dim() const { + size_t + kv_dim() const { return hidden_size * num_key_value_heads / num_attention_heads; } diff --git a/csrc/models/llama/llama_mlp.cpp b/csrc/models/llama/llama_mlp.cpp index fc7abd69..2cc48ccb 100644 --- a/csrc/models/llama/llama_mlp.cpp +++ b/csrc/models/llama/llama_mlp.cpp @@ -1,6 +1,7 @@ #include "llama_mlp.hpp" #include "infinicore/nn/linear.hpp" #include "infinicore/ops.hpp" +#include namespace infinilm::models::llama { @@ -16,10 +17,24 @@ LlamaMLP::LlamaMLP(const LlamaConfig &config, int tp_size = rank_info.tp_size; // Initialize projection layers - INFINILM_GATE_UP_LINEAR_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, use_bias_, - dtype, device, rank_info_); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, use_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm); + if (!config.quant_config.has_value()) { + INFINILM_GATE_UP_LINEAR_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, use_bias_, + dtype, device, rank_info_); + INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, use_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); + } else { + switch (config.quant_config.value().get_quant_type()) { + case infinicore::nn::QuantType::COMPRESSED_TENSOR: { + INFINILM_GATE_UP_LINEAR_W8A8_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, use_bias_, + dtype, device, rank_info_, config.quant_config.value()); + INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, use_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm, config.quant_config.value()); + break; + } + default: { + } + } + } } infinicore::Tensor LlamaMLP::forward(const infinicore::Tensor &hidden_states) const { diff --git a/examples/jiuge.py b/examples/jiuge.py index 0ca5a418..a4b8c28f 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -56,7 +56,7 @@ def get_args(): parser.add_argument( "--max_new_tokens", 
type=int, - default=100, + default=1000, help="max_new_tokens", ) parser.add_argument( @@ -121,7 +121,7 @@ def get_args(): def test( prompts: str | list[str], model_path, - max_new_tokens=100, + max_new_tokens=5000, infini_device=infinicore.device("cpu", 0), tp=1, enable_paged_attn=False, @@ -140,7 +140,6 @@ def test( distributed_config=DistConfig(tp), enable_graph_compiling=enable_graph, ) - # ---------------------------------------------------------------------------- # # Load Weights # ---------------------------------------------------------------------------- # @@ -150,7 +149,6 @@ def test( # create tokenizer # ---------------------------------------------------------------------------- # tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if "llama" == model.config.model_type: backend = getattr(tokenizer, "backend_tokenizer", None) target = getattr(backend, "_tokenizer", backend) diff --git a/python/infinilm/modeling_utils.py b/python/infinilm/modeling_utils.py index 792aa503..a8d987ca 100644 --- a/python/infinilm/modeling_utils.py +++ b/python/infinilm/modeling_utils.py @@ -75,7 +75,8 @@ def load_state_dict( ) for k in f.keys(): - state_dict[k] = f.get_tensor(k).to(device=device, dtype=dtype) + # state_dict[k] = f.get_tensor(k).to(device=device, dtype=dtype) + state_dict[k] = f.get_tensor(k).to(device=device) return state_dict @@ -147,6 +148,7 @@ def load_model_state_dict_by_file( model_param = load_state_dict( file_path, device=torch_device, dtype=torch_dtype ) + already_loaded_keys.extend(model_param.keys()) # --------------------------------------------------------- # @@ -155,7 +157,6 @@ def load_model_state_dict_by_file( model_param_infini = {} for key in model_param.keys(): model_param_infini[key] = infinicore.from_torch(model_param[key]) - model.load_state_dict(model_param_infini, strict=False) infinicore.sync_device() @@ -168,7 +169,6 @@ def load_model_state_dict_by_file( model_param_infini[key] = infinicore.from_torch( model_params[key].to(dtype=torch_dtype) ) - already_loaded_keys.append(key) model.load_state_dict(model_param_infini, strict=True) diff --git a/python/infinilm/models/llama/configuration_llama.py b/python/infinilm/models/llama/configuration_llama.py index 15776c84..f893c5cf 100644 --- a/python/infinilm/models/llama/configuration_llama.py +++ b/python/infinilm/models/llama/configuration_llama.py @@ -15,12 +15,13 @@ """LLaMA model configuration""" +from typing import Optional import infinicore from infinilm.lib import _infinilm from ...configuration_utils import PretrainedConfig - +from ..quant_config import parse_quant_config, QuantizationConfig class LlamaConfig(PretrainedConfig, _infinilm.LlamaConfig): r""" @@ -182,6 +183,7 @@ def __init__( mlp_bias=False, head_dim=None, torch_dtype=None, + quantization_config=None, **kwargs, ): _infinilm.LlamaConfig.__init__(self) @@ -245,3 +247,11 @@ def __init__( tie_word_embeddings=tie_word_embeddings, **kwargs, ) + + if isinstance(quantization_config, dict): + self.quantization_config: Optional[QuantizationConfig] = parse_quant_config(quantization_config) + self.quantization_config_dict = quantization_config + else: + self.quantization_config = None + self.quantization_config_dict = None + diff --git a/python/infinilm/models/quant_config.py b/python/infinilm/models/quant_config.py new file mode 100644 index 00000000..9e8ea0bf --- /dev/null +++ b/python/infinilm/models/quant_config.py @@ -0,0 +1,110 @@ +# coding=utf-8 +# Copyright (c) 2025, InfiniCore +# BSD 3-Clause License + +from abc import 
ABC, abstractmethod +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Any, Type + +# ---------------- Abstraction layer ---------------- +class QuantizationConfig(ABC): + """Unified quantization entry point for InfiniCore; both the C++ and Python sides rely only on these four interfaces.""" + @abstractmethod + def get_name(self) -> str: ... + @abstractmethod + def get_min_capability(self) -> int: ... + @abstractmethod + def get_scaled_act_names(self) -> List[str]: ... + @abstractmethod + def get_quant_method(self) -> str: + """Return the algorithm name, used by the C++ dispatcher.""" + ... + +# ---------------- Data classes ---------------- +@dataclass +class CompressedTensorsConfig(QuantizationConfig): + """Mirrors the HF compressed-tensors export format.""" + quant_method: str = "compressed-tensors" + format: str = "int-quantized" + quantization_status: str = "compressed" + version: str = "0.11.0" + global_compression_ratio: Optional[float] = None + ignore: List[str] = field(default_factory=lambda: ["lm_head"]) + kv_cache_scheme: Optional[Dict[str, Any]] = None + sparsity_config: Dict[str, Any] = field(default_factory=dict) + transform_config: Dict[str, Any] = field(default_factory=dict) + config_groups: Dict[str, "Group"] = field(default_factory=dict) + + @dataclass + class TensorConfig: + num_bits: int + type: str + symmetric: bool + dynamic: bool + strategy: str + observer: Optional[str] = None + observer_kwargs: Dict[str, Any] = field(default_factory=dict) + group_size: Optional[int] = None + block_structure: Optional[str] = None + actorder: Optional[Any] = None + + @dataclass + class Group: + targets: List[str] + weights: "CompressedTensorsConfig.TensorConfig" + input_activations: Optional["CompressedTensorsConfig.TensorConfig"] = None + output_activations: Optional["CompressedTensorsConfig.TensorConfig"] = None + format: str = "int-quantized" + + @staticmethod + def from_dict(cfg: Dict[str, Any]) -> "CompressedTensorsConfig": + def _build_tensor(obj: Optional[Dict[str, Any]]) -> Optional["CompressedTensorsConfig.TensorConfig"]: + return None if obj is None else CompressedTensorsConfig.TensorConfig(**obj) + + groups = {} + for gname, gcfg in cfg.get("config_groups", {}).items(): + groups[gname] = CompressedTensorsConfig.Group( + targets=gcfg["targets"], + weights=_build_tensor(gcfg["weights"]), + input_activations=_build_tensor(gcfg.get("input_activations")), + output_activations=_build_tensor(gcfg.get("output_activations")), + format=gcfg.get("format", "int-quantized"), + ) + return CompressedTensorsConfig( + quant_method=cfg["quant_method"], + format=cfg["format"], + quantization_status=cfg["quantization_status"], + version=cfg["version"], + global_compression_ratio=cfg.get("global_compression_ratio"), + ignore=cfg.get("ignore", ["lm_head"]), + kv_cache_scheme=cfg.get("kv_cache_scheme"), + sparsity_config=cfg.get("sparsity_config", {}), + transform_config=cfg.get("transform_config", {}), + config_groups=groups, + ) + + def get_name(self) -> str: + return self.quant_method + + def get_min_capability(self) -> int: + return 75 + + def get_scaled_act_names(self) -> List[str]: + return [] + + def get_quant_method(self) -> str: + return self.quant_method + + +_QUANT_METHOD_MAP: Dict[str, Type[QuantizationConfig]] = { + "compressed-tensors": CompressedTensorsConfig, +} + +def parse_quant_config(quant_cfg: Dict[str, Any]) -> Optional[QuantizationConfig]: + """Unified parsing entry point, called by LlamaConfig.""" + method = quant_cfg.get("quant_method") + cls = _QUANT_METHOD_MAP.get(method) + if cls is None: + return None + + return cls.from_dict(quant_cfg) \ No newline at end of file
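Usage sketch for the new python/infinilm/models/quant_config.py module above: the snippet feeds a hand-written quantization_config dict in the compressed-tensors style through parse_quant_config. The field values in sample are illustrative only (not copied from a real checkpoint), and the example assumes the infinilm package from this patch is importable.

from infinilm.models.quant_config import CompressedTensorsConfig, parse_quant_config

# Illustrative quantization_config as compressed-tensors would write it into
# config.json for a W8A8 checkpoint (example values, not from a real model).
sample = {
    "quant_method": "compressed-tensors",
    "format": "int-quantized",
    "quantization_status": "compressed",
    "version": "0.11.0",
    "ignore": ["lm_head"],
    "config_groups": {
        "group_0": {
            "targets": ["Linear"],
            "weights": {
                "num_bits": 8,
                "type": "int",
                "symmetric": True,
                "dynamic": False,
                "strategy": "channel",
            },
            "input_activations": {
                "num_bits": 8,
                "type": "int",
                "symmetric": True,
                "dynamic": True,
                "strategy": "token",
            },
        }
    },
}

cfg = parse_quant_config(sample)
assert isinstance(cfg, CompressedTensorsConfig)
print(cfg.get_quant_method())                         # compressed-tensors
print(cfg.config_groups["group_0"].weights.num_bits)  # 8
print(cfg.ignore)                                     # ['lm_head']

An unknown quant_method falls through _QUANT_METHOD_MAP and returns None, which is why LlamaConfig only sets self.quantization_config when the parse succeeds.

From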
b8188bca1bc148343c0d1f72b8c7b2ca29c34bb1 Mon Sep 17 00:00:00 2001 From: qinyiqun Date: Fri, 16 Jan 2026 18:33:16 +0800 Subject: [PATCH 02/11] =?UTF-8?q?=E5=A2=9E=E5=8A=A0json=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitmodules | 3 +++ csrc/config/global_config.hpp | 31 ++++++++++++++++++++++++++++++ csrc/config/quant_config.hpp | 17 ++++++++++++++++ csrc/engine/infer_engine.cpp | 5 +++++ csrc/engine/infer_engine.hpp | 3 +++ csrc/engine/rank_worker.cpp | 2 ++ csrc/engine/rank_worker.hpp | 3 +++ csrc/models/infinilm_model.hpp | 6 +++--- csrc/models/llama/llama.hpp | 7 ++++--- csrc/models/llama/llama_config.hpp | 5 +++++ csrc/pybind11/engine/engine.hpp | 19 +++++++----------- python/infinilm/infer_engine.py | 1 + third_party/json | 1 + xmake.lua | 1 + 14 files changed, 86 insertions(+), 18 deletions(-) create mode 100644 csrc/config/global_config.hpp create mode 100644 csrc/config/quant_config.hpp create mode 160000 third_party/json diff --git a/.gitmodules b/.gitmodules index eab6041a..ade5ff58 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "third_party/spdlog"] path = third_party/spdlog url = https://github.com/gabime/spdlog.git +[submodule "third_party/json"] + path = third_party/json + url = https://github.com/nlohmann/json.git diff --git a/csrc/config/global_config.hpp b/csrc/config/global_config.hpp new file mode 100644 index 00000000..d04c1c94 --- /dev/null +++ b/csrc/config/global_config.hpp @@ -0,0 +1,31 @@ +#pragma once + +#include "quant_config.hpp" +#include +#include +#include + +namespace infinilm::config::global_config { +struct GlobalConfig { + // Quantization configuration +public: + infinilm::config::quantization::QuantConfig get_quant_config_json() const { + return infinilm::config::quantization::QuantConfig(config_json.value("quantization_config", nlohmann::json::object())).to_json(); + } + + GlobalConfig() = default; + GlobalConfig(const nlohmann::json &json) : config_json(json) {}; + GlobalConfig(const std::string &path) { + std::ifstream file(path); + if (file.is_open()) { + file >> config_json; + file.close(); + } else { + throw std::runtime_error("Could not open config file: " + path); + } + } + +private: + nlohmann::json config_json; +}; +} // namespace infinilm::config::global_config \ No newline at end of file diff --git a/csrc/config/quant_config.hpp b/csrc/config/quant_config.hpp new file mode 100644 index 00000000..fa9e01f4 --- /dev/null +++ b/csrc/config/quant_config.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "nlohmann/json.hpp" + +namespace infinilm::config::quantization { + +struct QuantConfig { + nlohmann::json quantization_config; + + QuantConfig() = default; + QuantConfig(const nlohmann::json &json) : quantization_config(json) {}; + nlohmann::json to_json() const { + return quantization_config; + } +}; + +} // namespace infinilm::config::quantization \ No newline at end of file diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp index f49a9108..9e38e623 100644 --- a/csrc/engine/infer_engine.cpp +++ b/csrc/engine/infer_engine.cpp @@ -11,6 +11,7 @@ InferEngine::InferEngine( const distributed::DistConfig &distributed_config, infinicore::Device::Type device_type, const cache::CacheConfig *cache_config, + const std::string &model_path, bool enable_graph_compiling) // Changed parameter : communication_group_(distributed_config, device_type), model_config_(config) { @@ -18,6 +19,9 @@ InferEngine::InferEngine( if (cache_config != nullptr) { 
cache_config_ = cache_config->unique_copy(); } + if (!model_path.empty()) { + global_config_ = infinilm::config::global_config::GlobalConfig(model_path + "/config.json"); + } // Create one RankWorker per rank int world_size = communication_group_.get_world_size(); barrier_ = std::make_unique((size_t)world_size); @@ -27,6 +31,7 @@ InferEngine::InferEngine( model_config_, communication_group_.get_rank_info(r), cache_config_ != nullptr ? cache_config_.get() : nullptr, + global_config_, barrier_.get(), enable_graph_compiling)); } diff --git a/csrc/engine/infer_engine.hpp b/csrc/engine/infer_engine.hpp index ce834c6a..3811e2ee 100644 --- a/csrc/engine/infer_engine.hpp +++ b/csrc/engine/infer_engine.hpp @@ -1,5 +1,6 @@ #pragma once +#include "../config/global_config.hpp" #include "../models/infinilm_model.hpp" #include "../models/llama/llama_config.hpp" #include "distributed/distributed.hpp" @@ -24,6 +25,7 @@ class InferEngine { const distributed::DistConfig &distributed_config = distributed::DistConfig(), infinicore::Device::Type device_type = infinicore::context::getDevice().getType(), const cache::CacheConfig *cache_config = nullptr, + const std::string &modle_path = "", bool enable_graph_compiling = false); // Load a parameter to all workers (each can extract its shard inside RankWorker) @@ -52,6 +54,7 @@ class InferEngine { distributed::CommunicationGroup communication_group_; const InfinilmModel::Config &model_config_; std::unique_ptr cache_config_; + infinilm::config::global_config::GlobalConfig global_config_; }; } // namespace infinilm::engine diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp index 8149b69b..38f88de0 100644 --- a/csrc/engine/rank_worker.cpp +++ b/csrc/engine/rank_worker.cpp @@ -13,6 +13,7 @@ namespace infinilm::engine { RankWorker::RankWorker(const InfinilmModel::Config &model_config, const distributed::RankInfo &rank_info, const cache::CacheConfig *cache_config, + const infinilm::config::global_config::GlobalConfig &global_config, RankBarrier *barrier, bool enable_graph_compiling) : model_config_(model_config), @@ -21,6 +22,7 @@ RankWorker::RankWorker(const InfinilmModel::Config &model_config, job_cmd_(Command::INIT), has_job_(false), job_done_(false), + global_config_(global_config), should_exit_(false), init_done_(false), rng_(std::random_device{}()), diff --git a/csrc/engine/rank_worker.hpp b/csrc/engine/rank_worker.hpp index 480dc767..9a83dc0b 100644 --- a/csrc/engine/rank_worker.hpp +++ b/csrc/engine/rank_worker.hpp @@ -1,6 +1,7 @@ #pragma once #include "../cache/cache.hpp" +#include "../config/global_config.hpp" #include "../models/model_factory.hpp" #include "compiler/general_compiler.hpp" #include "distributed/distributed.hpp" @@ -59,6 +60,7 @@ class RankWorker { RankWorker(const InfinilmModel::Config &model_config, const distributed::RankInfo &rank_info, const cache::CacheConfig *cache_config, + const infinilm::config::global_config::GlobalConfig &global_config, RankBarrier *barrier, bool enable_graph_compiling); @@ -98,6 +100,7 @@ class RankWorker { distributed::RankInfo rank_info_; std::shared_ptr model_; std::shared_ptr cache_; + const infinilm::config::global_config::GlobalConfig &global_config_; // Graph Compiling bool enable_graph_compiling_; diff --git a/csrc/models/infinilm_model.hpp b/csrc/models/infinilm_model.hpp index 3537bc75..5c89a54c 100644 --- a/csrc/models/infinilm_model.hpp +++ b/csrc/models/infinilm_model.hpp @@ -1,8 +1,8 @@ #pragma once -#include "infinicore/nn/module.hpp" - #include "../cache/cache.hpp" +#include 
"infinicore/nn/module.hpp" +#include "nlohmann/json.hpp" #include @@ -13,7 +13,7 @@ class InfinilmModel : public infinicore::nn::Module { public: struct Config { std::string model_type; - + nlohmann::json model_config; virtual ~Config() = default; }; diff --git a/csrc/models/llama/llama.hpp b/csrc/models/llama/llama.hpp index fe554c32..eebac92b 100644 --- a/csrc/models/llama/llama.hpp +++ b/csrc/models/llama/llama.hpp @@ -16,9 +16,10 @@ * - LlamaForCausalLM: Complete model with language modeling head */ -#include "llama_config.hpp" +#include "../../config/global_config.hpp" #include "llama_attention.hpp" -#include "llama_mlp.hpp" +#include "llama_config.hpp" #include "llama_decoder_layer.hpp" -#include "llama_model.hpp" #include "llama_for_causal_lm.hpp" +#include "llama_mlp.hpp" +#include "llama_model.hpp" diff --git a/csrc/models/llama/llama_config.hpp b/csrc/models/llama/llama_config.hpp index b0fb892e..e3e56a7f 100644 --- a/csrc/models/llama/llama_config.hpp +++ b/csrc/models/llama/llama_config.hpp @@ -7,6 +7,7 @@ #include "../infinilm_model.hpp" #include "infinicore/nn/quantization.hpp" +#include "nlohmann/json.hpp" #include @@ -37,6 +38,8 @@ struct LlamaConfig : public InfinilmModel::Config { size_t max_position_embeddings = 2048; // Maximum sequence length double rope_theta = 10000.0; // RoPE base frequency + std::shared_ptr rope_scaling = nullptr; // RoPE scaling type + // Normalization double rms_norm_eps = 1e-6; // RMSNorm epsilon @@ -94,6 +97,8 @@ struct LlamaConfig : public InfinilmModel::Config { } return true; } + + nlohmann::json config_json; }; } // namespace infinilm::models::llama diff --git a/csrc/pybind11/engine/engine.hpp b/csrc/pybind11/engine/engine.hpp index f5dae4a7..c64b9905 100644 --- a/csrc/pybind11/engine/engine.hpp +++ b/csrc/pybind11/engine/engine.hpp @@ -36,18 +36,21 @@ inline void bind_infer_engine(py::module &m) { const distributed::DistConfig &dist, infinicore::Device::Type dev, std::shared_ptr cache_cfg, + const std::string &modle_path, bool enable_graph_compiling) { return std::make_shared( cfg, dist, dev, cache_cfg ? cache_cfg.get() : nullptr, + modle_path, enable_graph_compiling); }), py::arg("config"), py::arg("distributed_config") = distributed::DistConfig(), py::arg("device_type") = infinicore::context::getDevice().getType(), py::arg("cache_config") = py::none(), + py::arg("model_path") = "", py::arg("enable_graph_compiling") = false) .def("load_param", &InferEngine::load_param, py::arg("name"), py::arg("param"), @@ -63,20 +66,12 @@ inline void bind_infer_engine(py::module &m) { } return state_dict_tp_all; }) - .def( - "forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments") - .def( - "reset_cache", [](InferEngine &self, std::shared_ptr cfg) { - self.reset_cache(cfg ? cfg.get() : nullptr); - }, - py::arg("cache_config") = py::none()) + .def("forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments") + .def("reset_cache", [](InferEngine &self, std::shared_ptr cfg) { self.reset_cache(cfg ? 
cfg.get() : nullptr); }, py::arg("cache_config") = py::none()) .def("get_cache_config", [](const InferEngine &self) { auto cfg = self.get_cache_config(); - return std::shared_ptr(std::move(cfg->unique_copy())); - }) - .def("__repr__", [](const InferEngine &self) { - return ""; - }); + return std::shared_ptr(std::move(cfg->unique_copy())); }) + .def("__repr__", [](const InferEngine &self) { return ""; }); py::class_(infer_engine, "Input") .def( diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py index f5359d7d..fa51e4fe 100644 --- a/python/infinilm/infer_engine.py +++ b/python/infinilm/infer_engine.py @@ -40,6 +40,7 @@ def __init__( distributed_config._underlying, device._underlying.type, cache_config, + model_path, enable_graph_compiling, ) diff --git a/third_party/json b/third_party/json new file mode 160000 index 00000000..5ed07097 --- /dev/null +++ b/third_party/json @@ -0,0 +1 @@ +Subproject commit 5ed07097faa6c50199c4a3b66e5ed37d4fbfccc2 diff --git a/xmake.lua b/xmake.lua index ad636197..aab1a0c7 100644 --- a/xmake.lua +++ b/xmake.lua @@ -6,6 +6,7 @@ set_toolchains("gcc") -- Add spdlog from third_party directory add_includedirs("third_party/spdlog/include") +add_includedirs("third_party/json/single_include/") target("infinicore_infer") set_kind("shared") From e19f135ac65dd8a5767705bebdfaf5870f1e5f58 Mon Sep 17 00:00:00 2001 From: qinyiqun Date: Tue, 20 Jan 2026 14:01:22 +0800 Subject: [PATCH 03/11] =?UTF-8?q?InfiniLM=20=E5=A2=9E=E5=8A=A0=E9=87=8F?= =?UTF-8?q?=E5=8C=96=E5=B1=82=E5=92=8Cglobal=20config?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- csrc/engine/rank_worker.cpp | 2 +- csrc/models/llama/llama_attention.cpp | 9 +++++---- csrc/models/llama/llama_attention.hpp | 4 +++- csrc/models/llama/llama_config.hpp | 2 +- csrc/models/llama/llama_mlp.cpp | 4 ++-- csrc/models/model_factory.cpp | 3 ++- csrc/models/model_factory.hpp | 4 +++- csrc/quantization/compressed_tensors.cpp | 6 ++++++ csrc/quantization/compressed_tensors.hpp | 19 +++++++++++++++++++ csrc/quantization/quantization.hpp | 19 +++++++++++++++++++ csrc/quantization/utils.hpp | 2 ++ 11 files changed, 63 insertions(+), 11 deletions(-) create mode 100644 csrc/quantization/compressed_tensors.cpp create mode 100644 csrc/quantization/compressed_tensors.hpp create mode 100644 csrc/quantization/quantization.hpp create mode 100644 csrc/quantization/utils.hpp diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp index 38f88de0..ff78d31e 100644 --- a/csrc/engine/rank_worker.cpp +++ b/csrc/engine/rank_worker.cpp @@ -197,7 +197,7 @@ void RankWorker::thread_loop() { infinicore::context::setDevice(rank_info_.device); // Create model using factory (may be expensive) - model_ = InfinilmModelFactory::createModel(model_config_, rank_info_, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr); + model_ = InfinilmModelFactory::createModel(model_config_, rank_info_, pending_cache_config_ != nullptr ? 
pending_cache_config_.get() : nullptr, global_config_); if (!model_) { throw std::runtime_error("Failed to create model"); } diff --git a/csrc/models/llama/llama_attention.cpp b/csrc/models/llama/llama_attention.cpp index 2701d3a2..aac7d12d 100644 --- a/csrc/models/llama/llama_attention.cpp +++ b/csrc/models/llama/llama_attention.cpp @@ -20,7 +20,8 @@ namespace infinilm::models::llama { LlamaAttention::LlamaAttention(const LlamaConfig &config, const infinicore::Device &device, size_t layer_idx, - engine::distributed::RankInfo rank_info) + engine::distributed::RankInfo rank_info, + const infinilm::config::global_config::GlobalConfig &global_config) : layer_idx_(layer_idx), hidden_size_(config.hidden_size), num_attention_heads_(config.num_attention_heads), @@ -48,16 +49,16 @@ LlamaAttention::LlamaAttention(const LlamaConfig &config, scaling_ = 1.0f / std::sqrt(static_cast(head_dim_)); // Initialize projection layers + // if (global_config.get_global_config_json().is_null()) { if (!config.quant_config.has_value()) { INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, config.num_attention_heads, config.num_key_value_heads, use_bias_, dtype, device, rank_info); // Output projection uses attention_output_bias (can be different from qkv) INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_output_bias_, dtype, device, tp_rank, tp_size, rank_info.comm); - } else { - switch (config.quant_config.value().get_quant_type()) { - case infinicore::nn::QuantType::COMPRESSED_TENSOR: { + switch (config.quant_config.value().get_quant_scheme()) { + case infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8: { INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, config.num_attention_heads, config.num_key_value_heads, use_bias_, dtype, device, rank_info, config.quant_config.value()); diff --git a/csrc/models/llama/llama_attention.hpp b/csrc/models/llama/llama_attention.hpp index 9d464bcf..d35a7e6a 100644 --- a/csrc/models/llama/llama_attention.hpp +++ b/csrc/models/llama/llama_attention.hpp @@ -1,6 +1,7 @@ #pragma once #include "../../cache/kv_cache.hpp" +#include "../../config/global_config.hpp" #include "../../engine/distributed/distributed.hpp" #include "../../layers/fused_linear.hpp" #include "llama_config.hpp" @@ -39,7 +40,8 @@ class LlamaAttention : public infinicore::nn::Module { LlamaAttention(const LlamaConfig &config, const infinicore::Device &device, size_t layer_idx, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + const infinilm::config::global_config::GlobalConfig &global_config = infinilm::config::global_config::GlobalConfig()); /** * @brief Forward pass: compute attention diff --git a/csrc/models/llama/llama_config.hpp b/csrc/models/llama/llama_config.hpp index e3e56a7f..ca02507d 100644 --- a/csrc/models/llama/llama_config.hpp +++ b/csrc/models/llama/llama_config.hpp @@ -70,7 +70,7 @@ struct LlamaConfig : public InfinilmModel::Config { // Quant Config // std::optional quant_config = std::nullopt; - std::optional quant_config = infinicore::nn::QuantConfig(infinicore::nn::QuantType::COMPRESSED_TENSOR); + std::optional quant_config = infinicore::nn::QuantConfig(infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8); /** * @brief Compute key-value dimension for Grouped Query Attention (GQA) diff --git a/csrc/models/llama/llama_mlp.cpp b/csrc/models/llama/llama_mlp.cpp index 2cc48ccb..1ac3474b 100644 --- 
a/csrc/models/llama/llama_mlp.cpp +++ b/csrc/models/llama/llama_mlp.cpp @@ -23,8 +23,8 @@ LlamaMLP::LlamaMLP(const LlamaConfig &config, INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, use_bias_, dtype, device, tp_rank, tp_size, rank_info.comm); } else { - switch (config.quant_config.value().get_quant_type()) { - case infinicore::nn::QuantType::COMPRESSED_TENSOR: { + switch (config.quant_config.value().get_quant_scheme()) { + case infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8: { INFINILM_GATE_UP_LINEAR_W8A8_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, use_bias_, dtype, device, rank_info_, config.quant_config.value()); INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, use_bias_, diff --git a/csrc/models/model_factory.cpp b/csrc/models/model_factory.cpp index 999bb364..f84d4905 100644 --- a/csrc/models/model_factory.cpp +++ b/csrc/models/model_factory.cpp @@ -5,7 +5,8 @@ namespace infinilm { std::shared_ptr InfinilmModelFactory::createModel( const InfinilmModel::Config &config, engine::distributed::RankInfo rank_info, - const cache::CacheConfig *cache) { + const cache::CacheConfig *cache, + const config::global_config::GlobalConfig &global_config) { std::shared_ptr model; if (const auto llama_config_ptr = dynamic_cast(&config)) { diff --git a/csrc/models/model_factory.hpp b/csrc/models/model_factory.hpp index a73f432c..3d14ced3 100644 --- a/csrc/models/model_factory.hpp +++ b/csrc/models/model_factory.hpp @@ -1,5 +1,6 @@ #pragma once +#include "../config/global_config.hpp" #include "infinilm_model.hpp" #include "../engine/distributed/distributed.hpp" @@ -10,6 +11,7 @@ class InfinilmModelFactory { static std::shared_ptr createModel( const InfinilmModel::Config &config, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - const cache::CacheConfig *cache = nullptr); + const cache::CacheConfig *cache = nullptr, + const config::global_config::GlobalConfig &global_config = config::global_config::GlobalConfig()); }; } // namespace infinilm diff --git a/csrc/quantization/compressed_tensors.cpp b/csrc/quantization/compressed_tensors.cpp new file mode 100644 index 00000000..f5b71bcc --- /dev/null +++ b/csrc/quantization/compressed_tensors.cpp @@ -0,0 +1,6 @@ +// #include "compressed_tensors.hpp" + +// infinicore::nn::QuantScheme CompressedTensors::get_quant_scheme() { +// // need to add more schemes later +// return infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8; +// } \ No newline at end of file diff --git a/csrc/quantization/compressed_tensors.hpp b/csrc/quantization/compressed_tensors.hpp new file mode 100644 index 00000000..c7d6eead --- /dev/null +++ b/csrc/quantization/compressed_tensors.hpp @@ -0,0 +1,19 @@ +#pragma once +#include "quantization.hpp" +// #include "utils.hpp" +namespace infinilm::quantization { + +class CompressedTensors : public BaseQuantization { +public: + CompressedTensors(const infinilm::config::global_config::GlobalConfig &global_config) + : BaseQuantization(global_config) { + quant_config_ = global_config.get_quant_config_json(); + } + + infinicore::nn::QuantScheme + get_quant_scheme() const override { + return infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8; + } +}; + +} // namespace infinilm::quantization \ No newline at end of file diff --git a/csrc/quantization/quantization.hpp b/csrc/quantization/quantization.hpp new file mode 100644 index 00000000..8bca2651 --- /dev/null +++ b/csrc/quantization/quantization.hpp @@ -0,0 +1,19 @@ +#pragma once 
+#include "compressed_tensors.hpp" + +// #include "../config/quant_config.hpp" +#include "../config/global_config.hpp" +#include "infinicore/nn/quantization.hpp" + +namespace infinilm::quantization { +class BaseQuantization { +public: + explicit BaseQuantization(const infinilm::config::global_config::GlobalConfig &global_config) {}; + virtual ~BaseQuantization() = default; + + virtual infinicore::nn::QuantScheme get_quant_scheme() const = 0; + +protected: + infinilm::config::quantization::QuantConfig quant_config_; +} +} // namespace infinilm::quantization \ No newline at end of file diff --git a/csrc/quantization/utils.hpp b/csrc/quantization/utils.hpp new file mode 100644 index 00000000..1ae21db2 --- /dev/null +++ b/csrc/quantization/utils.hpp @@ -0,0 +1,2 @@ +#include "../config/global_config.hpp" +#include "infinicore/nn/quantization.hpp" \ No newline at end of file From 5cba58a390da98650c5892f0d26d248c62db89c7 Mon Sep 17 00:00:00 2001 From: qinyiqun Date: Wed, 21 Jan 2026 16:01:39 +0800 Subject: [PATCH 04/11] =?UTF-8?q?=E4=BB=A5=E4=B8=80=E7=A7=8D=E6=AF=94?= =?UTF-8?q?=E8=BE=83=E4=BC=98=E9=9B=85=E7=9A=84=E6=96=B9=E5=BC=8F=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0=E4=BA=86quant=20config=E7=9A=84=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- csrc/config/global_config.cpp | 14 +++++++++ csrc/config/global_config.hpp | 17 +++++------ csrc/config/quant_config.cpp | 22 ++++++++++++++ csrc/config/quant_config.hpp | 24 +++++++++++---- csrc/engine/infer_engine.cpp | 7 +++-- csrc/engine/infer_engine.hpp | 2 +- csrc/engine/rank_worker.cpp | 3 +- csrc/engine/rank_worker.hpp | 4 +-- csrc/models/infinilm_model.hpp | 1 - csrc/models/llama/llama_attention.cpp | 37 +++++++++++------------ csrc/models/llama/llama_attention.hpp | 3 +- csrc/models/llama/llama_config.hpp | 4 --- csrc/models/llama/llama_decoder_layer.cpp | 7 +++-- csrc/models/llama/llama_decoder_layer.hpp | 4 ++- csrc/models/llama/llama_for_causal_lm.cpp | 5 +-- csrc/models/llama/llama_for_causal_lm.hpp | 3 +- csrc/models/llama/llama_mlp.cpp | 30 +++++++++--------- csrc/models/llama/llama_mlp.hpp | 6 +++- csrc/models/llama/llama_model.cpp | 8 ++--- csrc/models/llama/llama_model.hpp | 5 +-- csrc/models/model_factory.cpp | 4 +-- csrc/models/model_factory.hpp | 2 +- csrc/quantization/compressed_tensors.hpp | 12 +++++--- csrc/quantization/quantization.hpp | 13 ++++---- 24 files changed, 143 insertions(+), 94 deletions(-) create mode 100644 csrc/config/global_config.cpp create mode 100644 csrc/config/quant_config.cpp diff --git a/csrc/config/global_config.cpp b/csrc/config/global_config.cpp new file mode 100644 index 00000000..63d03fb0 --- /dev/null +++ b/csrc/config/global_config.cpp @@ -0,0 +1,14 @@ +#include "global_config.hpp" +#include +namespace infinilm::config::global_config { +GlobalConfig::GlobalConfig(const std::string &path) { + std::ifstream file(path); + if (file.is_open()) { + file >> config_json; + file.close(); + } else { + throw std::runtime_error("Could not open config file: " + path); + } + this->quant_config = quantization::QuantConfig(config_json["quantization_config"]); +} +} // namespace infinilm::config::global_config \ No newline at end of file diff --git a/csrc/config/global_config.hpp b/csrc/config/global_config.hpp index d04c1c94..6f9c32b6 100644 --- a/csrc/config/global_config.hpp +++ b/csrc/config/global_config.hpp @@ -9,23 +9,20 @@ namespace infinilm::config::global_config { struct GlobalConfig { // Quantization configuration public: - 
infinilm::config::quantization::QuantConfig get_quant_config_json() const { - return infinilm::config::quantization::QuantConfig(config_json.value("quantization_config", nlohmann::json::object())).to_json(); - } - GlobalConfig() = default; GlobalConfig(const nlohmann::json &json) : config_json(json) {}; - GlobalConfig(const std::string &path) { - std::ifstream file(path); - if (file.is_open()) { - file >> config_json; - file.close(); + GlobalConfig(const std::string &path); + + infinicore::nn::QuantScheme get_quant_scheme() const { + if (quant_config.get_quant_scheme() != infinicore::nn::QuantScheme::NONE) { + return quant_config.get_quant_scheme(); } else { - throw std::runtime_error("Could not open config file: " + path); + return infinicore::nn::QuantScheme::NONE; } } private: nlohmann::json config_json; + quantization::QuantConfig quant_config; }; } // namespace infinilm::config::global_config \ No newline at end of file diff --git a/csrc/config/quant_config.cpp b/csrc/config/quant_config.cpp new file mode 100644 index 00000000..0ee47682 --- /dev/null +++ b/csrc/config/quant_config.cpp @@ -0,0 +1,22 @@ +#include "quant_config.hpp" +#include +namespace infinilm::config::quantization { +QuantConfig::QuantConfig(const nlohmann::json &json) : quantization_config(json) { + this->quantization_method = get_quantization_method(); +} + +std::shared_ptr +QuantConfig::get_quantization_method() const { + if (quantization_config.is_null()) { + return nullptr; + } + + // Determine the quantization scheme from the JSON config + if (quantization_config["quant_method"] == "compressed-tensors") { + return std::make_shared(quantization_config); + } + // Add other schemes as needed + + return nullptr; // Default case if no matching scheme +} +} // namespace infinilm::config::quantization \ No newline at end of file diff --git a/csrc/config/quant_config.hpp b/csrc/config/quant_config.hpp index fa9e01f4..46400eff 100644 --- a/csrc/config/quant_config.hpp +++ b/csrc/config/quant_config.hpp @@ -1,17 +1,29 @@ #pragma once +#include "../quantization/compressed_tensors.hpp" +#include "../quantization/quantization.hpp" #include "nlohmann/json.hpp" +#include namespace infinilm::config::quantization { -struct QuantConfig { - nlohmann::json quantization_config; - +class QuantConfig { +public: QuantConfig() = default; - QuantConfig(const nlohmann::json &json) : quantization_config(json) {}; - nlohmann::json to_json() const { - return quantization_config; + QuantConfig(const nlohmann::json &json); + + infinicore::nn::QuantScheme get_quant_scheme() const { + if (quantization_method != nullptr) { + return quantization_method->get_quant_scheme(); + } else { + return infinicore::nn::QuantScheme::NONE; + } } + +private: + nlohmann::json quantization_config; + std::shared_ptr get_quantization_method() const; + std::shared_ptr quantization_method; }; } // namespace infinilm::config::quantization \ No newline at end of file diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp index 9e38e623..25b2b70d 100644 --- a/csrc/engine/infer_engine.cpp +++ b/csrc/engine/infer_engine.cpp @@ -1,5 +1,6 @@ #include "infer_engine.hpp" #include "spdlog/spdlog.h" +#include namespace infinilm::engine { @@ -19,9 +20,9 @@ InferEngine::InferEngine( if (cache_config != nullptr) { cache_config_ = cache_config->unique_copy(); } - if (!model_path.empty()) { - global_config_ = infinilm::config::global_config::GlobalConfig(model_path + "/config.json"); - } + // if (!model_path.empty()) { + this->global_config_ = 
std::make_shared(model_path + "/config.json"); + // Create one RankWorker per rank int world_size = communication_group_.get_world_size(); barrier_ = std::make_unique((size_t)world_size); diff --git a/csrc/engine/infer_engine.hpp b/csrc/engine/infer_engine.hpp index 3811e2ee..8c0102f8 100644 --- a/csrc/engine/infer_engine.hpp +++ b/csrc/engine/infer_engine.hpp @@ -54,7 +54,7 @@ class InferEngine { distributed::CommunicationGroup communication_group_; const InfinilmModel::Config &model_config_; std::unique_ptr cache_config_; - infinilm::config::global_config::GlobalConfig global_config_; + std::shared_ptr global_config_; }; } // namespace infinilm::engine diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp index ff78d31e..9bc7e8f6 100644 --- a/csrc/engine/rank_worker.cpp +++ b/csrc/engine/rank_worker.cpp @@ -13,7 +13,7 @@ namespace infinilm::engine { RankWorker::RankWorker(const InfinilmModel::Config &model_config, const distributed::RankInfo &rank_info, const cache::CacheConfig *cache_config, - const infinilm::config::global_config::GlobalConfig &global_config, + std::shared_ptr global_config, RankBarrier *barrier, bool enable_graph_compiling) : model_config_(model_config), @@ -32,7 +32,6 @@ RankWorker::RankWorker(const InfinilmModel::Config &model_config, } // start the thread thread_ = std::thread(&RankWorker::thread_loop, this); - // Wait until the worker thread finishes initialization (model created) std::unique_lock lk(mutex_); cv_.wait(lk, [&] { return init_done_; }); diff --git a/csrc/engine/rank_worker.hpp b/csrc/engine/rank_worker.hpp index 9a83dc0b..b079aef4 100644 --- a/csrc/engine/rank_worker.hpp +++ b/csrc/engine/rank_worker.hpp @@ -60,7 +60,7 @@ class RankWorker { RankWorker(const InfinilmModel::Config &model_config, const distributed::RankInfo &rank_info, const cache::CacheConfig *cache_config, - const infinilm::config::global_config::GlobalConfig &global_config, + std::shared_ptr global_config, RankBarrier *barrier, bool enable_graph_compiling); @@ -100,7 +100,7 @@ class RankWorker { distributed::RankInfo rank_info_; std::shared_ptr model_; std::shared_ptr cache_; - const infinilm::config::global_config::GlobalConfig &global_config_; + std::shared_ptr global_config_; // Graph Compiling bool enable_graph_compiling_; diff --git a/csrc/models/infinilm_model.hpp b/csrc/models/infinilm_model.hpp index 5c89a54c..be7ebd0d 100644 --- a/csrc/models/infinilm_model.hpp +++ b/csrc/models/infinilm_model.hpp @@ -13,7 +13,6 @@ class InfinilmModel : public infinicore::nn::Module { public: struct Config { std::string model_type; - nlohmann::json model_config; virtual ~Config() = default; }; diff --git a/csrc/models/llama/llama_attention.cpp b/csrc/models/llama/llama_attention.cpp index aac7d12d..f1545c5d 100644 --- a/csrc/models/llama/llama_attention.cpp +++ b/csrc/models/llama/llama_attention.cpp @@ -21,7 +21,7 @@ LlamaAttention::LlamaAttention(const LlamaConfig &config, const infinicore::Device &device, size_t layer_idx, engine::distributed::RankInfo rank_info, - const infinilm::config::global_config::GlobalConfig &global_config) + std::shared_ptr global_config) : layer_idx_(layer_idx), hidden_size_(config.hidden_size), num_attention_heads_(config.num_attention_heads), @@ -30,8 +30,9 @@ LlamaAttention::LlamaAttention(const LlamaConfig &config, kv_dim_(config.kv_dim()), use_bias_(config.attention_bias), use_output_bias_(config.attention_output_bias), - use_qk_norm_(config.qk_norm), - max_position_embeddings_(config.max_position_embeddings), rank_info_(rank_info) { + 
max_position_embeddings_(config.max_position_embeddings), + rank_info_(rank_info), + global_config_(global_config) { const auto &dtype{config.dtype}; int tp_rank = rank_info.tp_rank; @@ -48,27 +49,23 @@ LlamaAttention::LlamaAttention(const LlamaConfig &config, } scaling_ = 1.0f / std::sqrt(static_cast(head_dim_)); - // Initialize projection layers - // if (global_config.get_global_config_json().is_null()) { - if (!config.quant_config.has_value()) { + auto quant_scheme = this->global_config_->get_quant_scheme(); + switch (quant_scheme) { + case infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8: + INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, config.num_attention_heads, config.num_key_value_heads, use_bias_, + dtype, device, rank_info, quant_scheme); + + INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_output_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm, quant_scheme); + break; + + default: INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, config.num_attention_heads, config.num_key_value_heads, use_bias_, dtype, device, rank_info); - // Output projection uses attention_output_bias (can be different from qkv) + INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_output_bias_, dtype, device, tp_rank, tp_size, rank_info.comm); - } else { - switch (config.quant_config.value().get_quant_scheme()) { - case infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8: { - INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, config.num_attention_heads, config.num_key_value_heads, use_bias_, - dtype, device, rank_info, config.quant_config.value()); - - INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_output_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm, config.quant_config.value()); - break; - } - default: { - } - } + break; } } diff --git a/csrc/models/llama/llama_attention.hpp b/csrc/models/llama/llama_attention.hpp index d35a7e6a..ca9abe32 100644 --- a/csrc/models/llama/llama_attention.hpp +++ b/csrc/models/llama/llama_attention.hpp @@ -41,7 +41,7 @@ class LlamaAttention : public infinicore::nn::Module { const infinicore::Device &device, size_t layer_idx, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - const infinilm::config::global_config::GlobalConfig &global_config = infinilm::config::global_config::GlobalConfig()); + std::shared_ptr global_config = nullptr); /** * @brief Forward pass: compute attention @@ -115,6 +115,7 @@ class LlamaAttention : public infinicore::nn::Module { size_t max_position_embeddings_; // For cache initialization (deprecated, kept for compatibility) float scaling_; + std::shared_ptr global_config_; }; } // namespace infinilm::models::llama diff --git a/csrc/models/llama/llama_config.hpp b/csrc/models/llama/llama_config.hpp index ca02507d..0db2bcc8 100644 --- a/csrc/models/llama/llama_config.hpp +++ b/csrc/models/llama/llama_config.hpp @@ -68,10 +68,6 @@ struct LlamaConfig : public InfinilmModel::Config { std::vector bos_token_id = {1}; // Beginning of sequence token ID(s) std::vector eos_token_id = {2}; // End of sequence token ID(s) - // Quant Config - // std::optional quant_config = std::nullopt; - std::optional quant_config = infinicore::nn::QuantConfig(infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8); - /** * @brief Compute key-value dimension for Grouped Query Attention (GQA) * @return The dimension for key/value projections diff --git 
a/csrc/models/llama/llama_decoder_layer.cpp b/csrc/models/llama/llama_decoder_layer.cpp index c99dad6f..7958820c 100644 --- a/csrc/models/llama/llama_decoder_layer.cpp +++ b/csrc/models/llama/llama_decoder_layer.cpp @@ -9,7 +9,8 @@ namespace infinilm::models::llama { LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config, const infinicore::Device &device, size_t layer_idx, - engine::distributed::RankInfo rank_info) : layer_idx_(layer_idx), rank_info_(rank_info) { + engine::distributed::RankInfo rank_info, + std::shared_ptr global_config) : layer_idx_(layer_idx), rank_info_(rank_info), global_config_(global_config) { const auto &dtype{config.dtype}; // Initialize layer normalization layers @@ -19,8 +20,8 @@ LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config, dtype, device); // Initialize attention and MLP modules - INFINICORE_NN_MODULE_INIT(self_attn, config, device, layer_idx, rank_info_); - INFINICORE_NN_MODULE_INIT(mlp, config, device, rank_info_); + INFINICORE_NN_MODULE_INIT(self_attn, config, device, layer_idx, rank_info_, global_config); + INFINICORE_NN_MODULE_INIT(mlp, config, device, rank_info_, global_config); } std::tuple diff --git a/csrc/models/llama/llama_decoder_layer.hpp b/csrc/models/llama/llama_decoder_layer.hpp index 839d6d37..2198afb6 100644 --- a/csrc/models/llama/llama_decoder_layer.hpp +++ b/csrc/models/llama/llama_decoder_layer.hpp @@ -36,7 +36,8 @@ class LlamaDecoderLayer : public infinicore::nn::Module { LlamaDecoderLayer(const LlamaConfig &config, const infinicore::Device &device, size_t layer_idx, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + std::shared_ptr global_config = nullptr); /** * @brief Forward pass: process one decoder layer @@ -79,6 +80,7 @@ class LlamaDecoderLayer : public infinicore::nn::Module { INFINICORE_NN_MODULE(LlamaAttention, self_attn); INFINICORE_NN_MODULE(LlamaMLP, mlp); engine::distributed::RankInfo rank_info_; + std::shared_ptr global_config_; private: size_t layer_idx_; // Layer index for cache management and debugging diff --git a/csrc/models/llama/llama_for_causal_lm.cpp b/csrc/models/llama/llama_for_causal_lm.cpp index c7f8728e..8fdcffb4 100644 --- a/csrc/models/llama/llama_for_causal_lm.cpp +++ b/csrc/models/llama/llama_for_causal_lm.cpp @@ -8,7 +8,8 @@ namespace infinilm::models::llama { LlamaForCausalLM::LlamaForCausalLM(const LlamaConfig &config, const infinicore::Device &device, - engine::distributed::RankInfo rank_info) { + engine::distributed::RankInfo rank_info, + std::shared_ptr global_config) { // Initialize module's device_ member device_ = device; @@ -16,7 +17,7 @@ LlamaForCausalLM::LlamaForCausalLM(const LlamaConfig &config, const auto &dtype{config.dtype}; // Initialize base model - INFINICORE_NN_MODULE_INIT(model, config, device, rank_info); + INFINICORE_NN_MODULE_INIT(model, config, device, rank_info, global_config); // Initialize language modeling head // Note: If tie_word_embeddings is true, we would share weights with embed_tokens diff --git a/csrc/models/llama/llama_for_causal_lm.hpp b/csrc/models/llama/llama_for_causal_lm.hpp index 4b7275cd..5d260230 100644 --- a/csrc/models/llama/llama_for_causal_lm.hpp +++ b/csrc/models/llama/llama_for_causal_lm.hpp @@ -30,7 +30,8 @@ class LlamaForCausalLM : public InfinilmModel { */ LlamaForCausalLM(const LlamaConfig &config, const infinicore::Device &device, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + 
engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + std::shared_ptr global_config = nullptr); /** * @brief Forward pass: compute language modeling logits diff --git a/csrc/models/llama/llama_mlp.cpp b/csrc/models/llama/llama_mlp.cpp index 1ac3474b..3f457d82 100644 --- a/csrc/models/llama/llama_mlp.cpp +++ b/csrc/models/llama/llama_mlp.cpp @@ -7,33 +7,33 @@ namespace infinilm::models::llama { LlamaMLP::LlamaMLP(const LlamaConfig &config, const infinicore::Device &device, - engine::distributed::RankInfo rank_info) + engine::distributed::RankInfo rank_info, + std::shared_ptr global_config) : hidden_size_(config.hidden_size), intermediate_size_(config.intermediate_size), - use_bias_(config.mlp_bias), rank_info_(rank_info) { + use_bias_(config.mlp_bias), rank_info_(rank_info), global_config_(global_config) { const auto &dtype{config.dtype}; int tp_rank = rank_info.tp_rank; int tp_size = rank_info.tp_size; // Initialize projection layers - if (!config.quant_config.has_value()) { + auto quant_scheme = this->global_config_->get_quant_scheme(); + // std::cout << "LlamaMLP quant_scheme: " << static_cast(quant_scheme) << std::endl; + switch (quant_scheme) { + case infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8: + INFINILM_GATE_UP_LINEAR_W8A8_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, use_bias_, + dtype, device, rank_info_, quant_scheme); + INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, use_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm, quant_scheme); + break; + + default: INFINILM_GATE_UP_LINEAR_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, use_bias_, dtype, device, rank_info_); INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, use_bias_, dtype, device, tp_rank, tp_size, rank_info.comm); - } else { - switch (config.quant_config.value().get_quant_scheme()) { - case infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8: { - INFINILM_GATE_UP_LINEAR_W8A8_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, use_bias_, - dtype, device, rank_info_, config.quant_config.value()); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, use_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm, config.quant_config.value()); - break; - } - default: { - } - } + break; } } diff --git a/csrc/models/llama/llama_mlp.hpp b/csrc/models/llama/llama_mlp.hpp index 665dac70..42eacc1e 100644 --- a/csrc/models/llama/llama_mlp.hpp +++ b/csrc/models/llama/llama_mlp.hpp @@ -3,6 +3,7 @@ #include "../../layers/fused_linear.hpp" #include "llama_config.hpp" +#include "../../config/global_config.hpp" #include "infinicore/device.hpp" #include "infinicore/nn/linear.hpp" #include "infinicore/nn/module.hpp" @@ -35,7 +36,8 @@ class LlamaMLP : public infinicore::nn::Module { */ LlamaMLP(const LlamaConfig &config, const infinicore::Device &device, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + std::shared_ptr global_config = nullptr); /** * @brief Forward pass: compute MLP output @@ -57,6 +59,8 @@ class LlamaMLP : public infinicore::nn::Module { size_t hidden_size_; size_t intermediate_size_; bool use_bias_; + + std::shared_ptr global_config_; }; } // namespace infinilm::models::llama diff --git a/csrc/models/llama/llama_model.cpp b/csrc/models/llama/llama_model.cpp index f1de0618..e71cc55b 100644 --- a/csrc/models/llama/llama_model.cpp 
+++ b/csrc/models/llama/llama_model.cpp @@ -9,13 +9,13 @@ namespace infinilm::models::llama { LlamaModel::LlamaModel(const LlamaConfig &config, const infinicore::Device &device, - engine::distributed::RankInfo rank_info) - : config_(config), rank_info_(rank_info) { + engine::distributed::RankInfo rank_info, + std::shared_ptr global_config) + : config_(config), rank_info_(rank_info), global_config_(global_config) { const auto &dtype{config.dtype}; // Initialize token embeddings INFINICORE_NN_MODULE_INIT(embed_tokens, config.vocab_size, config.hidden_size, std::nullopt, dtype, device); - // Initialize decoder layers with layer indices // TODO: Update INFINICORE_NN_MODULE_VEC_INIT macro to support per-layer constructor arguments // (e.g., via a factory function or lambda that receives the layer index) @@ -23,7 +23,7 @@ LlamaModel::LlamaModel(const LlamaConfig &config, layers_.reserve(config.num_hidden_layers); for (size_t i = 0; i < config.num_hidden_layers; ++i) { layers_.push_back(this->register_module( - "layers." + std::to_string(i), config, device, i, rank_info)); + "layers." + std::to_string(i), config, device, i, rank_info, global_config_)); } // Initialize final layer normalization diff --git a/csrc/models/llama/llama_model.hpp b/csrc/models/llama/llama_model.hpp index 5a008b0f..b43fa542 100644 --- a/csrc/models/llama/llama_model.hpp +++ b/csrc/models/llama/llama_model.hpp @@ -1,7 +1,6 @@ #pragma once #include "../../cache/kv_cache.hpp" -#include "llama_config.hpp" #include "llama_decoder_layer.hpp" #include "infinicore/nn/embedding.hpp" @@ -40,7 +39,8 @@ class LlamaModel : public infinicore::nn::Module { */ LlamaModel(const LlamaConfig &config, const infinicore::Device &device, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + std::shared_ptr global_config = nullptr); /** * @brief Forward pass: process input through the model @@ -86,6 +86,7 @@ class LlamaModel : public infinicore::nn::Module { private: LlamaConfig config_; + std::shared_ptr global_config_; }; } // namespace infinilm::models::llama diff --git a/csrc/models/model_factory.cpp b/csrc/models/model_factory.cpp index f84d4905..cf783fb8 100644 --- a/csrc/models/model_factory.cpp +++ b/csrc/models/model_factory.cpp @@ -6,13 +6,13 @@ std::shared_ptr InfinilmModelFactory::createModel( const InfinilmModel::Config &config, engine::distributed::RankInfo rank_info, const cache::CacheConfig *cache, - const config::global_config::GlobalConfig &global_config) { + std::shared_ptr global_config) { std::shared_ptr model; if (const auto llama_config_ptr = dynamic_cast(&config)) { const auto &llama_config = *llama_config_ptr; model = std::make_shared( - llama_config, rank_info.device, rank_info); + llama_config, rank_info.device, rank_info, global_config); } else { throw std::invalid_argument("InfinilmModelFactory::createModel: Unsupported model config type"); } diff --git a/csrc/models/model_factory.hpp b/csrc/models/model_factory.hpp index 3d14ced3..fcf60708 100644 --- a/csrc/models/model_factory.hpp +++ b/csrc/models/model_factory.hpp @@ -12,6 +12,6 @@ class InfinilmModelFactory { const InfinilmModel::Config &config, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), const cache::CacheConfig *cache = nullptr, - const config::global_config::GlobalConfig &global_config = config::global_config::GlobalConfig()); + std::shared_ptr global_config = nullptr); }; } // namespace infinilm diff --git 
a/csrc/quantization/compressed_tensors.hpp b/csrc/quantization/compressed_tensors.hpp index c7d6eead..b290f9ac 100644 --- a/csrc/quantization/compressed_tensors.hpp +++ b/csrc/quantization/compressed_tensors.hpp @@ -1,19 +1,21 @@ #pragma once +// #include "../config/global_config.hpp" +#include "../config/quant_config.hpp" #include "quantization.hpp" // #include "utils.hpp" namespace infinilm::quantization { class CompressedTensors : public BaseQuantization { public: - CompressedTensors(const infinilm::config::global_config::GlobalConfig &global_config) - : BaseQuantization(global_config) { - quant_config_ = global_config.get_quant_config_json(); - } + explicit CompressedTensors(const nlohmann::json &quant_config) + : BaseQuantization(quant_config) { + // quant_config_ = global_config.get_quant_config_json(); + }; infinicore::nn::QuantScheme get_quant_scheme() const override { return infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8; - } + }; }; } // namespace infinilm::quantization \ No newline at end of file diff --git a/csrc/quantization/quantization.hpp b/csrc/quantization/quantization.hpp index 8bca2651..65f34aa1 100644 --- a/csrc/quantization/quantization.hpp +++ b/csrc/quantization/quantization.hpp @@ -1,19 +1,18 @@ #pragma once -#include "compressed_tensors.hpp" - -// #include "../config/quant_config.hpp" -#include "../config/global_config.hpp" +#include "../config/quant_config.hpp" #include "infinicore/nn/quantization.hpp" +#include "nlohmann/json.hpp" namespace infinilm::quantization { class BaseQuantization { public: - explicit BaseQuantization(const infinilm::config::global_config::GlobalConfig &global_config) {}; + explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config) {}; virtual ~BaseQuantization() = default; virtual infinicore::nn::QuantScheme get_quant_scheme() const = 0; protected: - infinilm::config::quantization::QuantConfig quant_config_; -} + // infinilm::config::quantization::QuantConfig quant_config_; + nlohmann::json quant_config_; +}; } // namespace infinilm::quantization \ No newline at end of file From f5f38a76d341e437ffff472bb937998393af1ae5 Mon Sep 17 00:00:00 2001 From: qinyiqun Date: Wed, 21 Jan 2026 17:26:04 +0800 Subject: [PATCH 05/11] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E9=83=A8=E5=88=86?= =?UTF-8?q?=E4=BB=A3=E7=A0=81=E7=BB=93=E6=9E=84=EF=BC=8C=E5=88=A0=E9=99=A4?= =?UTF-8?q?=E6=97=A0=E7=94=A8=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- csrc/config/global_config.cpp | 13 ++- csrc/config/global_config.hpp | 16 +-- csrc/config/quant_config.cpp | 4 +- csrc/config/quant_config.hpp | 7 +- csrc/engine/infer_engine.cpp | 3 +- csrc/layers/fused_linear.cpp | 66 ----------- csrc/layers/fused_linear.hpp | 30 +---- csrc/quantization/base_quantization.hpp | 18 +++ csrc/quantization/compressed_tensors.cpp | 6 - csrc/quantization/compressed_tensors.hpp | 14 +-- csrc/quantization/quantization.hpp | 19 +-- csrc/quantization/utils.hpp | 2 - examples/jiuge.py | 4 +- python/infinilm/modeling_utils.py | 2 - .../models/llama/configuration_llama.py | 13 +-- python/infinilm/models/quant_config.py | 110 ------------------ 16 files changed, 56 insertions(+), 271 deletions(-) create mode 100644 csrc/quantization/base_quantization.hpp delete mode 100644 csrc/quantization/compressed_tensors.cpp delete mode 100644 csrc/quantization/utils.hpp delete mode 100644 python/infinilm/models/quant_config.py diff --git a/csrc/config/global_config.cpp b/csrc/config/global_config.cpp index 
63d03fb0..93cd45be 100644 --- a/csrc/config/global_config.cpp +++ b/csrc/config/global_config.cpp @@ -1,5 +1,5 @@ #include "global_config.hpp" -#include + namespace infinilm::config::global_config { GlobalConfig::GlobalConfig(const std::string &path) { std::ifstream file(path); @@ -11,4 +11,13 @@ GlobalConfig::GlobalConfig(const std::string &path) { } this->quant_config = quantization::QuantConfig(config_json["quantization_config"]); } -} // namespace infinilm::config::global_config \ No newline at end of file + +infinicore::nn::QuantScheme +GlobalConfig::get_quant_scheme() const { + if (quant_config.get_quant_scheme() != infinicore::nn::QuantScheme::NONE) { + return quant_config.get_quant_scheme(); + } else { + return infinicore::nn::QuantScheme::NONE; + } +} +} // namespace infinilm::config::global_config diff --git a/csrc/config/global_config.hpp b/csrc/config/global_config.hpp index 6f9c32b6..1621142e 100644 --- a/csrc/config/global_config.hpp +++ b/csrc/config/global_config.hpp @@ -1,28 +1,22 @@ #pragma once - #include "quant_config.hpp" #include -#include #include namespace infinilm::config::global_config { struct GlobalConfig { - // Quantization configuration + // Global config is implemented using nlohmann/json and is primarily used for advanced configuration + // beyond the standard model config. It is initialized via GlobalConfig(const std::string& path) + // and passed through the InferEngine during inference. public: GlobalConfig() = default; GlobalConfig(const nlohmann::json &json) : config_json(json) {}; GlobalConfig(const std::string &path); - infinicore::nn::QuantScheme get_quant_scheme() const { - if (quant_config.get_quant_scheme() != infinicore::nn::QuantScheme::NONE) { - return quant_config.get_quant_scheme(); - } else { - return infinicore::nn::QuantScheme::NONE; - } - } + infinicore::nn::QuantScheme get_quant_scheme() const; private: nlohmann::json config_json; quantization::QuantConfig quant_config; }; -} // namespace infinilm::config::global_config \ No newline at end of file +} // namespace infinilm::config::global_config diff --git a/csrc/config/quant_config.cpp b/csrc/config/quant_config.cpp index 0ee47682..8984661f 100644 --- a/csrc/config/quant_config.cpp +++ b/csrc/config/quant_config.cpp @@ -1,5 +1,5 @@ #include "quant_config.hpp" -#include + namespace infinilm::config::quantization { QuantConfig::QuantConfig(const nlohmann::json &json) : quantization_config(json) { this->quantization_method = get_quantization_method(); @@ -19,4 +19,4 @@ QuantConfig::get_quantization_method() const { return nullptr; // Default case if no matching scheme } -} // namespace infinilm::config::quantization \ No newline at end of file +} // namespace infinilm::config::quantization diff --git a/csrc/config/quant_config.hpp b/csrc/config/quant_config.hpp index 46400eff..dec3750e 100644 --- a/csrc/config/quant_config.hpp +++ b/csrc/config/quant_config.hpp @@ -1,13 +1,12 @@ #pragma once - -#include "../quantization/compressed_tensors.hpp" #include "../quantization/quantization.hpp" #include "nlohmann/json.hpp" -#include namespace infinilm::config::quantization { class QuantConfig { + // QuantConfig is used to store and parse the "quantization" field from config.json. + // This is currently a basic version and will be extended in the future. 
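
For context on how this class is meant to be used, here is a minimal sketch of the resolution path from a config.json "quantization_config" block to a QuantScheme. It is illustrative only: the standalone main() and the bare include path are assumptions for the example, not part of the patch.

    // Illustrative sketch, not part of the patch.
    #include "quant_config.hpp" // infinilm::config::quantization::QuantConfig
    #include <cassert>

    int main() {
        // Minimal compressed-tensors style block, as found under
        // "quantization_config" in config.json.
        nlohmann::json quant_json = {
            {"quant_method", "compressed-tensors"},
            {"format", "int-quantized"},
        };

        // quant_method == "compressed-tensors" resolves to the int8 W8A8 scheme.
        infinilm::config::quantization::QuantConfig cfg(quant_json);
        assert(cfg.get_quant_scheme()
               == infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8);

        // A null quantization_config (or an unrecognized quant_method) resolves
        // to NONE, which keeps the non-quantized construction path in the layers.
        infinilm::config::quantization::QuantConfig plain{nlohmann::json(nullptr)};
        assert(plain.get_quant_scheme() == infinicore::nn::QuantScheme::NONE);
        return 0;
    }
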
public: QuantConfig() = default; QuantConfig(const nlohmann::json &json); @@ -26,4 +25,4 @@ class QuantConfig { std::shared_ptr quantization_method; }; -} // namespace infinilm::config::quantization \ No newline at end of file +} // namespace infinilm::config::quantization diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp index 25b2b70d..d63e0b4b 100644 --- a/csrc/engine/infer_engine.cpp +++ b/csrc/engine/infer_engine.cpp @@ -20,7 +20,8 @@ InferEngine::InferEngine( if (cache_config != nullptr) { cache_config_ = cache_config->unique_copy(); } - // if (!model_path.empty()) { + + // Load global config if model_path is provided, model_path must be valid, and config.json exists this->global_config_ = std::make_shared(model_path + "/config.json"); // Create one RankWorker per rank diff --git a/csrc/layers/fused_linear.cpp b/csrc/layers/fused_linear.cpp index 7f5ec364..700e8fde 100644 --- a/csrc/layers/fused_linear.cpp +++ b/csrc/layers/fused_linear.cpp @@ -6,57 +6,6 @@ namespace infinilm::layers { // --------------------------------------------------------- // QKV Parallel Linear // --------------------------------------------------------- -// QKVParallelLinear::QKVParallelLinear(size_t hidden_size, -// size_t head_dim, -// size_t num_q_head, -// size_t num_kv_head, -// bool bias, -// const infinicore::DataType &dtype, -// const infinicore::Device &device, -// engine::distributed::RankInfo rank_info) -// : QKVParallelLinear(hidden_size, -// head_dim, head_dim, head_dim, -// num_q_head, num_kv_head, num_kv_head, -// bias, bias, bias, -// dtype, device, rank_info) {} - -// QKVParallelLinear::QKVParallelLinear(size_t hidden_size, -// size_t q_dim, size_t k_dim, size_t v_dim, -// size_t num_q_head, size_t num_k_head, size_t num_v_head, -// bool q_bias, bool k_bias, bool v_bias, -// const infinicore::DataType &dtype, -// const infinicore::Device &device, -// engine::distributed::RankInfo rank_info) -// : infinicore::nn::ColumnParallelLinear( -// hidden_size, -// num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim, -// (q_bias || k_bias || v_bias), -// dtype, -// device, -// rank_info.tp_rank, -// rank_info.tp_size), -// q_dim_(q_dim), -// k_dim_(k_dim), -// v_dim_(v_dim), -// num_q_head_(num_q_head), -// num_k_head_(num_k_head), -// num_v_head_(num_v_head), -// q_bias_(q_bias), -// k_bias_(k_bias), -// v_bias_(v_bias) { -// if (num_q_head % tp_size_ != 0 || num_k_head % tp_size_ != 0 || num_v_head % tp_size_ != 0) { -// throw std::runtime_error("QKVParallelLinear: num_[q|k|v]_head must be divisible by tp_size"); -// } - -// if ((q_bias_ != k_bias_) || (k_bias_ != v_bias_)) { -// throw std::runtime_error("q_bias, k_bias, v_bias must all match"); -// } - -// q_out_size_ = num_q_head_ * q_dim_ / tp_size_; -// k_out_size_ = num_k_head_ * k_dim_ / tp_size_; -// v_out_size_ = num_v_head_ * v_dim_ / tp_size_; -// } - QKVParallelLinear::QKVParallelLinear(size_t hidden_size, size_t head_dim, size_t num_q_head, @@ -192,21 +141,6 @@ bool QKVParallelLinear::has_v_bias() const { return v_bias_; } // --------------------------------------------------------- // Gate-Up Parallel Linear // --------------------------------------------------------- -// GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias, -// const infinicore::DataType &dtype, const infinicore::Device &device, -// engine::distributed::RankInfo rank_info) -// : GateUpParallelLinear(hidden_size, intermediate_size, bias, bias, dtype, device, rank_info) { -// } - -// 
GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, -// const infinicore::DataType &dtype, const infinicore::Device &device, -// engine::distributed::RankInfo rank_info) -// : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) { -// if (gate_bias_ != up_bias_) { -// throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time"); -// } -// } - GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias, const infinicore::DataType &dtype, const infinicore::Device &device, engine::distributed::RankInfo rank_info, diff --git a/csrc/layers/fused_linear.hpp b/csrc/layers/fused_linear.hpp index 8bde20d8..f4220fce 100644 --- a/csrc/layers/fused_linear.hpp +++ b/csrc/layers/fused_linear.hpp @@ -7,23 +7,6 @@ namespace infinilm::layers { class QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { public: - // explicit QKVParallelLinear(size_t hidden_size, - // size_t q_dim, size_t k_dim, size_t v_dim, - // size_t num_q_head, size_t num_k_head, size_t num_v_head, - // bool q_bias, bool k_bias, bool v_bias, - // const infinicore::DataType &dtype = infinicore::DataType::F32, - // const infinicore::Device &device = infinicore::Device(), - // engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - - // // A more common case where all heads have the same dimension - // explicit QKVParallelLinear(size_t hidden_size, - // size_t head_dim, - // size_t num_q_head, size_t num_kv_head, - // bool bias = false, - // const infinicore::DataType &dtype = infinicore::DataType::F32, - // const infinicore::Device &device = infinicore::Device(), - // engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - explicit QKVParallelLinear(size_t hidden_size, size_t q_dim, size_t k_dim, size_t v_dim, size_t num_q_head, size_t num_k_head, size_t num_v_head, @@ -79,15 +62,6 @@ class QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { class GateUpParallelLinear : public infinicore::nn::ColumnParallelLinear { public: - // GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias = false, - // const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), - // engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - - // GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, - // const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), - // engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); - - // Overload for quantization, old ones need tobe purged GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias = false, const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), @@ -142,7 +116,7 @@ class GateUpParallelLinear : public infinicore::nn::ColumnParallelLinear { if (name##_->has_up_bias()) \ this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias()); -// ========================= QKV 量化 ================================== +// ========================= QKV Quantization 
================================== #define INFINILM_QKV_LINEAR_W8A8_INIT(name, q_name, k_name, v_name, ...) \ name##_ = std::make_shared(__VA_ARGS__); \ /* 注册 Q 权重 */ \ @@ -162,7 +136,7 @@ class GateUpParallelLinear : public infinicore::nn::ColumnParallelLinear { if (name##_->has_v_bias()) \ this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias()); -// ========================= Gate-Up 量化 ============================== +// ========================= Gate-Up Quantization ============================== #define INFINILM_GATE_UP_LINEAR_W8A8_INIT(name, gate_name, up_name, ...) \ name##_ = std::make_shared(__VA_ARGS__); \ /* 注册 Gate 权重 */ \ diff --git a/csrc/quantization/base_quantization.hpp b/csrc/quantization/base_quantization.hpp new file mode 100644 index 00000000..0d1f52ce --- /dev/null +++ b/csrc/quantization/base_quantization.hpp @@ -0,0 +1,18 @@ +#pragma once +#include "../config/quant_config.hpp" +#include "infinicore/nn/quantization.hpp" +#include "nlohmann/json.hpp" + +namespace infinilm::quantization { +class BaseQuantization { + // Base class for quantization schemes. Intended to be extended to support various quantization methods. +public: + explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config) {}; + virtual ~BaseQuantization() = default; + + virtual infinicore::nn::QuantScheme get_quant_scheme() const = 0; + +protected: + nlohmann::json quant_config_; +}; +} // namespace infinilm::quantization diff --git a/csrc/quantization/compressed_tensors.cpp b/csrc/quantization/compressed_tensors.cpp deleted file mode 100644 index f5b71bcc..00000000 --- a/csrc/quantization/compressed_tensors.cpp +++ /dev/null @@ -1,6 +0,0 @@ -// #include "compressed_tensors.hpp" - -// infinicore::nn::QuantScheme CompressedTensors::get_quant_scheme() { -// // need to add more schemes later -// return infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8; -// } \ No newline at end of file diff --git a/csrc/quantization/compressed_tensors.hpp b/csrc/quantization/compressed_tensors.hpp index b290f9ac..f502f398 100644 --- a/csrc/quantization/compressed_tensors.hpp +++ b/csrc/quantization/compressed_tensors.hpp @@ -1,16 +1,16 @@ #pragma once -// #include "../config/global_config.hpp" + #include "../config/quant_config.hpp" -#include "quantization.hpp" -// #include "utils.hpp" +#include "base_quantization.hpp" namespace infinilm::quantization { class CompressedTensors : public BaseQuantization { + // This is a temporary class that currently only returns COMPRESSED_TENSOR_W8A8I8. + // Future enhancements should parse quant_config to extract detailed quantization + // information and support multiple quantization schemes. 
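
As a pointer for that follow-up work, the sketch below shows one way a scheme class could inspect the config instead of hard-coding the result. The class name is hypothetical, and the field names ("config_groups", "weights", "num_bits", "input_activations") are taken from the compressed-tensors export format; treat it as an assumption-laden illustration, not a proposed implementation.

    namespace infinilm::quantization {

    // Hypothetical example only: derive the scheme by inspecting the JSON
    // rather than returning a fixed value.
    class CompressedTensorsParsed : public BaseQuantization {
    public:
        explicit CompressedTensorsParsed(const nlohmann::json &quant_config)
            : BaseQuantization(quant_config) {}

        infinicore::nn::QuantScheme get_quant_scheme() const override {
            if (quant_config_.contains("config_groups")) {
                for (const auto &group : quant_config_.at("config_groups")) {
                    const auto &weights = group.at("weights");
                    // 8-bit weights plus quantized input activations -> int8 W8A8.
                    if (weights.value("num_bits", 0) == 8 && group.contains("input_activations")) {
                        return infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8;
                    }
                }
            }
            return infinicore::nn::QuantScheme::NONE;
        }
    };

    } // namespace infinilm::quantization

The remaining extension point is QuantConfig::get_quantization_method() in quant_config.cpp, which would branch on the corresponding quant_method string to construct such a class.
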
public: explicit CompressedTensors(const nlohmann::json &quant_config) - : BaseQuantization(quant_config) { - // quant_config_ = global_config.get_quant_config_json(); - }; + : BaseQuantization(quant_config) {}; infinicore::nn::QuantScheme get_quant_scheme() const override { @@ -18,4 +18,4 @@ class CompressedTensors : public BaseQuantization { }; }; -} // namespace infinilm::quantization \ No newline at end of file +} // namespace infinilm::quantization diff --git a/csrc/quantization/quantization.hpp b/csrc/quantization/quantization.hpp index 65f34aa1..48b7646e 100644 --- a/csrc/quantization/quantization.hpp +++ b/csrc/quantization/quantization.hpp @@ -1,18 +1,5 @@ #pragma once -#include "../config/quant_config.hpp" -#include "infinicore/nn/quantization.hpp" -#include "nlohmann/json.hpp" - -namespace infinilm::quantization { -class BaseQuantization { -public: - explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config) {}; - virtual ~BaseQuantization() = default; - virtual infinicore::nn::QuantScheme get_quant_scheme() const = 0; - -protected: - // infinilm::config::quantization::QuantConfig quant_config_; - nlohmann::json quant_config_; -}; -} // namespace infinilm::quantization \ No newline at end of file +#include "base_quantization.hpp" +#include "compressed_tensors.hpp" +#include "infinicore/nn/quantization.hpp" diff --git a/csrc/quantization/utils.hpp b/csrc/quantization/utils.hpp deleted file mode 100644 index 1ae21db2..00000000 --- a/csrc/quantization/utils.hpp +++ /dev/null @@ -1,2 +0,0 @@ -#include "../config/global_config.hpp" -#include "infinicore/nn/quantization.hpp" \ No newline at end of file diff --git a/examples/jiuge.py b/examples/jiuge.py index a4b8c28f..653a1a55 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -56,7 +56,7 @@ def get_args(): parser.add_argument( "--max_new_tokens", type=int, - default=1000, + default=100, help="max_new_tokens", ) parser.add_argument( @@ -121,7 +121,7 @@ def get_args(): def test( prompts: str | list[str], model_path, - max_new_tokens=5000, + max_new_tokens=100, infini_device=infinicore.device("cpu", 0), tp=1, enable_paged_attn=False, diff --git a/python/infinilm/modeling_utils.py b/python/infinilm/modeling_utils.py index a8d987ca..d1b26dd9 100644 --- a/python/infinilm/modeling_utils.py +++ b/python/infinilm/modeling_utils.py @@ -75,7 +75,6 @@ def load_state_dict( ) for k in f.keys(): - # state_dict[k] = f.get_tensor(k).to(device=device, dtype=dtype) state_dict[k] = f.get_tensor(k).to(device=device) return state_dict @@ -148,7 +147,6 @@ def load_model_state_dict_by_file( model_param = load_state_dict( file_path, device=torch_device, dtype=torch_dtype ) - already_loaded_keys.extend(model_param.keys()) # --------------------------------------------------------- # diff --git a/python/infinilm/models/llama/configuration_llama.py b/python/infinilm/models/llama/configuration_llama.py index f893c5cf..8d07a657 100644 --- a/python/infinilm/models/llama/configuration_llama.py +++ b/python/infinilm/models/llama/configuration_llama.py @@ -15,13 +15,11 @@ """LLaMA model configuration""" -from typing import Optional import infinicore from infinilm.lib import _infinilm from ...configuration_utils import PretrainedConfig -from ..quant_config import parse_quant_config, QuantizationConfig class LlamaConfig(PretrainedConfig, _infinilm.LlamaConfig): r""" @@ -183,7 +181,6 @@ def __init__( mlp_bias=False, head_dim=None, torch_dtype=None, - quantization_config=None, **kwargs, ): _infinilm.LlamaConfig.__init__(self) @@ 
-246,12 +243,4 @@ def __init__( eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs, - ) - - if isinstance(quantization_config, dict): - self.quantization_config: Optional[QuantizationConfig] = parse_quant_config(quantization_config) - self.quantization_config_dict = quantization_config - else: - self.quantization_config = None - self.quantization_config_dict = None - + ) \ No newline at end of file diff --git a/python/infinilm/models/quant_config.py b/python/infinilm/models/quant_config.py deleted file mode 100644 index 9e8ea0bf..00000000 --- a/python/infinilm/models/quant_config.py +++ /dev/null @@ -1,110 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2025, InfiniCore -# BSD 3-Clause License - -from abc import ABC, abstractmethod -from dataclasses import dataclass, field -from typing import Dict, List, Optional, Any, Type - -# ---------------- 抽象层 ---------------- -class QuantizationConfig(ABC): - """InfiniCore 量化统一入口,C++ 或 Python 侧都只认这四个接口。""" - @abstractmethod - def get_name(self) -> str: ... - @abstractmethod - def get_min_capability(self) -> int: ... - @abstractmethod - def get_scaled_act_names(self) -> List[str]: ... - @abstractmethod - def get_quant_method(self) -> str: - """返回算法名,供 C++ dispatcher 用。""" - ... - -# ---------------- 数据类 ---------------- -@dataclass -class CompressedTensorsConfig(QuantizationConfig): - """对应 HF compressed-tensors 导出格式。""" - quant_method: str = "compressed-tensors" - format: str = "int-quantized" - quantization_status: str = "compressed" - version: str = "0.11.0" - global_compression_ratio: Optional[float] = None - ignore: List[str] = field(default_factory=lambda: ["lm_head"]) - kv_cache_scheme: Optional[Dict[str, Any]] = None - sparsity_config: Dict[str, Any] = field(default_factory=dict) - transform_config: Dict[str, Any] = field(default_factory=dict) - config_groups: Dict[str, "Group"] = field(default_factory=dict) - - @dataclass - class TensorConfig: - num_bits: int - type: str - symmetric: bool - dynamic: bool - strategy: str - observer: Optional[str] = None - observer_kwargs: Dict[str, Any] = field(default_factory=dict) - group_size: Optional[int] = None - block_structure: Optional[str] = None - actorder: Optional[Any] = None - - @dataclass - class Group: - targets: List[str] - weights: "CompressedTensorsConfig.TensorConfig" - input_activations: Optional["CompressedTensorsConfig.TensorConfig"] = None - output_activations: Optional["CompressedTensorsConfig.TensorConfig"] = None - format: str = "int-quantized" - - @staticmethod - def from_dict(cfg: Dict[str, Any]) -> "CompressedTensorsConfig": - def _build_tensor(obj: Optional[Dict[str, Any]]) -> Optional["CompressedTensorsConfig.TensorConfig"]: - return None if obj is None else CompressedTensorsConfig.TensorConfig(**obj) - - groups = {} - for gname, gcfg in cfg.get("config_groups", {}).items(): - groups[gname] = CompressedTensorsConfig.Group( - targets=gcfg["targets"], - weights=_build_tensor(gcfg["weights"]), - input_activations=_build_tensor(gcfg.get("input_activations")), - output_activations=_build_tensor(gcfg.get("output_activations")), - format=gcfg.get("format", "int-quantized"), - ) - return CompressedTensorsConfig( - quant_method=cfg["quant_method"], - format=cfg["format"], - quantization_status=cfg["quantization_status"], - version=cfg["version"], - global_compression_ratio=cfg.get("global_compression_ratio"), - ignore=cfg.get("ignore", ["lm_head"]), - kv_cache_scheme=cfg.get("kv_cache_scheme"), - sparsity_config=cfg.get("sparsity_config", {}), - 
transform_config=cfg.get("transform_config", {}), - config_groups=groups, - ) - - def get_name(self) -> str: - return self.quant_method - - def get_min_capability(self) -> int: - return 75 - - def get_scaled_act_names(self) -> List[str]: - return [] - - def get_quant_method(self) -> str: - return self.quant_method - - -_QUANT_METHOD_MAP: Dict[str, Type[QuantizationConfig]] = { - "compressed-tensors": CompressedTensorsConfig, -} - -def parse_quant_config(quant_cfg: Dict[str, Any]) -> Optional[QuantizationConfig]: - """统一解析入口,供 LlamaConfig 调用。""" - method = quant_cfg.get("quant_method") - cls = _QUANT_METHOD_MAP.get(method) - if cls is None: - return None - - return cls.from_dict(quant_cfg) \ No newline at end of file From f57de3e94026a19841aab281a57d2347be045c48 Mon Sep 17 00:00:00 2001 From: qinyiqun Date: Thu, 22 Jan 2026 11:41:04 +0800 Subject: [PATCH 06/11] =?UTF-8?q?=E8=B7=9F=E9=9A=8Finifnicore=E4=BF=AE?= =?UTF-8?q?=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- csrc/layers/fused_linear.cpp | 16 ++++++++-------- csrc/layers/fused_linear.hpp | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/csrc/layers/fused_linear.cpp b/csrc/layers/fused_linear.cpp index 700e8fde..e108b275 100644 --- a/csrc/layers/fused_linear.cpp +++ b/csrc/layers/fused_linear.cpp @@ -14,13 +14,13 @@ QKVParallelLinear::QKVParallelLinear(size_t hidden_size, const infinicore::DataType &dtype, const infinicore::Device &device, engine::distributed::RankInfo rank_info, - std::optional quant_config) + std::optional quant_scheme) : QKVParallelLinear(hidden_size, head_dim, head_dim, head_dim, num_q_head, num_kv_head, num_kv_head, bias, bias, bias, dtype, device, rank_info, - quant_config) {} + quant_scheme) {} QKVParallelLinear::QKVParallelLinear(size_t hidden_size, size_t q_dim, size_t k_dim, size_t v_dim, @@ -29,7 +29,7 @@ QKVParallelLinear::QKVParallelLinear(size_t hidden_size, const infinicore::DataType &dtype, const infinicore::Device &device, engine::distributed::RankInfo rank_info, - std::optional quant_config) + std::optional quant_scheme) : infinicore::nn::ColumnParallelLinear( hidden_size, num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim, @@ -38,7 +38,7 @@ QKVParallelLinear::QKVParallelLinear(size_t hidden_size, device, rank_info.tp_rank, rank_info.tp_size, - quant_config), + quant_scheme), q_dim_(q_dim), k_dim_(k_dim), v_dim_(v_dim), @@ -144,15 +144,15 @@ bool QKVParallelLinear::has_v_bias() const { return v_bias_; } GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias, const infinicore::DataType &dtype, const infinicore::Device &device, engine::distributed::RankInfo rank_info, - std::optional quant_config) - : GateUpParallelLinear(hidden_size, intermediate_size, bias, bias, dtype, device, rank_info, quant_config) { + std::optional quant_scheme) + : GateUpParallelLinear(hidden_size, intermediate_size, bias, bias, dtype, device, rank_info, quant_scheme) { } GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, const infinicore::DataType &dtype, const infinicore::Device &device, engine::distributed::RankInfo rank_info, - std::optional quant_config) - : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size, quant_config), gate_bias_(gate_bias), up_bias_(up_bias) { + std::optional quant_scheme) + : 
infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size, quant_scheme), gate_bias_(gate_bias), up_bias_(up_bias) { if (gate_bias_ != up_bias_) { throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time"); } diff --git a/csrc/layers/fused_linear.hpp b/csrc/layers/fused_linear.hpp index f4220fce..f3d95bae 100644 --- a/csrc/layers/fused_linear.hpp +++ b/csrc/layers/fused_linear.hpp @@ -14,7 +14,7 @@ class QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - std::optional quant_config = std::nullopt); + std::optional quant_scheme = std::nullopt); // A more common case where all heads have the same dimension explicit QKVParallelLinear(size_t hidden_size, @@ -24,7 +24,7 @@ class QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - std::optional quant_config = std::nullopt); + std::optional quant_scheme = std::nullopt); std::tuple forward_split(infinicore::Tensor &input); @@ -65,12 +65,12 @@ class GateUpParallelLinear : public infinicore::nn::ColumnParallelLinear { GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias = false, const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - std::optional quant_config = std::nullopt); + std::optional quant_scheme = std::nullopt); GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - std::optional quant_config = std::nullopt); + std::optional quant_scheme = std::nullopt); std::tuple forward_split(infinicore::Tensor &input); From 6af3653ea7aad9f98783c898e119ada89c19fd69 Mon Sep 17 00:00:00 2001 From: qinyiqun Date: Fri, 23 Jan 2026 10:08:34 +0800 Subject: [PATCH 07/11] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=89=80=E6=9C=89?= =?UTF-8?q?=E7=9A=84model=5Fconfig=EF=BC=8C=E7=BB=9F=E4=B8=80=E4=BD=BF?= =?UTF-8?q?=E7=94=A8global=5Fconfig?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- csrc/config/global_config.cpp | 65 +++++++++++++++++++++++ csrc/config/global_config.hpp | 38 +++++++++++++ csrc/engine/infer_engine.cpp | 6 +-- csrc/engine/infer_engine.hpp | 3 +- csrc/engine/rank_worker.cpp | 17 +++--- csrc/engine/rank_worker.hpp | 5 +- csrc/models/llama/llama_attention.cpp | 29 +++++----- csrc/models/llama/llama_attention.hpp | 3 +- csrc/models/llama/llama_config.hpp | 2 - csrc/models/llama/llama_decoder_layer.cpp | 13 +++-- csrc/models/llama/llama_decoder_layer.hpp | 3 +- csrc/models/llama/llama_for_causal_lm.cpp | 10 ++-- csrc/models/llama/llama_for_causal_lm.hpp | 5 +- csrc/models/llama/llama_mlp.cpp | 11 ++-- csrc/models/llama/llama_mlp.hpp | 3 +- csrc/models/llama/llama_model.cpp | 52 +++++++++--------- csrc/models/llama/llama_model.hpp | 7 +-- csrc/models/model_factory.cpp | 8 +-- 
csrc/models/model_factory.hpp | 2 +- csrc/pybind11/engine/engine.hpp | 6 +-- python/infinilm/infer_engine.py | 2 +- 21 files changed, 188 insertions(+), 102 deletions(-) diff --git a/csrc/config/global_config.cpp b/csrc/config/global_config.cpp index 93cd45be..2f3ce308 100644 --- a/csrc/config/global_config.cpp +++ b/csrc/config/global_config.cpp @@ -20,4 +20,69 @@ GlobalConfig::get_quant_scheme() const { return infinicore::nn::QuantScheme::NONE; } } + +std::shared_ptr +GlobalConfig::get_rope_scaling() const { + if (!config_json.contains("rope_scaling") || config_json["rope_scaling"].is_null()) { + return nullptr; + } + + const auto &rope_scaling = config_json["rope_scaling"]; + if (!rope_scaling.is_object()) { + throw std::runtime_error("rope_scaling must be an object"); + } + + if (!rope_scaling.contains("type")) { + throw std::runtime_error("rope_scaling must contain 'type' field"); + } + + std::string type_str = rope_scaling["type"].get(); + if (type_str == "longrope") { + // Required fields for LongRopeConfig + if (!rope_scaling.contains("short_factor") || !rope_scaling.contains("long_factor") || !rope_scaling.contains("original_max_position_embeddings")) { + throw std::runtime_error( + "LongRopeConfig requires 'short_factor', 'long_factor', and 'original_max_position_embeddings'"); + } + + auto short_factor = rope_scaling["short_factor"].get>(); + auto long_factor = rope_scaling["long_factor"].get>(); + size_t original_max_position_embeddings = rope_scaling["original_max_position_embeddings"].get(); + + float factor = 1.0f; + if (rope_scaling.contains("factor")) { + factor = rope_scaling["factor"].get(); + } + + return std::make_shared( + std::move(short_factor), + std::move(long_factor), + original_max_position_embeddings, + factor); + } else if (type_str == "default" || type_str == "none") { + // Default scaling, no scaling applied + return nullptr; + } else { + throw std::runtime_error("Unsupported rope_scaling type: " + type_str); + } +} + +infinicore::DataType +GlobalConfig::get_dtype() const { + try { + std::string dtype_str = this->get("torch_dtype"); + if (dtype_str == "float32") { + return infinicore::DataType::F32; + } else if (dtype_str == "float16") { + return infinicore::DataType::F16; + } else if (dtype_str == "bfloat16") { + return infinicore::DataType::BF16; + } else if (dtype_str == "int8") { + return infinicore::DataType::I8; + } else { + throw std::runtime_error("Unsupported dtype string: " + dtype_str); + } + } catch (const std::exception &e) { + throw std::runtime_error("Error getting dtype from config: " + std::string(e.what())); + } +} } // namespace infinilm::config::global_config diff --git a/csrc/config/global_config.hpp b/csrc/config/global_config.hpp index 1621142e..dac30565 100644 --- a/csrc/config/global_config.hpp +++ b/csrc/config/global_config.hpp @@ -1,4 +1,8 @@ #pragma once + +// #include "infinicore/nn/quantization.hpp" +#include "infinicore/nn/rope.hpp" +#include "infinicore/ops.hpp" #include "quant_config.hpp" #include #include @@ -13,7 +17,41 @@ struct GlobalConfig { GlobalConfig(const nlohmann::json &json) : config_json(json) {}; GlobalConfig(const std::string &path); + // Template Function to get a value by key with type safety + template + T get(const std::string &key) const { + if (!config_json.contains(key)) { + throw std::out_of_range("Key '" + key + "' not found in config."); + } + try { + return config_json.at(key).get(); + } catch (const nlohmann::json::type_error &e) { + throw std::runtime_error("Type conversion failed for key '" + key 
+ "': " + std::string(e.what())); + } + } + + template + T get_or(const std::string &key, const T &default_value) const { + if (!config_json.contains(key) || config_json.at(key).is_null()) { + return default_value; + } + try { + return config_json.at(key).get(); + } catch (const nlohmann::json::type_error &) { + // If type conversion fails, return default value + return default_value; + } + } + size_t get_kv_dim() const { + return get("hidden_size") * get("num_key_value_heads") / get("num_attention_heads"); + } + size_t get_head_dim() const { + return get("hidden_size") / get("num_attention_heads"); + } + + infinicore::DataType get_dtype() const; infinicore::nn::QuantScheme get_quant_scheme() const; + std::shared_ptr get_rope_scaling() const; private: nlohmann::json config_json; diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp index d63e0b4b..6a903688 100644 --- a/csrc/engine/infer_engine.cpp +++ b/csrc/engine/infer_engine.cpp @@ -8,14 +8,12 @@ namespace infinilm::engine { // Constructor //------------------------------------------------------ InferEngine::InferEngine( - const InfinilmModel::Config &config, const distributed::DistConfig &distributed_config, infinicore::Device::Type device_type, const cache::CacheConfig *cache_config, const std::string &model_path, bool enable_graph_compiling) // Changed parameter - : communication_group_(distributed_config, device_type), - model_config_(config) { + : communication_group_(distributed_config, device_type) { if (cache_config != nullptr) { cache_config_ = cache_config->unique_copy(); @@ -30,7 +28,7 @@ InferEngine::InferEngine( workers_.reserve(world_size); for (int r = 0; r < world_size; ++r) { workers_.emplace_back(std::make_unique( - model_config_, + // model_config_, communication_group_.get_rank_info(r), cache_config_ != nullptr ? 
cache_config_.get() : nullptr, global_config_, diff --git a/csrc/engine/infer_engine.hpp b/csrc/engine/infer_engine.hpp index 8c0102f8..e8263354 100644 --- a/csrc/engine/infer_engine.hpp +++ b/csrc/engine/infer_engine.hpp @@ -21,7 +21,6 @@ class InferEngine { // Updated constructor: accept CacheConfig instead of CacheType InferEngine( - const InfinilmModel::Config &config, const distributed::DistConfig &distributed_config = distributed::DistConfig(), infinicore::Device::Type device_type = infinicore::context::getDevice().getType(), const cache::CacheConfig *cache_config = nullptr, @@ -52,7 +51,7 @@ class InferEngine { std::vector> workers_; std::unique_ptr barrier_; distributed::CommunicationGroup communication_group_; - const InfinilmModel::Config &model_config_; + // const InfinilmModel::Config &model_config_; std::unique_ptr cache_config_; std::shared_ptr global_config_; }; diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp index 9bc7e8f6..be8ab1f4 100644 --- a/csrc/engine/rank_worker.cpp +++ b/csrc/engine/rank_worker.cpp @@ -10,14 +10,13 @@ namespace infinilm::engine { -RankWorker::RankWorker(const InfinilmModel::Config &model_config, - const distributed::RankInfo &rank_info, - const cache::CacheConfig *cache_config, - std::shared_ptr global_config, - RankBarrier *barrier, - bool enable_graph_compiling) - : model_config_(model_config), - rank_info_(rank_info), +RankWorker::RankWorker( + const distributed::RankInfo &rank_info, + const cache::CacheConfig *cache_config, + std::shared_ptr global_config, + RankBarrier *barrier, + bool enable_graph_compiling) + : rank_info_(rank_info), enable_graph_compiling_(enable_graph_compiling), job_cmd_(Command::INIT), has_job_(false), @@ -196,7 +195,7 @@ void RankWorker::thread_loop() { infinicore::context::setDevice(rank_info_.device); // Create model using factory (may be expensive) - model_ = InfinilmModelFactory::createModel(model_config_, rank_info_, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr, global_config_); + model_ = InfinilmModelFactory::createModel(rank_info_, pending_cache_config_ != nullptr ? 
pending_cache_config_.get() : nullptr, global_config_); if (!model_) { throw std::runtime_error("Failed to create model"); } diff --git a/csrc/engine/rank_worker.hpp b/csrc/engine/rank_worker.hpp index b079aef4..7bc9da1d 100644 --- a/csrc/engine/rank_worker.hpp +++ b/csrc/engine/rank_worker.hpp @@ -57,8 +57,7 @@ class RankWorker { infinicore::Tensor output_ids; }; - RankWorker(const InfinilmModel::Config &model_config, - const distributed::RankInfo &rank_info, + RankWorker(const distributed::RankInfo &rank_info, const cache::CacheConfig *cache_config, std::shared_ptr global_config, RankBarrier *barrier, @@ -96,7 +95,7 @@ class RankWorker { private: // Worker properties - const InfinilmModel::Config &model_config_; + // const InfinilmModel::Config &model_config_; distributed::RankInfo rank_info_; std::shared_ptr model_; std::shared_ptr cache_; diff --git a/csrc/models/llama/llama_attention.cpp b/csrc/models/llama/llama_attention.cpp index f1545c5d..879ead0e 100644 --- a/csrc/models/llama/llama_attention.cpp +++ b/csrc/models/llama/llama_attention.cpp @@ -17,29 +17,28 @@ namespace infinilm::models::llama { -LlamaAttention::LlamaAttention(const LlamaConfig &config, - const infinicore::Device &device, +LlamaAttention::LlamaAttention(const infinicore::Device &device, size_t layer_idx, engine::distributed::RankInfo rank_info, std::shared_ptr global_config) : layer_idx_(layer_idx), - hidden_size_(config.hidden_size), - num_attention_heads_(config.num_attention_heads), - num_key_value_heads_(config.num_key_value_heads), - head_dim_(config.head_dim), - kv_dim_(config.kv_dim()), - use_bias_(config.attention_bias), - use_output_bias_(config.attention_output_bias), - max_position_embeddings_(config.max_position_embeddings), + hidden_size_(global_config->get("hidden_size")), + num_attention_heads_(global_config->get("num_attention_heads")), + num_key_value_heads_(global_config->get("num_key_value_heads")), + head_dim_(global_config->get_head_dim()), + kv_dim_(global_config->get_kv_dim()), + use_bias_(global_config->get_or("attention_bias", true)), + use_output_bias_(global_config->get_or("attention_output_bias", false)), + max_position_embeddings_(global_config->get("max_position_embeddings")), rank_info_(rank_info), global_config_(global_config) { - const auto &dtype{config.dtype}; + const auto &dtype{global_config_->get_dtype()}; int tp_rank = rank_info.tp_rank; int tp_size = rank_info.tp_size; - int num_attention_heads = config.num_attention_heads; - int num_key_value_heads = config.num_key_value_heads; + int num_attention_heads = global_config_->get("num_attention_heads"); + int num_key_value_heads = global_config_->get("num_key_value_heads"); if ((num_key_value_heads >= tp_size) && (0 == (num_key_value_heads % tp_size))) { this->num_attention_heads_ = num_attention_heads / tp_size; @@ -52,7 +51,7 @@ LlamaAttention::LlamaAttention(const LlamaConfig &config, auto quant_scheme = this->global_config_->get_quant_scheme(); switch (quant_scheme) { case infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8: - INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, config.num_attention_heads, config.num_key_value_heads, use_bias_, + INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, global_config_->get("num_attention_heads"), global_config_->get("num_key_value_heads"), use_bias_, dtype, device, rank_info, quant_scheme); INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_output_bias_, @@ -60,7 +59,7 @@ 
LlamaAttention::LlamaAttention(const LlamaConfig &config, break; default: - INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, config.num_attention_heads, config.num_key_value_heads, use_bias_, + INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, global_config_->get("num_attention_heads"), global_config_->get("num_key_value_heads"), use_bias_, dtype, device, rank_info); INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_output_bias_, diff --git a/csrc/models/llama/llama_attention.hpp b/csrc/models/llama/llama_attention.hpp index ca9abe32..231169b0 100644 --- a/csrc/models/llama/llama_attention.hpp +++ b/csrc/models/llama/llama_attention.hpp @@ -37,8 +37,7 @@ class LlamaAttention : public infinicore::nn::Module { * @param layer_idx Layer index for cache access * @param dtype Optional data type for model parameters (defaults to F32) */ - LlamaAttention(const LlamaConfig &config, - const infinicore::Device &device, + LlamaAttention(const infinicore::Device &device, size_t layer_idx, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), std::shared_ptr global_config = nullptr); diff --git a/csrc/models/llama/llama_config.hpp b/csrc/models/llama/llama_config.hpp index 0db2bcc8..fe5ba7e9 100644 --- a/csrc/models/llama/llama_config.hpp +++ b/csrc/models/llama/llama_config.hpp @@ -93,8 +93,6 @@ struct LlamaConfig : public InfinilmModel::Config { } return true; } - - nlohmann::json config_json; }; } // namespace infinilm::models::llama diff --git a/csrc/models/llama/llama_decoder_layer.cpp b/csrc/models/llama/llama_decoder_layer.cpp index 7958820c..c26cdf91 100644 --- a/csrc/models/llama/llama_decoder_layer.cpp +++ b/csrc/models/llama/llama_decoder_layer.cpp @@ -6,22 +6,21 @@ namespace infinilm::models::llama { -LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config, - const infinicore::Device &device, +LlamaDecoderLayer::LlamaDecoderLayer(const infinicore::Device &device, size_t layer_idx, engine::distributed::RankInfo rank_info, std::shared_ptr global_config) : layer_idx_(layer_idx), rank_info_(rank_info), global_config_(global_config) { - const auto &dtype{config.dtype}; + const auto &dtype{global_config_->get_dtype()}; // Initialize layer normalization layers - INFINICORE_NN_MODULE_INIT(input_layernorm, config.hidden_size, config.rms_norm_eps, + INFINICORE_NN_MODULE_INIT(input_layernorm, global_config_->get("hidden_size"), global_config_->get("rms_norm_eps"), dtype, device); - INFINICORE_NN_MODULE_INIT(post_attention_layernorm, config.hidden_size, config.rms_norm_eps, + INFINICORE_NN_MODULE_INIT(post_attention_layernorm, global_config_->get("hidden_size"), global_config_->get("rms_norm_eps"), dtype, device); // Initialize attention and MLP modules - INFINICORE_NN_MODULE_INIT(self_attn, config, device, layer_idx, rank_info_, global_config); - INFINICORE_NN_MODULE_INIT(mlp, config, device, rank_info_, global_config); + INFINICORE_NN_MODULE_INIT(self_attn, device, layer_idx, rank_info_, global_config); + INFINICORE_NN_MODULE_INIT(mlp, device, rank_info_, global_config); } std::tuple diff --git a/csrc/models/llama/llama_decoder_layer.hpp b/csrc/models/llama/llama_decoder_layer.hpp index 2198afb6..cb6bc6ac 100644 --- a/csrc/models/llama/llama_decoder_layer.hpp +++ b/csrc/models/llama/llama_decoder_layer.hpp @@ -33,8 +33,7 @@ class LlamaDecoderLayer : public infinicore::nn::Module { * @param layer_idx Layer index for cache management and debugging * @param dtype Optional data type 
for model parameters (defaults to F32) */ - LlamaDecoderLayer(const LlamaConfig &config, - const infinicore::Device &device, + LlamaDecoderLayer(const infinicore::Device &device, size_t layer_idx, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), std::shared_ptr global_config = nullptr); diff --git a/csrc/models/llama/llama_for_causal_lm.cpp b/csrc/models/llama/llama_for_causal_lm.cpp index 8fdcffb4..3587a029 100644 --- a/csrc/models/llama/llama_for_causal_lm.cpp +++ b/csrc/models/llama/llama_for_causal_lm.cpp @@ -6,23 +6,23 @@ namespace infinilm::models::llama { -LlamaForCausalLM::LlamaForCausalLM(const LlamaConfig &config, - const infinicore::Device &device, +LlamaForCausalLM::LlamaForCausalLM(const infinicore::Device &device, engine::distributed::RankInfo rank_info, std::shared_ptr global_config) { // Initialize module's device_ member device_ = device; - const auto &dtype{config.dtype}; + const auto &dtype{global_config->get_dtype()}; // Initialize base model - INFINICORE_NN_MODULE_INIT(model, config, device, rank_info, global_config); + INFINICORE_NN_MODULE_INIT(model, device, rank_info, global_config); // Initialize language modeling head // Note: If tie_word_embeddings is true, we would share weights with embed_tokens // For now, we create a separate linear layer - INFINICORE_NN_MODULE_INIT(lm_head, config.hidden_size, config.vocab_size, false, + + INFINICORE_NN_MODULE_INIT(lm_head, global_config->get("hidden_size"), global_config->get("vocab_size"), false, dtype, device); } diff --git a/csrc/models/llama/llama_for_causal_lm.hpp b/csrc/models/llama/llama_for_causal_lm.hpp index 5d260230..43270a9d 100644 --- a/csrc/models/llama/llama_for_causal_lm.hpp +++ b/csrc/models/llama/llama_for_causal_lm.hpp @@ -28,8 +28,7 @@ class LlamaForCausalLM : public InfinilmModel { * @param config Model configuration * @param device Device to create tensors on */ - LlamaForCausalLM(const LlamaConfig &config, - const infinicore::Device &device, + LlamaForCausalLM(const infinicore::Device &device, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), std::shared_ptr global_config = nullptr); @@ -46,7 +45,7 @@ class LlamaForCausalLM : public InfinilmModel { const cache::CacheConfig *get_cache_config() const override; // Module information - const LlamaConfig &config() const { return model_->config(); } + // const LlamaConfig &config() const { return model_->config(); } LlamaModel &model() { return *model_; } const LlamaModel &model() const { return *model_; } diff --git a/csrc/models/llama/llama_mlp.cpp b/csrc/models/llama/llama_mlp.cpp index 3f457d82..1f4ee436 100644 --- a/csrc/models/llama/llama_mlp.cpp +++ b/csrc/models/llama/llama_mlp.cpp @@ -5,14 +5,13 @@ namespace infinilm::models::llama { -LlamaMLP::LlamaMLP(const LlamaConfig &config, - const infinicore::Device &device, +LlamaMLP::LlamaMLP(const infinicore::Device &device, engine::distributed::RankInfo rank_info, std::shared_ptr global_config) - : hidden_size_(config.hidden_size), - intermediate_size_(config.intermediate_size), - use_bias_(config.mlp_bias), rank_info_(rank_info), global_config_(global_config) { - const auto &dtype{config.dtype}; + : hidden_size_(global_config->get("hidden_size")), + intermediate_size_(global_config->get("intermediate_size")), + use_bias_(global_config->get_or("mlp_bias", false)), rank_info_(rank_info), global_config_(global_config) { + const auto &dtype{global_config_->get_dtype()}; int tp_rank = rank_info.tp_rank; int tp_size = rank_info.tp_size; diff --git 
a/csrc/models/llama/llama_mlp.hpp b/csrc/models/llama/llama_mlp.hpp index 42eacc1e..38249cb3 100644 --- a/csrc/models/llama/llama_mlp.hpp +++ b/csrc/models/llama/llama_mlp.hpp @@ -34,8 +34,7 @@ class LlamaMLP : public infinicore::nn::Module { * @param device Device to create tensors on * @param dtype Optional data type for model parameters (defaults to F32) */ - LlamaMLP(const LlamaConfig &config, - const infinicore::Device &device, + LlamaMLP(const infinicore::Device &device, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), std::shared_ptr global_config = nullptr); diff --git a/csrc/models/llama/llama_model.cpp b/csrc/models/llama/llama_model.cpp index e71cc55b..946e45f1 100644 --- a/csrc/models/llama/llama_model.cpp +++ b/csrc/models/llama/llama_model.cpp @@ -7,34 +7,33 @@ namespace infinilm::models::llama { -LlamaModel::LlamaModel(const LlamaConfig &config, - const infinicore::Device &device, +LlamaModel::LlamaModel(const infinicore::Device &device, engine::distributed::RankInfo rank_info, std::shared_ptr global_config) - : config_(config), rank_info_(rank_info), global_config_(global_config) { - const auto &dtype{config.dtype}; + : rank_info_(rank_info), global_config_(global_config) { + const auto &dtype{global_config_->get_dtype()}; // Initialize token embeddings - INFINICORE_NN_MODULE_INIT(embed_tokens, config.vocab_size, config.hidden_size, + INFINICORE_NN_MODULE_INIT(embed_tokens, global_config_->get("vocab_size"), global_config_->get("hidden_size"), std::nullopt, dtype, device); // Initialize decoder layers with layer indices // TODO: Update INFINICORE_NN_MODULE_VEC_INIT macro to support per-layer constructor arguments // (e.g., via a factory function or lambda that receives the layer index) // Currently, we can't use the macro because each layer needs a different layer_idx - layers_.reserve(config.num_hidden_layers); - for (size_t i = 0; i < config.num_hidden_layers; ++i) { + layers_.reserve(global_config_->get("num_hidden_layers")); + for (size_t i = 0; i < global_config_->get("num_hidden_layers"); ++i) { layers_.push_back(this->register_module( - "layers." + std::to_string(i), config, device, i, rank_info, global_config_)); + "layers." 
+ std::to_string(i), device, i, rank_info, global_config_)); } // Initialize final layer normalization - INFINICORE_NN_MODULE_INIT(norm, config.hidden_size, config.rms_norm_eps, + INFINICORE_NN_MODULE_INIT(norm, global_config_->get("hidden_size"), global_config_->get("rms_norm_eps"), dtype, device); // Initialize Rotary Position Embeddings (shared across all layers) // Use GPT-J-style inverse frequencies (default) and GPT_NEOX rotation pairing - INFINICORE_NN_MODULE_INIT(rotary_emb, config.head_dim, config.max_position_embeddings, - config.rope_theta, infinicore::nn::RoPE::Algo::GPT_NEOX, - dtype, device, config.rope_scaling); + INFINICORE_NN_MODULE_INIT(rotary_emb, global_config_->get_head_dim(), global_config_->get("max_position_embeddings"), + global_config_->get("rope_theta"), infinicore::nn::RoPE::Algo::GPT_NEOX, + dtype, device, global_config_->get_rope_scaling()); for (auto &layer : layers_) { if (layer) { @@ -81,24 +80,25 @@ void LlamaModel::reset_cache(const cache::CacheConfig *cache_config) { } if (auto kv_cache_config = dynamic_cast(cache_config)) { kv_cache_ = std::make_shared( - config_.head_dim, - config_.head_dim, - config_.num_key_value_heads, - config_.num_key_value_heads, - config_.num_hidden_layers, - config_.max_position_embeddings, - config_.dtype, + global_config_->get("hidden_size") / global_config_->get("num_attention_heads"), + global_config_->get("hidden_size") / global_config_->get("num_attention_heads"), + global_config_->get("num_key_value_heads"), + global_config_->get("num_key_value_heads"), + global_config_->get("num_hidden_layers"), + global_config_->get("max_position_embeddings"), + // config_.dtype, + global_config_->get_dtype(), *kv_cache_config, rank_info_); - } else if (auto paged_kv_cache_config = dynamic_cast(cache_config)) { kv_cache_ = std::make_shared( - config_.head_dim, - config_.head_dim, - config_.num_key_value_heads, - config_.num_key_value_heads, - config_.num_hidden_layers, - config_.dtype, + global_config_->get("hidden_size") / global_config_->get("num_attention_heads"), + global_config_->get("hidden_size") / global_config_->get("num_attention_heads"), + global_config_->get("num_key_value_heads"), + global_config_->get("num_key_value_heads"), + global_config_->get("num_hidden_layers"), + // config_.dtype, + global_config_->get_dtype(), *paged_kv_cache_config, rank_info_); } else { diff --git a/csrc/models/llama/llama_model.hpp b/csrc/models/llama/llama_model.hpp index b43fa542..422c1bd6 100644 --- a/csrc/models/llama/llama_model.hpp +++ b/csrc/models/llama/llama_model.hpp @@ -37,8 +37,7 @@ class LlamaModel : public infinicore::nn::Module { * @param device Device to create tensors on * @param dtype Optional data type for model parameters (defaults to F32) */ - LlamaModel(const LlamaConfig &config, - const infinicore::Device &device, + LlamaModel(const infinicore::Device &device, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), std::shared_ptr global_config = nullptr); @@ -64,8 +63,7 @@ class LlamaModel : public infinicore::nn::Module { void reset_cache(const cache::CacheConfig *cache_config); // Module information - const LlamaConfig &config() const { return config_; } - size_t num_layers() const { return config_.num_hidden_layers; } + size_t num_layers() const { return global_config_->get("num_hidden_layers"); } protected: // Token embeddings @@ -85,7 +83,6 @@ class LlamaModel : public infinicore::nn::Module { std::shared_ptr kv_cache_; private: - LlamaConfig config_; std::shared_ptr global_config_; }; diff --git 
a/csrc/models/model_factory.cpp b/csrc/models/model_factory.cpp index cf783fb8..b4fd634a 100644 --- a/csrc/models/model_factory.cpp +++ b/csrc/models/model_factory.cpp @@ -3,16 +3,16 @@ namespace infinilm { std::shared_ptr InfinilmModelFactory::createModel( - const InfinilmModel::Config &config, engine::distributed::RankInfo rank_info, const cache::CacheConfig *cache, std::shared_ptr global_config) { std::shared_ptr model; - if (const auto llama_config_ptr = dynamic_cast(&config)) { - const auto &llama_config = *llama_config_ptr; + //****************************NEED TO BE FIXED */ + if (true) { + // const auto &llama_config = *llama_config_ptr; model = std::make_shared( - llama_config, rank_info.device, rank_info, global_config); + rank_info.device, rank_info, global_config); } else { throw std::invalid_argument("InfinilmModelFactory::createModel: Unsupported model config type"); } diff --git a/csrc/models/model_factory.hpp b/csrc/models/model_factory.hpp index fcf60708..c020f6a5 100644 --- a/csrc/models/model_factory.hpp +++ b/csrc/models/model_factory.hpp @@ -9,7 +9,7 @@ namespace infinilm { class InfinilmModelFactory { public: static std::shared_ptr createModel( - const InfinilmModel::Config &config, + // const InfinilmModel::Config &config, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), const cache::CacheConfig *cache = nullptr, std::shared_ptr global_config = nullptr); diff --git a/csrc/pybind11/engine/engine.hpp b/csrc/pybind11/engine/engine.hpp index c64b9905..7a4b5a99 100644 --- a/csrc/pybind11/engine/engine.hpp +++ b/csrc/pybind11/engine/engine.hpp @@ -32,21 +32,21 @@ inline void bind_infer_engine(py::module &m) { py::class_> infer_engine(m, "InferEngine"); infer_engine .def(py::init([]( - const InfinilmModel::Config &cfg, + // const InfinilmModel::Config &cfg, const distributed::DistConfig &dist, infinicore::Device::Type dev, std::shared_ptr cache_cfg, const std::string &modle_path, bool enable_graph_compiling) { return std::make_shared( - cfg, + // cfg, dist, dev, cache_cfg ? 
cache_cfg.get() : nullptr, modle_path, enable_graph_compiling); }), - py::arg("config"), + // py::arg("config"), py::arg("distributed_config") = distributed::DistConfig(), py::arg("device_type") = infinicore::context::getDevice().getType(), py::arg("cache_config") = py::none(), diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py index fa51e4fe..e28e9a7c 100644 --- a/python/infinilm/infer_engine.py +++ b/python/infinilm/infer_engine.py @@ -36,7 +36,7 @@ def __init__( device = infinicore.device() super().__init__( - self.config, + # self.config, distributed_config._underlying, device._underlying.type, cache_config, From fae9285f519173e8826bda722d50371a3b02d198 Mon Sep 17 00:00:00 2001 From: qinyiqun Date: Fri, 23 Jan 2026 11:20:41 +0800 Subject: [PATCH 08/11] Follow the latest InfiniLM code changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- csrc/config/global_config.hpp | 3 +++ csrc/models/llama/llama_attention.cpp | 14 ++++++++++---- csrc/models/llama/llama_attention.hpp | 1 - csrc/models/llama/llama_model.cpp | 10 ++++------ 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/csrc/config/global_config.hpp b/csrc/config/global_config.hpp index dac30565..e8be1ec2 100644 --- a/csrc/config/global_config.hpp +++ b/csrc/config/global_config.hpp @@ -46,6 +46,9 @@ struct GlobalConfig { return get("hidden_size") * get("num_key_value_heads") / get("num_attention_heads"); } size_t get_head_dim() const { + if (config_json.contains("head_dim")) { + return get("head_dim"); + } return get("hidden_size") / get("num_attention_heads"); } diff --git a/csrc/models/llama/llama_attention.cpp b/csrc/models/llama/llama_attention.cpp index 879ead0e..f1d85ea9 100644 --- a/csrc/models/llama/llama_attention.cpp +++ b/csrc/models/llama/llama_attention.cpp @@ -54,7 +54,9 @@ LlamaAttention::LlamaAttention(const infinicore::Device &device, INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, global_config_->get("num_attention_heads"), global_config_->get("num_key_value_heads"), use_bias_, dtype, device, rank_info, quant_scheme); - INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_output_bias_, + // INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_output_bias_, + // dtype, device, tp_rank, tp_size, rank_info.comm, quant_scheme); + INFINICORE_NN_MODULE_INIT(o_proj, global_config_->get("num_attention_heads") * head_dim_, hidden_size_, use_output_bias_, dtype, device, tp_rank, tp_size, rank_info.comm, quant_scheme); break; @@ -62,10 +64,14 @@ LlamaAttention::LlamaAttention(const infinicore::Device &device, INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, global_config_->get("num_attention_heads"), global_config_->get("num_key_value_heads"), use_bias_, dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_output_bias_, + INFINICORE_NN_MODULE_INIT(o_proj, global_config_->get("num_attention_heads") * head_dim_, hidden_size_, use_output_bias_, dtype, device, tp_rank, tp_size, rank_info.comm); break; } + if (global_config_->get("model_type") == "qwen3") { + INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, global_config_->get("rms_norm_eps"), dtype, device); + INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, global_config_->get("rms_norm_eps"), dtype, device); + } } infinicore::Tensor LlamaAttention::forward_(const
infinicore::Tensor &hidden_states, @@ -82,7 +88,7 @@ infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_sta // 1. Project Q, K, V auto [q, k, v] = qkv_proj_->forward_split(hidden_states_mutable); - if (use_qk_norm_) { + if (global_config_->get("model_type") == "qwen3") { q = q_norm_->forward(q->view({batch_size * seq_len, num_attention_heads_, head_dim_})); k = k_norm_->forward(k->view({batch_size * seq_len, num_key_value_heads_, head_dim_})); } @@ -206,7 +212,7 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd auto k_reshaped = k->view({seq_len, num_key_value_heads_, head_dim_}); auto v_reshaped = v->view({seq_len, num_key_value_heads_, head_dim_}); - if (use_qk_norm_) { + if (global_config_->get("model_type") == "qwen3") { q_reshaped = q_norm_->forward(q_reshaped); k_reshaped = k_norm_->forward(k_reshaped); } diff --git a/csrc/models/llama/llama_attention.hpp b/csrc/models/llama/llama_attention.hpp index 231169b0..17f6f95e 100644 --- a/csrc/models/llama/llama_attention.hpp +++ b/csrc/models/llama/llama_attention.hpp @@ -110,7 +110,6 @@ class LlamaAttention : public infinicore::nn::Module { size_t kv_dim_; bool use_bias_; // Bias for Q/K/V projections bool use_output_bias_; // Bias for output projection (o_proj) - bool use_qk_norm_; // Whether to use QK RMSNorm size_t max_position_embeddings_; // For cache initialization (deprecated, kept for compatibility) float scaling_; diff --git a/csrc/models/llama/llama_model.cpp b/csrc/models/llama/llama_model.cpp index 946e45f1..182a7a5c 100644 --- a/csrc/models/llama/llama_model.cpp +++ b/csrc/models/llama/llama_model.cpp @@ -80,24 +80,22 @@ void LlamaModel::reset_cache(const cache::CacheConfig *cache_config) { } if (auto kv_cache_config = dynamic_cast(cache_config)) { kv_cache_ = std::make_shared( - global_config_->get("hidden_size") / global_config_->get("num_attention_heads"), - global_config_->get("hidden_size") / global_config_->get("num_attention_heads"), + global_config_->get_head_dim(), + global_config_->get_head_dim(), global_config_->get("num_key_value_heads"), global_config_->get("num_key_value_heads"), global_config_->get("num_hidden_layers"), global_config_->get("max_position_embeddings"), - // config_.dtype, global_config_->get_dtype(), *kv_cache_config, rank_info_); } else if (auto paged_kv_cache_config = dynamic_cast(cache_config)) { kv_cache_ = std::make_shared( - global_config_->get("hidden_size") / global_config_->get("num_attention_heads"), - global_config_->get("hidden_size") / global_config_->get("num_attention_heads"), + global_config_->get_head_dim(), + global_config_->get_head_dim(), global_config_->get("num_key_value_heads"), global_config_->get("num_key_value_heads"), global_config_->get("num_hidden_layers"), - // config_.dtype, global_config_->get_dtype(), *paged_kv_cache_config, rank_info_); From a0435d4315b7ed6953979245d707ba38d944c6bc Mon Sep 17 00:00:00 2001 From: qinyiqun Date: Mon, 26 Jan 2026 13:54:51 +0800 Subject: [PATCH 09/11] Adjust function parameter order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- csrc/config/global_config.hpp | 1 - csrc/engine/rank_worker.cpp | 2 +- csrc/models/llama/llama.hpp | 1 - csrc/models/llama/llama_attention.cpp | 14 ++++++-------- csrc/models/llama/llama_attention.hpp | 8 ++++---- csrc/models/llama/llama_decoder_layer.cpp | 10 +++++----- csrc/models/llama/llama_decoder_layer.hpp | 6 +++---
csrc/models/llama/llama_for_causal_lm.cpp | 8 ++++---- csrc/models/llama/llama_for_causal_lm.hpp | 7 +++---- csrc/models/llama/llama_mlp.cpp | 10 +++++----- csrc/models/llama/llama_mlp.hpp | 6 +++--- csrc/models/llama/llama_model.cpp | 10 +++++----- csrc/models/llama/llama_model.hpp | 6 +++--- csrc/models/model_factory.cpp | 6 +++--- csrc/models/model_factory.hpp | 5 ++--- .../infinilm/models/llama/configuration_llama.py | 1 + 16 files changed, 48 insertions(+), 53 deletions(-) diff --git a/csrc/config/global_config.hpp b/csrc/config/global_config.hpp index e8be1ec2..b82f716d 100644 --- a/csrc/config/global_config.hpp +++ b/csrc/config/global_config.hpp @@ -1,6 +1,5 @@ #pragma once -// #include "infinicore/nn/quantization.hpp" #include "infinicore/nn/rope.hpp" #include "infinicore/ops.hpp" #include "quant_config.hpp" diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp index be8ab1f4..1a7643aa 100644 --- a/csrc/engine/rank_worker.cpp +++ b/csrc/engine/rank_worker.cpp @@ -195,7 +195,7 @@ void RankWorker::thread_loop() { infinicore::context::setDevice(rank_info_.device); // Create model using factory (may be expensive) - model_ = InfinilmModelFactory::createModel(rank_info_, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr, global_config_); + model_ = InfinilmModelFactory::createModel(global_config_, rank_info_, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr); if (!model_) { throw std::runtime_error("Failed to create model"); } diff --git a/csrc/models/llama/llama.hpp b/csrc/models/llama/llama.hpp index eebac92b..d1d5de85 100644 --- a/csrc/models/llama/llama.hpp +++ b/csrc/models/llama/llama.hpp @@ -18,7 +18,6 @@ #include "../../config/global_config.hpp" #include "llama_attention.hpp" -#include "llama_config.hpp" #include "llama_decoder_layer.hpp" #include "llama_for_causal_lm.hpp" #include "llama_mlp.hpp" diff --git a/csrc/models/llama/llama_attention.cpp b/csrc/models/llama/llama_attention.cpp index f1d85ea9..28af6707 100644 --- a/csrc/models/llama/llama_attention.cpp +++ b/csrc/models/llama/llama_attention.cpp @@ -17,11 +17,12 @@ namespace infinilm::models::llama { -LlamaAttention::LlamaAttention(const infinicore::Device &device, +LlamaAttention::LlamaAttention(std::shared_ptr global_config, + const infinicore::Device &device, size_t layer_idx, - engine::distributed::RankInfo rank_info, - std::shared_ptr global_config) - : layer_idx_(layer_idx), + engine::distributed::RankInfo rank_info) + : global_config_(global_config), + layer_idx_(layer_idx), hidden_size_(global_config->get("hidden_size")), num_attention_heads_(global_config->get("num_attention_heads")), num_key_value_heads_(global_config->get("num_key_value_heads")), @@ -30,8 +31,7 @@ LlamaAttention::LlamaAttention(const infinicore::Device &device, use_bias_(global_config->get_or("attention_bias", true)), use_output_bias_(global_config->get_or("attention_output_bias", false)), max_position_embeddings_(global_config->get("max_position_embeddings")), - rank_info_(rank_info), - global_config_(global_config) { + rank_info_(rank_info) { const auto &dtype{global_config_->get_dtype()}; int tp_rank = rank_info.tp_rank; @@ -54,8 +54,6 @@ LlamaAttention::LlamaAttention(const infinicore::Device &device, INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, global_config_->get("num_attention_heads"), global_config_->get("num_key_value_heads"), use_bias_, dtype, device, rank_info, quant_scheme); - // INFINICORE_NN_MODULE_INIT(o_proj, 
hidden_size_, hidden_size_, use_output_bias_, - // dtype, device, tp_rank, tp_size, rank_info.comm, quant_scheme); INFINICORE_NN_MODULE_INIT(o_proj, global_config_->get("num_attention_heads") * head_dim_, hidden_size_, use_output_bias_, dtype, device, tp_rank, tp_size, rank_info.comm, quant_scheme); break; diff --git a/csrc/models/llama/llama_attention.hpp b/csrc/models/llama/llama_attention.hpp index 17f6f95e..20df89df 100644 --- a/csrc/models/llama/llama_attention.hpp +++ b/csrc/models/llama/llama_attention.hpp @@ -37,10 +37,10 @@ class LlamaAttention : public infinicore::nn::Module { * @param layer_idx Layer index for cache access * @param dtype Optional data type for model parameters (defaults to F32) */ - LlamaAttention(const infinicore::Device &device, + LlamaAttention(std::shared_ptr global_config, + const infinicore::Device &device, size_t layer_idx, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - std::shared_ptr global_config = nullptr); + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); /** * @brief Forward pass: compute attention @@ -102,6 +102,7 @@ class LlamaAttention : public infinicore::nn::Module { std::shared_ptr rotary_emb_; private: + std::shared_ptr global_config_; size_t layer_idx_; // Layer index for cache access size_t hidden_size_; size_t num_attention_heads_; @@ -113,7 +114,6 @@ class LlamaAttention : public infinicore::nn::Module { size_t max_position_embeddings_; // For cache initialization (deprecated, kept for compatibility) float scaling_; - std::shared_ptr global_config_; }; } // namespace infinilm::models::llama diff --git a/csrc/models/llama/llama_decoder_layer.cpp b/csrc/models/llama/llama_decoder_layer.cpp index c26cdf91..b5d1b82d 100644 --- a/csrc/models/llama/llama_decoder_layer.cpp +++ b/csrc/models/llama/llama_decoder_layer.cpp @@ -6,10 +6,10 @@ namespace infinilm::models::llama { -LlamaDecoderLayer::LlamaDecoderLayer(const infinicore::Device &device, +LlamaDecoderLayer::LlamaDecoderLayer(std::shared_ptr global_config, + const infinicore::Device &device, size_t layer_idx, - engine::distributed::RankInfo rank_info, - std::shared_ptr global_config) : layer_idx_(layer_idx), rank_info_(rank_info), global_config_(global_config) { + engine::distributed::RankInfo rank_info) : global_config_(global_config), layer_idx_(layer_idx), rank_info_(rank_info) { const auto &dtype{global_config_->get_dtype()}; // Initialize layer normalization layers @@ -19,8 +19,8 @@ LlamaDecoderLayer::LlamaDecoderLayer(const infinicore::Device &device, dtype, device); // Initialize attention and MLP modules - INFINICORE_NN_MODULE_INIT(self_attn, device, layer_idx, rank_info_, global_config); - INFINICORE_NN_MODULE_INIT(mlp, device, rank_info_, global_config); + INFINICORE_NN_MODULE_INIT(self_attn, global_config, device, layer_idx, rank_info_); + INFINICORE_NN_MODULE_INIT(mlp, global_config, device, rank_info_); } std::tuple diff --git a/csrc/models/llama/llama_decoder_layer.hpp b/csrc/models/llama/llama_decoder_layer.hpp index cb6bc6ac..f632645b 100644 --- a/csrc/models/llama/llama_decoder_layer.hpp +++ b/csrc/models/llama/llama_decoder_layer.hpp @@ -33,10 +33,10 @@ class LlamaDecoderLayer : public infinicore::nn::Module { * @param layer_idx Layer index for cache management and debugging * @param dtype Optional data type for model parameters (defaults to F32) */ - LlamaDecoderLayer(const infinicore::Device &device, + LlamaDecoderLayer(std::shared_ptr global_config, + const infinicore::Device &device, size_t layer_idx, - 
engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - std::shared_ptr global_config = nullptr); + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); /** * @brief Forward pass: process one decoder layer diff --git a/csrc/models/llama/llama_for_causal_lm.cpp b/csrc/models/llama/llama_for_causal_lm.cpp index 3587a029..c8e261a7 100644 --- a/csrc/models/llama/llama_for_causal_lm.cpp +++ b/csrc/models/llama/llama_for_causal_lm.cpp @@ -6,9 +6,9 @@ namespace infinilm::models::llama { -LlamaForCausalLM::LlamaForCausalLM(const infinicore::Device &device, - engine::distributed::RankInfo rank_info, - std::shared_ptr global_config) { +LlamaForCausalLM::LlamaForCausalLM(std::shared_ptr global_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) { // Initialize module's device_ member device_ = device; @@ -16,7 +16,7 @@ LlamaForCausalLM::LlamaForCausalLM(const infinicore::Device &device, const auto &dtype{global_config->get_dtype()}; // Initialize base model - INFINICORE_NN_MODULE_INIT(model, device, rank_info, global_config); + INFINICORE_NN_MODULE_INIT(model, global_config, device, rank_info); // Initialize language modeling head // Note: If tie_word_embeddings is true, we would share weights with embed_tokens diff --git a/csrc/models/llama/llama_for_causal_lm.hpp b/csrc/models/llama/llama_for_causal_lm.hpp index 43270a9d..1609a59a 100644 --- a/csrc/models/llama/llama_for_causal_lm.hpp +++ b/csrc/models/llama/llama_for_causal_lm.hpp @@ -28,9 +28,9 @@ class LlamaForCausalLM : public InfinilmModel { * @param config Model configuration * @param device Device to create tensors on */ - LlamaForCausalLM(const infinicore::Device &device, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - std::shared_ptr global_config = nullptr); + LlamaForCausalLM(std::shared_ptr global_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); /** * @brief Forward pass: compute language modeling logits @@ -45,7 +45,6 @@ class LlamaForCausalLM : public InfinilmModel { const cache::CacheConfig *get_cache_config() const override; // Module information - // const LlamaConfig &config() const { return model_->config(); } LlamaModel &model() { return *model_; } const LlamaModel &model() const { return *model_; } diff --git a/csrc/models/llama/llama_mlp.cpp b/csrc/models/llama/llama_mlp.cpp index 1f4ee436..30b26a6b 100644 --- a/csrc/models/llama/llama_mlp.cpp +++ b/csrc/models/llama/llama_mlp.cpp @@ -5,12 +5,12 @@ namespace infinilm::models::llama { -LlamaMLP::LlamaMLP(const infinicore::Device &device, - engine::distributed::RankInfo rank_info, - std::shared_ptr global_config) - : hidden_size_(global_config->get("hidden_size")), +LlamaMLP::LlamaMLP(std::shared_ptr global_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : global_config_(global_config), hidden_size_(global_config->get("hidden_size")), intermediate_size_(global_config->get("intermediate_size")), - use_bias_(global_config->get_or("mlp_bias", false)), rank_info_(rank_info), global_config_(global_config) { + use_bias_(global_config->get_or("mlp_bias", false)), rank_info_(rank_info) { const auto &dtype{global_config_->get_dtype()}; int tp_rank = rank_info.tp_rank; diff --git a/csrc/models/llama/llama_mlp.hpp b/csrc/models/llama/llama_mlp.hpp index 38249cb3..af7467a0 100644 --- a/csrc/models/llama/llama_mlp.hpp +++ b/csrc/models/llama/llama_mlp.hpp @@ -34,9 +34,9 
@@ class LlamaMLP : public infinicore::nn::Module { * @param device Device to create tensors on * @param dtype Optional data type for model parameters (defaults to F32) */ - LlamaMLP(const infinicore::Device &device, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - std::shared_ptr global_config = nullptr); + LlamaMLP(std::shared_ptr global_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); /** * @brief Forward pass: compute MLP output diff --git a/csrc/models/llama/llama_model.cpp b/csrc/models/llama/llama_model.cpp index 182a7a5c..94b45e4b 100644 --- a/csrc/models/llama/llama_model.cpp +++ b/csrc/models/llama/llama_model.cpp @@ -7,10 +7,10 @@ namespace infinilm::models::llama { -LlamaModel::LlamaModel(const infinicore::Device &device, - engine::distributed::RankInfo rank_info, - std::shared_ptr global_config) - : rank_info_(rank_info), global_config_(global_config) { +LlamaModel::LlamaModel(std::shared_ptr global_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : global_config_(global_config), rank_info_(rank_info) { const auto &dtype{global_config_->get_dtype()}; // Initialize token embeddings INFINICORE_NN_MODULE_INIT(embed_tokens, global_config_->get("vocab_size"), global_config_->get("hidden_size"), @@ -22,7 +22,7 @@ LlamaModel::LlamaModel(const infinicore::Device &device, layers_.reserve(global_config_->get("num_hidden_layers")); for (size_t i = 0; i < global_config_->get("num_hidden_layers"); ++i) { layers_.push_back(this->register_module( - "layers." + std::to_string(i), device, i, rank_info, global_config_)); + "layers." + std::to_string(i), global_config_, device, i, rank_info)); } // Initialize final layer normalization diff --git a/csrc/models/llama/llama_model.hpp b/csrc/models/llama/llama_model.hpp index 422c1bd6..3a54d0a8 100644 --- a/csrc/models/llama/llama_model.hpp +++ b/csrc/models/llama/llama_model.hpp @@ -37,9 +37,9 @@ class LlamaModel : public infinicore::nn::Module { * @param device Device to create tensors on * @param dtype Optional data type for model parameters (defaults to F32) */ - LlamaModel(const infinicore::Device &device, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - std::shared_ptr global_config = nullptr); + LlamaModel(std::shared_ptr global_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); /** * @brief Forward pass: process input through the model diff --git a/csrc/models/model_factory.cpp b/csrc/models/model_factory.cpp index b4fd634a..4af23fa8 100644 --- a/csrc/models/model_factory.cpp +++ b/csrc/models/model_factory.cpp @@ -3,16 +3,16 @@ namespace infinilm { std::shared_ptr InfinilmModelFactory::createModel( + std::shared_ptr global_config, engine::distributed::RankInfo rank_info, - const cache::CacheConfig *cache, - std::shared_ptr global_config) { + const cache::CacheConfig *cache) { std::shared_ptr model; //****************************NEED TO BE FIXED */ if (true) { // const auto &llama_config = *llama_config_ptr; model = std::make_shared( - rank_info.device, rank_info, global_config); + global_config, rank_info.device, rank_info); } else { throw std::invalid_argument("InfinilmModelFactory::createModel: Unsupported model config type"); } diff --git a/csrc/models/model_factory.hpp b/csrc/models/model_factory.hpp index c020f6a5..4bd566e3 100644 --- a/csrc/models/model_factory.hpp +++ 
b/csrc/models/model_factory.hpp @@ -9,9 +9,8 @@ namespace infinilm { class InfinilmModelFactory { public: static std::shared_ptr createModel( - // const InfinilmModel::Config &config, + std::shared_ptr global_config, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - const cache::CacheConfig *cache = nullptr, - std::shared_ptr global_config = nullptr); + const cache::CacheConfig *cache = nullptr); }; } // namespace infinilm diff --git a/python/infinilm/models/llama/configuration_llama.py b/python/infinilm/models/llama/configuration_llama.py index 8d07a657..5dce2b04 100644 --- a/python/infinilm/models/llama/configuration_llama.py +++ b/python/infinilm/models/llama/configuration_llama.py @@ -21,6 +21,7 @@ from ...configuration_utils import PretrainedConfig + class LlamaConfig(PretrainedConfig, _infinilm.LlamaConfig): r""" This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA From ea019e05cd35cdbbdf2ce4b2b1c5fa62062540b0 Mon Sep 17 00:00:00 2001 From: qinyiqun Date: Tue, 27 Jan 2026 10:27:31 +0800 Subject: [PATCH 10/11] Rename global config to model config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../{global_config.cpp => model_config.cpp} | 16 +++--- .../{global_config.hpp => model_config.hpp} | 19 +++---- csrc/config/quant_config.cpp | 4 +- csrc/config/quant_config.hpp | 4 +- csrc/engine/infer_engine.cpp | 7 ++- csrc/engine/infer_engine.hpp | 7 ++- csrc/engine/rank_worker.cpp | 8 +-- csrc/engine/rank_worker.hpp | 8 +-- csrc/models/llama/llama.hpp | 2 +- csrc/models/llama/llama_attention.cpp | 47 +++++++++--------- csrc/models/llama/llama_attention.hpp | 6 +-- csrc/models/llama/llama_decoder_layer.cpp | 16 +++--- csrc/models/llama/llama_decoder_layer.hpp | 4 +- csrc/models/llama/llama_for_causal_lm.cpp | 10 ++-- csrc/models/llama/llama_for_causal_lm.hpp | 2 +- csrc/models/llama/llama_mlp.cpp | 15 +++--- csrc/models/llama/llama_mlp.hpp | 6 +-- csrc/models/llama/llama_model.cpp | 49 +++++++++---------- csrc/models/llama/llama_model.hpp | 6 +-- csrc/models/model_factory.cpp | 4 +- csrc/models/model_factory.hpp | 4 +- csrc/pybind11/engine/engine.hpp | 7 +-- python/infinilm/infer_engine.py | 2 - .../models/llama/configuration_llama.py | 2 +- 24 files changed, 121 insertions(+), 134 deletions(-) rename csrc/config/{global_config.cpp => model_config.cpp} (88%) rename csrc/config/{global_config.hpp => model_config.hpp} (77%) diff --git a/csrc/config/global_config.cpp b/csrc/config/model_config.cpp similarity index 88% rename from csrc/config/global_config.cpp rename to csrc/config/model_config.cpp index 2f3ce308..ec15967a 100644 --- a/csrc/config/global_config.cpp +++ b/csrc/config/model_config.cpp @@ -1,7 +1,7 @@ -#include "global_config.hpp" +#include "model_config.hpp" -namespace infinilm::config::global_config { -GlobalConfig::GlobalConfig(const std::string &path) { +namespace infinilm::config { +ModelConfig::ModelConfig(const std::string &path) { std::ifstream file(path); if (file.is_open()) { file >> config_json; @@ -9,11 +9,11 @@ GlobalConfig::GlobalConfig(const std::string &path) { } else { throw std::runtime_error("Could not open config file: " + path); } - this->quant_config = quantization::QuantConfig(config_json["quantization_config"]); + this->quant_config = QuantConfig(config_json["quantization_config"]); } infinicore::nn::QuantScheme -GlobalConfig::get_quant_scheme() const {
+ModelConfig::get_quant_scheme() const { if (quant_config.get_quant_scheme() != infinicore::nn::QuantScheme::NONE) { return quant_config.get_quant_scheme(); } else { @@ -22,7 +22,7 @@ GlobalConfig::get_quant_scheme() const { } std::shared_ptr -GlobalConfig::get_rope_scaling() const { +ModelConfig::get_rope_scaling() const { if (!config_json.contains("rope_scaling") || config_json["rope_scaling"].is_null()) { return nullptr; } @@ -67,7 +67,7 @@ GlobalConfig::get_rope_scaling() const { } infinicore::DataType -GlobalConfig::get_dtype() const { +ModelConfig::get_dtype() const { try { std::string dtype_str = this->get("torch_dtype"); if (dtype_str == "float32") { @@ -85,4 +85,4 @@ GlobalConfig::get_dtype() const { throw std::runtime_error("Error getting dtype from config: " + std::string(e.what())); } } -} // namespace infinilm::config::global_config +} // namespace infinilm::config diff --git a/csrc/config/global_config.hpp b/csrc/config/model_config.hpp similarity index 77% rename from csrc/config/global_config.hpp rename to csrc/config/model_config.hpp index b82f716d..2682c6d2 100644 --- a/csrc/config/global_config.hpp +++ b/csrc/config/model_config.hpp @@ -6,15 +6,16 @@ #include #include -namespace infinilm::config::global_config { -struct GlobalConfig { - // Global config is implemented using nlohmann/json and is primarily used for advanced configuration - // beyond the standard model config. It is initialized via GlobalConfig(const std::string& path) +namespace infinilm::config { +class ModelConfig { + // Model config is implemented using nlohmann/json and is primarily used for advanced configuration + // beyond the standard model config. It is initialized via ModelConfig(const std::string& path) // and passed through the InferEngine during inference. public: - GlobalConfig() = default; - GlobalConfig(const nlohmann::json &json) : config_json(json) {}; - GlobalConfig(const std::string &path); + ModelConfig() = default; + // Not Implemented + // ModelConfig(const nlohmann::json &json) : config_json(json) {}; + ModelConfig(const std::string &path); // Template Function to get a value by key with type safety template @@ -57,6 +58,6 @@ struct GlobalConfig { private: nlohmann::json config_json; - quantization::QuantConfig quant_config; + QuantConfig quant_config; }; -} // namespace infinilm::config::global_config +} // namespace infinilm::config diff --git a/csrc/config/quant_config.cpp b/csrc/config/quant_config.cpp index 8984661f..0f154407 100644 --- a/csrc/config/quant_config.cpp +++ b/csrc/config/quant_config.cpp @@ -1,6 +1,6 @@ #include "quant_config.hpp" -namespace infinilm::config::quantization { +namespace infinilm::config { QuantConfig::QuantConfig(const nlohmann::json &json) : quantization_config(json) { this->quantization_method = get_quantization_method(); } @@ -19,4 +19,4 @@ QuantConfig::get_quantization_method() const { return nullptr; // Default case if no matching scheme } -} // namespace infinilm::config::quantization +} // namespace infinilm::config diff --git a/csrc/config/quant_config.hpp b/csrc/config/quant_config.hpp index dec3750e..9fda4224 100644 --- a/csrc/config/quant_config.hpp +++ b/csrc/config/quant_config.hpp @@ -2,7 +2,7 @@ #include "../quantization/quantization.hpp" #include "nlohmann/json.hpp" -namespace infinilm::config::quantization { +namespace infinilm::config { class QuantConfig { // QuantConfig is used to store and parse the "quantization" field from config.json. 
@@ -25,4 +25,4 @@ class QuantConfig { std::shared_ptr quantization_method; }; -} // namespace infinilm::config::quantization +} // namespace infinilm::config diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp index 6a903688..81e67a67 100644 --- a/csrc/engine/infer_engine.cpp +++ b/csrc/engine/infer_engine.cpp @@ -19,8 +19,8 @@ InferEngine::InferEngine( cache_config_ = cache_config->unique_copy(); } - // Load global config if model_path is provided, model_path must be valid, and config.json exists - this->global_config_ = std::make_shared(model_path + "/config.json"); + // Load model config if model_path is provided, model_path must be valid, and config.json exists + this->model_config_ = std::make_shared(model_path + "/config.json"); // Create one RankWorker per rank int world_size = communication_group_.get_world_size(); @@ -28,10 +28,9 @@ InferEngine::InferEngine( workers_.reserve(world_size); for (int r = 0; r < world_size; ++r) { workers_.emplace_back(std::make_unique( - // model_config_, + model_config_, communication_group_.get_rank_info(r), cache_config_ != nullptr ? cache_config_.get() : nullptr, - global_config_, barrier_.get(), enable_graph_compiling)); } diff --git a/csrc/engine/infer_engine.hpp b/csrc/engine/infer_engine.hpp index e8263354..11db3b7b 100644 --- a/csrc/engine/infer_engine.hpp +++ b/csrc/engine/infer_engine.hpp @@ -1,6 +1,6 @@ #pragma once -#include "../config/global_config.hpp" +#include "../config/model_config.hpp" #include "../models/infinilm_model.hpp" #include "../models/llama/llama_config.hpp" #include "distributed/distributed.hpp" @@ -24,7 +24,7 @@ class InferEngine { const distributed::DistConfig &distributed_config = distributed::DistConfig(), infinicore::Device::Type device_type = infinicore::context::getDevice().getType(), const cache::CacheConfig *cache_config = nullptr, - const std::string &modle_path = "", + const std::string &model_path = "", bool enable_graph_compiling = false); // Load a parameter to all workers (each can extract its shard inside RankWorker) @@ -51,9 +51,8 @@ class InferEngine { std::vector> workers_; std::unique_ptr barrier_; distributed::CommunicationGroup communication_group_; - // const InfinilmModel::Config &model_config_; std::unique_ptr cache_config_; - std::shared_ptr global_config_; + std::shared_ptr model_config_; }; } // namespace infinilm::engine diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp index 1a7643aa..605dd1fe 100644 --- a/csrc/engine/rank_worker.cpp +++ b/csrc/engine/rank_worker.cpp @@ -11,17 +11,17 @@ namespace infinilm::engine { RankWorker::RankWorker( + std::shared_ptr model_config, const distributed::RankInfo &rank_info, const cache::CacheConfig *cache_config, - std::shared_ptr global_config, RankBarrier *barrier, bool enable_graph_compiling) - : rank_info_(rank_info), + : model_config_(model_config), + rank_info_(rank_info), enable_graph_compiling_(enable_graph_compiling), job_cmd_(Command::INIT), has_job_(false), job_done_(false), - global_config_(global_config), should_exit_(false), init_done_(false), rng_(std::random_device{}()), @@ -195,7 +195,7 @@ void RankWorker::thread_loop() { infinicore::context::setDevice(rank_info_.device); // Create model using factory (may be expensive) - model_ = InfinilmModelFactory::createModel(global_config_, rank_info_, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr); + model_ = InfinilmModelFactory::createModel(model_config_, rank_info_, pending_cache_config_ != nullptr ? 
pending_cache_config_.get() : nullptr); if (!model_) { throw std::runtime_error("Failed to create model"); } diff --git a/csrc/engine/rank_worker.hpp b/csrc/engine/rank_worker.hpp index 7bc9da1d..bbbfae7a 100644 --- a/csrc/engine/rank_worker.hpp +++ b/csrc/engine/rank_worker.hpp @@ -1,7 +1,7 @@ #pragma once #include "../cache/cache.hpp" -#include "../config/global_config.hpp" +#include "../config/model_config.hpp" #include "../models/model_factory.hpp" #include "compiler/general_compiler.hpp" #include "distributed/distributed.hpp" @@ -57,9 +57,9 @@ class RankWorker { infinicore::Tensor output_ids; }; - RankWorker(const distributed::RankInfo &rank_info, + RankWorker(std::shared_ptr model_config, + const distributed::RankInfo &rank_info, const cache::CacheConfig *cache_config, - std::shared_ptr global_config, RankBarrier *barrier, bool enable_graph_compiling); @@ -99,7 +99,7 @@ class RankWorker { distributed::RankInfo rank_info_; std::shared_ptr model_; std::shared_ptr cache_; - std::shared_ptr global_config_; + std::shared_ptr model_config_; // Graph Compiling bool enable_graph_compiling_; diff --git a/csrc/models/llama/llama.hpp b/csrc/models/llama/llama.hpp index d1d5de85..8402a1ab 100644 --- a/csrc/models/llama/llama.hpp +++ b/csrc/models/llama/llama.hpp @@ -16,7 +16,7 @@ * - LlamaForCausalLM: Complete model with language modeling head */ -#include "../../config/global_config.hpp" +#include "../../config/model_config.hpp" #include "llama_attention.hpp" #include "llama_decoder_layer.hpp" #include "llama_for_causal_lm.hpp" diff --git a/csrc/models/llama/llama_attention.cpp b/csrc/models/llama/llama_attention.cpp index 28af6707..cdf80600 100644 --- a/csrc/models/llama/llama_attention.cpp +++ b/csrc/models/llama/llama_attention.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -17,28 +16,28 @@ namespace infinilm::models::llama { -LlamaAttention::LlamaAttention(std::shared_ptr global_config, +LlamaAttention::LlamaAttention(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx, engine::distributed::RankInfo rank_info) - : global_config_(global_config), + : model_config_(model_config), layer_idx_(layer_idx), - hidden_size_(global_config->get("hidden_size")), - num_attention_heads_(global_config->get("num_attention_heads")), - num_key_value_heads_(global_config->get("num_key_value_heads")), - head_dim_(global_config->get_head_dim()), - kv_dim_(global_config->get_kv_dim()), - use_bias_(global_config->get_or("attention_bias", true)), - use_output_bias_(global_config->get_or("attention_output_bias", false)), - max_position_embeddings_(global_config->get("max_position_embeddings")), + hidden_size_(model_config->get("hidden_size")), + num_attention_heads_(model_config->get("num_attention_heads")), + num_key_value_heads_(model_config->get("num_key_value_heads")), + head_dim_(model_config->get_head_dim()), + kv_dim_(model_config->get_kv_dim()), + use_bias_(model_config->get_or("attention_bias", true)), + use_output_bias_(model_config->get_or("attention_output_bias", false)), + max_position_embeddings_(model_config->get("max_position_embeddings")), rank_info_(rank_info) { - const auto &dtype{global_config_->get_dtype()}; + const auto &dtype{model_config_->get_dtype()}; int tp_rank = rank_info.tp_rank; int tp_size = rank_info.tp_size; - int num_attention_heads = global_config_->get("num_attention_heads"); - int num_key_value_heads = global_config_->get("num_key_value_heads"); + int num_attention_heads = 
model_config_->get("num_attention_heads"); + int num_key_value_heads = model_config_->get("num_key_value_heads"); if ((num_key_value_heads >= tp_size) && (0 == (num_key_value_heads % tp_size))) { this->num_attention_heads_ = num_attention_heads / tp_size; @@ -48,27 +47,27 @@ LlamaAttention::LlamaAttention(std::shared_ptr(head_dim_)); - auto quant_scheme = this->global_config_->get_quant_scheme(); + auto quant_scheme = this->model_config_->get_quant_scheme(); switch (quant_scheme) { case infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8: - INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, global_config_->get("num_attention_heads"), global_config_->get("num_key_value_heads"), use_bias_, + INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), use_bias_, dtype, device, rank_info, quant_scheme); - INFINICORE_NN_MODULE_INIT(o_proj, global_config_->get("num_attention_heads") * head_dim_, hidden_size_, use_output_bias_, + INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, use_output_bias_, dtype, device, tp_rank, tp_size, rank_info.comm, quant_scheme); break; default: - INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, global_config_->get("num_attention_heads"), global_config_->get("num_key_value_heads"), use_bias_, + INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), use_bias_, dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(o_proj, global_config_->get("num_attention_heads") * head_dim_, hidden_size_, use_output_bias_, + INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, use_output_bias_, dtype, device, tp_rank, tp_size, rank_info.comm); break; } - if (global_config_->get("model_type") == "qwen3") { - INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, global_config_->get("rms_norm_eps"), dtype, device); - INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, global_config_->get("rms_norm_eps"), dtype, device); + if (model_config_->get("model_type") == "qwen3") { + INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); + INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); } } @@ -86,7 +85,7 @@ infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_sta // 1. 
Project Q, K, V auto [q, k, v] = qkv_proj_->forward_split(hidden_states_mutable); - if (global_config_->get("model_type") == "qwen3") { + if (model_config_->get("model_type") == "qwen3") { q = q_norm_->forward(q->view({batch_size * seq_len, num_attention_heads_, head_dim_})); k = k_norm_->forward(k->view({batch_size * seq_len, num_key_value_heads_, head_dim_})); } @@ -210,7 +209,7 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd auto k_reshaped = k->view({seq_len, num_key_value_heads_, head_dim_}); auto v_reshaped = v->view({seq_len, num_key_value_heads_, head_dim_}); - if (global_config_->get("model_type") == "qwen3") { + if (model_config_->get("model_type") == "qwen3") { q_reshaped = q_norm_->forward(q_reshaped); k_reshaped = k_norm_->forward(k_reshaped); } diff --git a/csrc/models/llama/llama_attention.hpp b/csrc/models/llama/llama_attention.hpp index 20df89df..70e43bdc 100644 --- a/csrc/models/llama/llama_attention.hpp +++ b/csrc/models/llama/llama_attention.hpp @@ -1,7 +1,7 @@ #pragma once #include "../../cache/kv_cache.hpp" -#include "../../config/global_config.hpp" +#include "../../config/model_config.hpp" #include "../../engine/distributed/distributed.hpp" #include "../../layers/fused_linear.hpp" #include "llama_config.hpp" @@ -37,7 +37,7 @@ class LlamaAttention : public infinicore::nn::Module { * @param layer_idx Layer index for cache access * @param dtype Optional data type for model parameters (defaults to F32) */ - LlamaAttention(std::shared_ptr global_config, + LlamaAttention(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); @@ -102,7 +102,7 @@ class LlamaAttention : public infinicore::nn::Module { std::shared_ptr rotary_emb_; private: - std::shared_ptr global_config_; + std::shared_ptr model_config_; size_t layer_idx_; // Layer index for cache access size_t hidden_size_; size_t num_attention_heads_; diff --git a/csrc/models/llama/llama_decoder_layer.cpp b/csrc/models/llama/llama_decoder_layer.cpp index b5d1b82d..9d03059d 100644 --- a/csrc/models/llama/llama_decoder_layer.cpp +++ b/csrc/models/llama/llama_decoder_layer.cpp @@ -1,26 +1,24 @@ #include "llama_decoder_layer.hpp" #include "infinicore/nn/rmsnorm.hpp" #include "infinicore/ops.hpp" - #include namespace infinilm::models::llama { -LlamaDecoderLayer::LlamaDecoderLayer(std::shared_ptr global_config, +LlamaDecoderLayer::LlamaDecoderLayer(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx, - engine::distributed::RankInfo rank_info) : global_config_(global_config), layer_idx_(layer_idx), rank_info_(rank_info) { - const auto &dtype{global_config_->get_dtype()}; - + engine::distributed::RankInfo rank_info) : model_config_(model_config), layer_idx_(layer_idx), rank_info_(rank_info) { + const auto &dtype{model_config_->get_dtype()}; // Initialize layer normalization layers - INFINICORE_NN_MODULE_INIT(input_layernorm, global_config_->get("hidden_size"), global_config_->get("rms_norm_eps"), + INFINICORE_NN_MODULE_INIT(input_layernorm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), dtype, device); - INFINICORE_NN_MODULE_INIT(post_attention_layernorm, global_config_->get("hidden_size"), global_config_->get("rms_norm_eps"), + INFINICORE_NN_MODULE_INIT(post_attention_layernorm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), dtype, device); // Initialize attention and MLP modules - INFINICORE_NN_MODULE_INIT(self_attn, 
global_config, device, layer_idx, rank_info_); - INFINICORE_NN_MODULE_INIT(mlp, global_config, device, rank_info_); + INFINICORE_NN_MODULE_INIT(self_attn, model_config_, device, layer_idx, rank_info_); + INFINICORE_NN_MODULE_INIT(mlp, model_config_, device, rank_info_); } std::tuple diff --git a/csrc/models/llama/llama_decoder_layer.hpp b/csrc/models/llama/llama_decoder_layer.hpp index f632645b..1ba58a30 100644 --- a/csrc/models/llama/llama_decoder_layer.hpp +++ b/csrc/models/llama/llama_decoder_layer.hpp @@ -33,7 +33,7 @@ class LlamaDecoderLayer : public infinicore::nn::Module { * @param layer_idx Layer index for cache management and debugging * @param dtype Optional data type for model parameters (defaults to F32) */ - LlamaDecoderLayer(std::shared_ptr global_config, + LlamaDecoderLayer(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); @@ -79,7 +79,7 @@ class LlamaDecoderLayer : public infinicore::nn::Module { INFINICORE_NN_MODULE(LlamaAttention, self_attn); INFINICORE_NN_MODULE(LlamaMLP, mlp); engine::distributed::RankInfo rank_info_; - std::shared_ptr global_config_; + std::shared_ptr model_config_; private: size_t layer_idx_; // Layer index for cache management and debugging diff --git a/csrc/models/llama/llama_for_causal_lm.cpp b/csrc/models/llama/llama_for_causal_lm.cpp index c8e261a7..060737d1 100644 --- a/csrc/models/llama/llama_for_causal_lm.cpp +++ b/csrc/models/llama/llama_for_causal_lm.cpp @@ -2,27 +2,25 @@ #include "infinicore/context/context.hpp" #include "infinicore/nn/linear.hpp" #include "infinicore/ops.hpp" -#include namespace infinilm::models::llama { -LlamaForCausalLM::LlamaForCausalLM(std::shared_ptr global_config, +LlamaForCausalLM::LlamaForCausalLM(std::shared_ptr model_config, const infinicore::Device &device, engine::distributed::RankInfo rank_info) { // Initialize module's device_ member device_ = device; - const auto &dtype{global_config->get_dtype()}; + const auto &dtype{model_config->get_dtype()}; // Initialize base model - INFINICORE_NN_MODULE_INIT(model, global_config, device, rank_info); - + INFINICORE_NN_MODULE_INIT(model, model_config, device, rank_info); // Initialize language modeling head // Note: If tie_word_embeddings is true, we would share weights with embed_tokens // For now, we create a separate linear layer - INFINICORE_NN_MODULE_INIT(lm_head, global_config->get("hidden_size"), global_config->get("vocab_size"), false, + INFINICORE_NN_MODULE_INIT(lm_head, model_config->get("hidden_size"), model_config->get("vocab_size"), false, dtype, device); } diff --git a/csrc/models/llama/llama_for_causal_lm.hpp b/csrc/models/llama/llama_for_causal_lm.hpp index 1609a59a..59dac7ae 100644 --- a/csrc/models/llama/llama_for_causal_lm.hpp +++ b/csrc/models/llama/llama_for_causal_lm.hpp @@ -28,7 +28,7 @@ class LlamaForCausalLM : public InfinilmModel { * @param config Model configuration * @param device Device to create tensors on */ - LlamaForCausalLM(std::shared_ptr global_config, + LlamaForCausalLM(std::shared_ptr model_config, const infinicore::Device &device, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); diff --git a/csrc/models/llama/llama_mlp.cpp b/csrc/models/llama/llama_mlp.cpp index 30b26a6b..f9601118 100644 --- a/csrc/models/llama/llama_mlp.cpp +++ b/csrc/models/llama/llama_mlp.cpp @@ -1,24 +1,23 @@ #include "llama_mlp.hpp" #include "infinicore/nn/linear.hpp" #include "infinicore/ops.hpp" -#include namespace 
infinilm::models::llama { -LlamaMLP::LlamaMLP(std::shared_ptr global_config, +LlamaMLP::LlamaMLP(std::shared_ptr model_config, const infinicore::Device &device, engine::distributed::RankInfo rank_info) - : global_config_(global_config), hidden_size_(global_config->get("hidden_size")), - intermediate_size_(global_config->get("intermediate_size")), - use_bias_(global_config->get_or("mlp_bias", false)), rank_info_(rank_info) { - const auto &dtype{global_config_->get_dtype()}; + : model_config_(model_config), hidden_size_(model_config->get("hidden_size")), + intermediate_size_(model_config->get("intermediate_size")), + use_bias_(model_config->get_or("mlp_bias", false)), rank_info_(rank_info) { + + const auto &dtype{model_config_->get_dtype()}; int tp_rank = rank_info.tp_rank; int tp_size = rank_info.tp_size; // Initialize projection layers - auto quant_scheme = this->global_config_->get_quant_scheme(); - // std::cout << "LlamaMLP quant_scheme: " << static_cast(quant_scheme) << std::endl; + auto quant_scheme = this->model_config_->get_quant_scheme(); switch (quant_scheme) { case infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8: INFINILM_GATE_UP_LINEAR_W8A8_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, use_bias_, diff --git a/csrc/models/llama/llama_mlp.hpp b/csrc/models/llama/llama_mlp.hpp index af7467a0..45df91f5 100644 --- a/csrc/models/llama/llama_mlp.hpp +++ b/csrc/models/llama/llama_mlp.hpp @@ -3,7 +3,7 @@ #include "../../layers/fused_linear.hpp" #include "llama_config.hpp" -#include "../../config/global_config.hpp" +#include "../../config/model_config.hpp" #include "infinicore/device.hpp" #include "infinicore/nn/linear.hpp" #include "infinicore/nn/module.hpp" @@ -34,7 +34,7 @@ class LlamaMLP : public infinicore::nn::Module { * @param device Device to create tensors on * @param dtype Optional data type for model parameters (defaults to F32) */ - LlamaMLP(std::shared_ptr global_config, + LlamaMLP(std::shared_ptr model_config, const infinicore::Device &device, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); @@ -59,7 +59,7 @@ class LlamaMLP : public infinicore::nn::Module { size_t intermediate_size_; bool use_bias_; - std::shared_ptr global_config_; + std::shared_ptr model_config_; }; } // namespace infinilm::models::llama diff --git a/csrc/models/llama/llama_model.cpp b/csrc/models/llama/llama_model.cpp index 94b45e4b..0771fdd7 100644 --- a/csrc/models/llama/llama_model.cpp +++ b/csrc/models/llama/llama_model.cpp @@ -3,37 +3,36 @@ #include "infinicore/nn/rmsnorm.hpp" #include "infinicore/nn/rope.hpp" #include "infinicore/ops.hpp" -#include namespace infinilm::models::llama { -LlamaModel::LlamaModel(std::shared_ptr global_config, +LlamaModel::LlamaModel(std::shared_ptr model_config, const infinicore::Device &device, engine::distributed::RankInfo rank_info) - : global_config_(global_config), rank_info_(rank_info) { - const auto &dtype{global_config_->get_dtype()}; + : model_config_(model_config), rank_info_(rank_info) { + const auto &dtype{model_config_->get_dtype()}; // Initialize token embeddings - INFINICORE_NN_MODULE_INIT(embed_tokens, global_config_->get("vocab_size"), global_config_->get("hidden_size"), + INFINICORE_NN_MODULE_INIT(embed_tokens, model_config_->get("vocab_size"), model_config_->get("hidden_size"), std::nullopt, dtype, device); // Initialize decoder layers with layer indices // TODO: Update INFINICORE_NN_MODULE_VEC_INIT macro to support per-layer constructor arguments // (e.g., via a factory function or lambda 
that receives the layer index) // Currently, we can't use the macro because each layer needs a different layer_idx - layers_.reserve(global_config_->get("num_hidden_layers")); - for (size_t i = 0; i < global_config_->get("num_hidden_layers"); ++i) { + layers_.reserve(model_config_->get("num_hidden_layers")); + for (size_t i = 0; i < model_config_->get("num_hidden_layers"); ++i) { layers_.push_back(this->register_module( - "layers." + std::to_string(i), global_config_, device, i, rank_info)); + "layers." + std::to_string(i), model_config_, device, i, rank_info)); } // Initialize final layer normalization - INFINICORE_NN_MODULE_INIT(norm, global_config_->get("hidden_size"), global_config_->get("rms_norm_eps"), + INFINICORE_NN_MODULE_INIT(norm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), dtype, device); // Initialize Rotary Position Embeddings (shared across all layers) // Use GPT-J-style inverse frequencies (default) and GPT_NEOX rotation pairing - INFINICORE_NN_MODULE_INIT(rotary_emb, global_config_->get_head_dim(), global_config_->get("max_position_embeddings"), - global_config_->get("rope_theta"), infinicore::nn::RoPE::Algo::GPT_NEOX, - dtype, device, global_config_->get_rope_scaling()); + INFINICORE_NN_MODULE_INIT(rotary_emb, model_config_->get_head_dim(), model_config_->get("max_position_embeddings"), + model_config_->get("rope_theta"), infinicore::nn::RoPE::Algo::GPT_NEOX, + dtype, device, model_config_->get_rope_scaling()); for (auto &layer : layers_) { if (layer) { @@ -80,23 +79,23 @@ void LlamaModel::reset_cache(const cache::CacheConfig *cache_config) { } if (auto kv_cache_config = dynamic_cast(cache_config)) { kv_cache_ = std::make_shared( - global_config_->get_head_dim(), - global_config_->get_head_dim(), - global_config_->get("num_key_value_heads"), - global_config_->get("num_key_value_heads"), - global_config_->get("num_hidden_layers"), - global_config_->get("max_position_embeddings"), - global_config_->get_dtype(), + model_config_->get_head_dim(), + model_config_->get_head_dim(), + model_config_->get("num_key_value_heads"), + model_config_->get("num_key_value_heads"), + model_config_->get("num_hidden_layers"), + model_config_->get("max_position_embeddings"), + model_config_->get_dtype(), *kv_cache_config, rank_info_); } else if (auto paged_kv_cache_config = dynamic_cast(cache_config)) { kv_cache_ = std::make_shared( - global_config_->get_head_dim(), - global_config_->get_head_dim(), - global_config_->get("num_key_value_heads"), - global_config_->get("num_key_value_heads"), - global_config_->get("num_hidden_layers"), - global_config_->get_dtype(), + model_config_->get_head_dim(), + model_config_->get_head_dim(), + model_config_->get("num_key_value_heads"), + model_config_->get("num_key_value_heads"), + model_config_->get("num_hidden_layers"), + model_config_->get_dtype(), *paged_kv_cache_config, rank_info_); } else { diff --git a/csrc/models/llama/llama_model.hpp b/csrc/models/llama/llama_model.hpp index 3a54d0a8..11a8547d 100644 --- a/csrc/models/llama/llama_model.hpp +++ b/csrc/models/llama/llama_model.hpp @@ -37,7 +37,7 @@ class LlamaModel : public infinicore::nn::Module { * @param device Device to create tensors on * @param dtype Optional data type for model parameters (defaults to F32) */ - LlamaModel(std::shared_ptr global_config, + LlamaModel(std::shared_ptr model_config, const infinicore::Device &device, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); @@ -63,7 +63,7 @@ class LlamaModel : public infinicore::nn::Module 
{ void reset_cache(const cache::CacheConfig *cache_config); // Module information - size_t num_layers() const { return global_config_->get("num_hidden_layers"); } + size_t num_layers() const { return model_config_->get("num_hidden_layers"); } protected: // Token embeddings @@ -83,7 +83,7 @@ class LlamaModel : public infinicore::nn::Module { std::shared_ptr kv_cache_; private: - std::shared_ptr global_config_; + std::shared_ptr model_config_; }; } // namespace infinilm::models::llama diff --git a/csrc/models/model_factory.cpp b/csrc/models/model_factory.cpp index 4af23fa8..38d119cf 100644 --- a/csrc/models/model_factory.cpp +++ b/csrc/models/model_factory.cpp @@ -3,7 +3,7 @@ namespace infinilm { std::shared_ptr InfinilmModelFactory::createModel( - std::shared_ptr global_config, + std::shared_ptr model_config, engine::distributed::RankInfo rank_info, const cache::CacheConfig *cache) { @@ -12,7 +12,7 @@ std::shared_ptr InfinilmModelFactory::createModel( if (true) { // const auto &llama_config = *llama_config_ptr; model = std::make_shared( - global_config, rank_info.device, rank_info); + model_config, rank_info.device, rank_info); } else { throw std::invalid_argument("InfinilmModelFactory::createModel: Unsupported model config type"); } diff --git a/csrc/models/model_factory.hpp b/csrc/models/model_factory.hpp index 4bd566e3..627d9447 100644 --- a/csrc/models/model_factory.hpp +++ b/csrc/models/model_factory.hpp @@ -1,6 +1,6 @@ #pragma once -#include "../config/global_config.hpp" +#include "../config/model_config.hpp" #include "infinilm_model.hpp" #include "../engine/distributed/distributed.hpp" @@ -9,7 +9,7 @@ namespace infinilm { class InfinilmModelFactory { public: static std::shared_ptr createModel( - std::shared_ptr global_config, + std::shared_ptr model_config, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), const cache::CacheConfig *cache = nullptr); }; diff --git a/csrc/pybind11/engine/engine.hpp b/csrc/pybind11/engine/engine.hpp index 7a4b5a99..3a0471ac 100644 --- a/csrc/pybind11/engine/engine.hpp +++ b/csrc/pybind11/engine/engine.hpp @@ -32,21 +32,18 @@ inline void bind_infer_engine(py::module &m) { py::class_> infer_engine(m, "InferEngine"); infer_engine .def(py::init([]( - // const InfinilmModel::Config &cfg, const distributed::DistConfig &dist, infinicore::Device::Type dev, std::shared_ptr cache_cfg, - const std::string &modle_path, + const std::string &model_path, bool enable_graph_compiling) { return std::make_shared( - // cfg, dist, dev, cache_cfg ? 
cache_cfg.get() : nullptr, - modle_path, + model_path, enable_graph_compiling); }), - // py::arg("config"), py::arg("distributed_config") = distributed::DistConfig(), py::arg("device_type") = infinicore::context::getDevice().getType(), py::arg("cache_config") = py::none(), diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py index e28e9a7c..e41a89c7 100644 --- a/python/infinilm/infer_engine.py +++ b/python/infinilm/infer_engine.py @@ -34,7 +34,6 @@ def __init__( if device is None: device = infinicore.device() - super().__init__( # self.config, distributed_config._underlying, @@ -43,7 +42,6 @@ def __init__( model_path, enable_graph_compiling, ) - self.use_cache = False self.enable_paged_attn = isinstance(cache_config, PagedKVCacheConfig) diff --git a/python/infinilm/models/llama/configuration_llama.py b/python/infinilm/models/llama/configuration_llama.py index 5dce2b04..15776c84 100644 --- a/python/infinilm/models/llama/configuration_llama.py +++ b/python/infinilm/models/llama/configuration_llama.py @@ -244,4 +244,4 @@ def __init__( eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs, - ) \ No newline at end of file + ) From 36d173e504f5cc9eac5cd69631f60142bc6543f1 Mon Sep 17 00:00:00 2001 From: qinyiqun Date: Fri, 30 Jan 2026 15:05:26 +0800 Subject: [PATCH 11/11] Refactor: add new API alongside legacy interfaces with deprecation warnings --- =0.34.0, | 0 csrc/engine/infer_engine.cpp | 45 ++++++++- csrc/engine/infer_engine.hpp | 20 ++++ csrc/engine/rank_worker.cpp | 38 ++++++- csrc/engine/rank_worker.hpp | 10 +- csrc/layers/fused_linear.cpp | 117 +++++++++++++++++++--- csrc/layers/fused_linear.hpp | 58 ++++++++--- csrc/models/llama/llama_attention.cpp | 68 +++++++++++-- csrc/models/llama/llama_attention.hpp | 18 ++++ csrc/models/llama/llama_config.hpp | 7 +- csrc/models/llama/llama_decoder_layer.cpp | 28 ++++++ csrc/models/llama/llama_decoder_layer.hpp | 17 ++++ csrc/models/llama/llama_for_causal_lm.cpp | 30 ++++++ csrc/models/llama/llama_for_causal_lm.hpp | 16 +++ csrc/models/llama/llama_mlp.cpp | 41 ++++++-- csrc/models/llama/llama_mlp.hpp | 16 +++ csrc/models/llama/llama_model.cpp | 48 +++++++++ csrc/models/llama/llama_model.hpp | 18 ++++ csrc/models/model_factory.cpp | 35 ++++++- csrc/models/model_factory.hpp | 17 ++++ csrc/pybind11/engine/engine.hpp | 44 +++++++- examples/bench.py | 18 +++- python/infinilm/infer_engine.py | 11 +- 23 files changed, 663 insertions(+), 57 deletions(-) create mode 100644 =0.34.0, diff --git a/=0.34.0, b/=0.34.0, new file mode 100644 index 00000000..e69de29b diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp index 81e67a67..c86b6bf1 100644 --- a/csrc/engine/infer_engine.cpp +++ b/csrc/engine/infer_engine.cpp @@ -1,27 +1,65 @@ #include "infer_engine.hpp" #include "spdlog/spdlog.h" -#include namespace infinilm::engine { //------------------------------------------------------ // Constructor //------------------------------------------------------ +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
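The pattern this commit describes, a legacy config-struct overload kept alongside a new path-based overload intended for new code, can be reduced to a small standalone sketch. The class and members below are placeholders, not the real InferEngine:

#include <string>

// Placeholder for the legacy, in-memory configuration struct.
struct LegacyConfig {
    int hidden_size = 0;
};

class Engine {
public:
    /**
     * @deprecated Kept only for backward compatibility; prefer the
     * path-based overload below, which can also pick up quantization
     * settings from files under the model directory.
     */
    explicit Engine(const LegacyConfig &cfg) : hidden_size_(cfg.hidden_size) {}

    // New interface: everything is derived from files under model_path.
    explicit Engine(const std::string &model_path) : model_path_(model_path) {}

private:
    int hidden_size_ = 0;
    std::string model_path_;
};

int main() {
    LegacyConfig cfg{4096};
    Engine old_style(cfg);              // still compiles, but documented as deprecated
    Engine new_style("/models/llama");  // preferred construction path (hypothetical path)
    (void)old_style;
    (void)new_style;
}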
+ * Removal target: v0.2.0 (Q2 2026) + */ InferEngine::InferEngine( + const InfinilmModel::Config &config, const distributed::DistConfig &distributed_config, infinicore::Device::Type device_type, const cache::CacheConfig *cache_config, + bool enable_graph_compiling) // Changed parameter + : communication_group_(distributed_config, device_type), + legacy_model_config_(config) { + + if (cache_config != nullptr) { + cache_config_ = cache_config->unique_copy(); + } + // Create one RankWorker per rank + int world_size = communication_group_.get_world_size(); + barrier_ = std::make_unique((size_t)world_size); + workers_.reserve(world_size); + for (int r = 0; r < world_size; ++r) { + workers_.emplace_back(std::make_unique( + legacy_model_config_, + communication_group_.get_rank_info(r), + cache_config_ != nullptr ? cache_config_.get() : nullptr, + barrier_.get(), + enable_graph_compiling)); + } + + // Compile the model on all workers + this->compile(); +} + +InferEngine::InferEngine( const std::string &model_path, + const distributed::DistConfig &distributed_config, + infinicore::Device::Type device_type, + const cache::CacheConfig *cache_config, bool enable_graph_compiling) // Changed parameter : communication_group_(distributed_config, device_type) { - if (cache_config != nullptr) { cache_config_ = cache_config->unique_copy(); } // Load model config if model_path is provided, model_path must be valid, and config.json exists this->model_config_ = std::make_shared(model_path + "/config.json"); - // Create one RankWorker per rank int world_size = communication_group_.get_world_size(); barrier_ = std::make_unique((size_t)world_size); @@ -34,7 +72,6 @@ InferEngine::InferEngine( barrier_.get(), enable_graph_compiling)); } - // Compile the model on all workers this->compile(); } diff --git a/csrc/engine/infer_engine.hpp b/csrc/engine/infer_engine.hpp index 11db3b7b..22e428ec 100644 --- a/csrc/engine/infer_engine.hpp +++ b/csrc/engine/infer_engine.hpp @@ -20,11 +20,30 @@ class InferEngine { using Output = RankWorker::Output; // Updated constructor: accept CacheConfig instead of CacheType + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
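The constructor bodies above fan out one RankWorker per rank, all sharing a single barrier sized to the world. That fan-out in isolation, with Worker and Barrier standing in for the real classes:

#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

// Stand-in for the synchronization object shared by every rank.
struct Barrier {
    explicit Barrier(std::size_t n) : participants(n) {}
    std::size_t participants;
};

// Stand-in for RankWorker: each instance owns exactly one rank.
struct Worker {
    Worker(int rank, Barrier *barrier) : rank(rank), barrier(barrier) {}
    int rank;
    Barrier *barrier;
};

int main() {
    const int world_size = 4;

    // One barrier for the whole world, created before any worker.
    auto barrier = std::make_unique<Barrier>(static_cast<std::size_t>(world_size));

    std::vector<std::unique_ptr<Worker>> workers;
    workers.reserve(world_size);
    for (int r = 0; r < world_size; ++r) {
        workers.emplace_back(std::make_unique<Worker>(r, barrier.get()));
    }

    std::cout << "created " << workers.size() << " workers sharing one barrier\n";
}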
+ * Removal target: v0.2.0 (Q2 2026) + */ InferEngine( + const InfinilmModel::Config &config, const distributed::DistConfig &distributed_config = distributed::DistConfig(), infinicore::Device::Type device_type = infinicore::context::getDevice().getType(), const cache::CacheConfig *cache_config = nullptr, + bool enable_graph_compiling = false); + + InferEngine( const std::string &model_path = "", + const distributed::DistConfig &distributed_config = distributed::DistConfig(), + infinicore::Device::Type device_type = infinicore::context::getDevice().getType(), + const cache::CacheConfig *cache_config = nullptr, bool enable_graph_compiling = false); // Load a parameter to all workers (each can extract its shard inside RankWorker) @@ -52,6 +71,7 @@ class InferEngine { std::unique_ptr barrier_; distributed::CommunicationGroup communication_group_; std::unique_ptr cache_config_; + const InfinilmModel::Config &legacy_model_config_ = InfinilmModel::Config(); std::shared_ptr model_config_; }; diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp index 605dd1fe..02b8a907 100644 --- a/csrc/engine/rank_worker.cpp +++ b/csrc/engine/rank_worker.cpp @@ -4,12 +4,48 @@ #include "infinicore/ops.hpp" -#include #include #include namespace infinilm::engine { +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
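The path-based constructor builds its model config from model_path + "/config.json". Below is one plausible shape for such a loader, written against nlohmann::json (referenced elsewhere in this repository); the class name and the typed get<T>(key) lookup mirror the style used in the diff but are illustrative, not the real ModelConfig:

#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>

#include "nlohmann/json.hpp"

// Illustrative loader: parse <model_path>/config.json and expose typed lookups.
class JsonModelConfig {
public:
    explicit JsonModelConfig(const std::string &model_path) {
        std::ifstream in(model_path + "/config.json");
        if (!in) {
            throw std::runtime_error("cannot open " + model_path + "/config.json");
        }
        in >> doc_;
    }

    template <typename T>
    T get(const std::string &key) const {
        return doc_.at(key).get<T>();
    }

private:
    nlohmann::json doc_;
};

int main() {
    try {
        JsonModelConfig cfg("/models/llama"); // hypothetical path
        std::cout << "hidden_size = " << cfg.get<int>("hidden_size") << "\n";
    } catch (const std::exception &e) {
        std::cerr << e.what() << "\n";
    }
}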
+ * Removal target: v0.2.0 (Q2 2026) + */ +RankWorker::RankWorker(const InfinilmModel::Config &model_config, + const distributed::RankInfo &rank_info, + const cache::CacheConfig *cache_config, + RankBarrier *barrier, + bool enable_graph_compiling) + : legacy_model_config_(model_config), + rank_info_(rank_info), + enable_graph_compiling_(enable_graph_compiling), + job_cmd_(Command::INIT), + has_job_(false), + job_done_(false), + should_exit_(false), + init_done_(false), + barrier_(barrier) { + if (cache_config != nullptr) { + pending_cache_config_ = cache_config->unique_copy(); + } + // start the thread + thread_ = std::thread(&RankWorker::thread_loop, this); + + // Wait until the worker thread finishes initialization (model created) + std::unique_lock lk(mutex_); + cv_.wait(lk, [&] { return init_done_; }); +} + RankWorker::RankWorker( std::shared_ptr model_config, const distributed::RankInfo &rank_info, diff --git a/csrc/engine/rank_worker.hpp b/csrc/engine/rank_worker.hpp index bbbfae7a..f738ec1f 100644 --- a/csrc/engine/rank_worker.hpp +++ b/csrc/engine/rank_worker.hpp @@ -57,6 +57,12 @@ class RankWorker { infinicore::Tensor output_ids; }; + RankWorker(const InfinilmModel::Config &model_config, + const distributed::RankInfo &rank_info, + const cache::CacheConfig *cache_config, + RankBarrier *barrier, + bool enable_graph_compiling); + RankWorker(std::shared_ptr model_config, const distributed::RankInfo &rank_info, const cache::CacheConfig *cache_config, @@ -95,11 +101,11 @@ class RankWorker { private: // Worker properties - // const InfinilmModel::Config &model_config_; + const InfinilmModel::Config &legacy_model_config_ = InfinilmModel::Config(); + std::shared_ptr model_config_; distributed::RankInfo rank_info_; std::shared_ptr model_; std::shared_ptr cache_; - std::shared_ptr model_config_; // Graph Compiling bool enable_graph_compiling_; diff --git a/csrc/layers/fused_linear.cpp b/csrc/layers/fused_linear.cpp index e108b275..41d5bb21 100644 --- a/csrc/layers/fused_linear.cpp +++ b/csrc/layers/fused_linear.cpp @@ -6,6 +6,18 @@ namespace infinilm::layers { // --------------------------------------------------------- // QKV Parallel Linear // --------------------------------------------------------- +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
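The RankWorker constructor above starts its worker thread and then blocks on a condition variable until the thread reports init_done_, so callers never see a half-initialized worker. The same handshake reduced to standard-library pieces:

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

class WorkerHandshake {
public:
    WorkerHandshake() {
        // Start the worker thread first...
        thread_ = std::thread(&WorkerHandshake::thread_loop, this);

        // ...then wait until initialization on that thread is done.
        std::unique_lock<std::mutex> lk(mutex_);
        cv_.wait(lk, [&] { return init_done_; });
    }

    ~WorkerHandshake() {
        if (thread_.joinable()) {
            thread_.join();
        }
    }

private:
    void thread_loop() {
        // ... build the model, allocate caches, etc. ...
        {
            std::lock_guard<std::mutex> lk(mutex_);
            init_done_ = true;
        }
        cv_.notify_all();
    }

    std::thread thread_;
    std::mutex mutex_;
    std::condition_variable cv_;
    bool init_done_ = false;
};

int main() {
    WorkerHandshake worker; // constructor returns only after thread_loop signalled
    std::cout << "worker initialized\n";
}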
+ * Removal target: v0.2.0 (Q2 2026) + */ QKVParallelLinear::QKVParallelLinear(size_t hidden_size, size_t head_dim, size_t num_q_head, @@ -13,14 +25,12 @@ QKVParallelLinear::QKVParallelLinear(size_t hidden_size, bool bias, const infinicore::DataType &dtype, const infinicore::Device &device, - engine::distributed::RankInfo rank_info, - std::optional quant_scheme) + engine::distributed::RankInfo rank_info) : QKVParallelLinear(hidden_size, head_dim, head_dim, head_dim, num_q_head, num_kv_head, num_kv_head, bias, bias, bias, - dtype, device, rank_info, - quant_scheme) {} + dtype, device, rank_info) {} QKVParallelLinear::QKVParallelLinear(size_t hidden_size, size_t q_dim, size_t k_dim, size_t v_dim, @@ -28,8 +38,7 @@ QKVParallelLinear::QKVParallelLinear(size_t hidden_size, bool q_bias, bool k_bias, bool v_bias, const infinicore::DataType &dtype, const infinicore::Device &device, - engine::distributed::RankInfo rank_info, - std::optional quant_scheme) + engine::distributed::RankInfo rank_info) : infinicore::nn::ColumnParallelLinear( hidden_size, num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim, @@ -37,8 +46,62 @@ QKVParallelLinear::QKVParallelLinear(size_t hidden_size, dtype, device, rank_info.tp_rank, - rank_info.tp_size, - quant_scheme), + rank_info.tp_size), + q_dim_(q_dim), + k_dim_(k_dim), + v_dim_(v_dim), + num_q_head_(num_q_head), + num_k_head_(num_k_head), + num_v_head_(num_v_head), + q_bias_(q_bias), + k_bias_(k_bias), + v_bias_(v_bias) { + if (num_q_head % tp_size_ != 0 || num_k_head % tp_size_ != 0 || num_v_head % tp_size_ != 0) { + throw std::runtime_error("QKVParallelLinear: num_[q|k|v]_head must be divisible by tp_size"); + } + + if ((q_bias_ != k_bias_) || (k_bias_ != v_bias_)) { + throw std::runtime_error("q_bias, k_bias, v_bias must all match"); + } + + q_out_size_ = num_q_head_ * q_dim_ / tp_size_; + k_out_size_ = num_k_head_ * k_dim_ / tp_size_; + v_out_size_ = num_v_head_ * v_dim_ / tp_size_; +} + +QKVParallelLinear::QKVParallelLinear(size_t hidden_size, + size_t head_dim, + size_t num_q_head, + size_t num_kv_head, + infinicore::nn::QuantScheme quant_scheme, + bool bias, + const infinicore::DataType &dtype, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : QKVParallelLinear(hidden_size, + head_dim, head_dim, head_dim, + num_q_head, num_kv_head, num_kv_head, + bias, bias, bias, + quant_scheme, + dtype, device, rank_info) {} + +QKVParallelLinear::QKVParallelLinear(size_t hidden_size, + size_t q_dim, size_t k_dim, size_t v_dim, + size_t num_q_head, size_t num_k_head, size_t num_v_head, + bool q_bias, bool k_bias, bool v_bias, + infinicore::nn::QuantScheme quant_scheme, + const infinicore::DataType &dtype, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : infinicore::nn::ColumnParallelLinear( + hidden_size, + num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim, + quant_scheme, + (q_bias || k_bias || v_bias), + dtype, + device, + rank_info.tp_rank, + rank_info.tp_size), q_dim_(q_dim), k_dim_(k_dim), v_dim_(v_dim), @@ -141,18 +204,44 @@ bool QKVParallelLinear::has_v_bias() const { return v_bias_; } // --------------------------------------------------------- // Gate-Up Parallel Linear // --------------------------------------------------------- +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). 
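The restored constructor checks that every head count is divisible by the tensor-parallel world size and then derives the per-rank q/k/v slice widths of the fused projection. The arithmetic on its own, with example numbers that are not taken from any particular model:

#include <cstddef>
#include <iostream>
#include <stdexcept>

struct QkvSplit {
    std::size_t q_out, k_out, v_out;
};

// Per-rank output widths of a fused column-parallel QKV projection.
QkvSplit qkv_split(std::size_t head_dim, std::size_t num_q_head,
                   std::size_t num_kv_head, std::size_t tp_size) {
    if (num_q_head % tp_size != 0 || num_kv_head % tp_size != 0) {
        throw std::runtime_error("head counts must be divisible by tp_size");
    }
    return {num_q_head * head_dim / tp_size,
            num_kv_head * head_dim / tp_size,
            num_kv_head * head_dim / tp_size};
}

int main() {
    // Example: 32 query heads, 8 KV heads (GQA), head_dim 128, 2-way tensor parallel.
    QkvSplit s = qkv_split(128, 32, 8, 2);
    std::cout << "q/k/v rows per rank: " << s.q_out << "/" << s.k_out << "/" << s.v_out
              << ", fused rows per rank: " << (s.q_out + s.k_out + s.v_out) << "\n";
}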
+ * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias, const infinicore::DataType &dtype, const infinicore::Device &device, - engine::distributed::RankInfo rank_info, - std::optional quant_scheme) - : GateUpParallelLinear(hidden_size, intermediate_size, bias, bias, dtype, device, rank_info, quant_scheme) { + engine::distributed::RankInfo rank_info) + : GateUpParallelLinear(hidden_size, intermediate_size, bias, bias, dtype, device, rank_info) { +} + +GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, + const infinicore::DataType &dtype, const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) { + if (gate_bias_ != up_bias_) { + throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time"); + } +} + +GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, infinicore::nn::QuantScheme quant_scheme, bool bias, + const infinicore::DataType &dtype, const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : GateUpParallelLinear(hidden_size, intermediate_size, bias, bias, quant_scheme, dtype, device, rank_info) { } GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, + infinicore::nn::QuantScheme quant_scheme, const infinicore::DataType &dtype, const infinicore::Device &device, - engine::distributed::RankInfo rank_info, - std::optional quant_scheme) - : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size, quant_scheme), gate_bias_(gate_bias), up_bias_(up_bias) { + engine::distributed::RankInfo rank_info) + : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, quant_scheme, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) { if (gate_bias_ != up_bias_) { throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time"); } diff --git a/csrc/layers/fused_linear.hpp b/csrc/layers/fused_linear.hpp index f3d95bae..9656e6ca 100644 --- a/csrc/layers/fused_linear.hpp +++ b/csrc/layers/fused_linear.hpp @@ -13,8 +13,7 @@ class QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { bool q_bias, bool k_bias, bool v_bias, const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - std::optional quant_scheme = std::nullopt); + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); // A more common case where all heads have the same dimension explicit QKVParallelLinear(size_t hidden_size, @@ -23,8 
+22,26 @@ class QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { bool bias = false, const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - std::optional quant_scheme = std::nullopt); + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + + explicit QKVParallelLinear(size_t hidden_size, + size_t q_dim, size_t k_dim, size_t v_dim, + size_t num_q_head, size_t num_k_head, size_t num_v_head, + bool q_bias, bool k_bias, bool v_bias, + infinicore::nn::QuantScheme quant_scheme, + const infinicore::DataType &dtype = infinicore::DataType::F32, + const infinicore::Device &device = infinicore::Device(), + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + + // A more common case where all heads have the same dimension + explicit QKVParallelLinear(size_t hidden_size, + size_t head_dim, + size_t num_q_head, size_t num_kv_head, + infinicore::nn::QuantScheme quant_scheme, + bool bias = false, + const infinicore::DataType &dtype = infinicore::DataType::F32, + const infinicore::Device &device = infinicore::Device(), + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); std::tuple forward_split(infinicore::Tensor &input); @@ -62,15 +79,36 @@ class QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { class GateUpParallelLinear : public infinicore::nn::ColumnParallelLinear { public: + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
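GateUpParallelLinear fuses the gate and up projections into one column-parallel matmul with 2 * intermediate_size output features, and forward_split later hands back the two halves. A sketch of just the size bookkeeping, assuming (by analogy with the q/k/v slicing above) that each rank holds its gate shard followed by its up shard:

#include <cstddef>
#include <iostream>

struct GateUpShape {
    std::size_t fused_rows; // output rows owned by this rank
    std::size_t half;       // width of each of the gate / up halves
};

GateUpShape gate_up_shape(std::size_t intermediate_size, std::size_t tp_size) {
    std::size_t fused_rows = 2 * intermediate_size / tp_size;
    return {fused_rows, fused_rows / 2};
}

int main() {
    // Example: intermediate_size 11008 (Llama-7B-like), 2-way tensor parallel.
    GateUpShape s = gate_up_shape(11008, 2);
    std::cout << "fused rows per rank: " << s.fused_rows
              << ", gate rows: [0, " << s.half << "), up rows: [" << s.half
              << ", " << s.fused_rows << ")\n";
}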
+ * Removal target: v0.2.0 (Q2 2026) + */ GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias = false, const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - std::optional quant_scheme = std::nullopt); + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + + GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, + const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + + GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, infinicore::nn::QuantScheme quant_scheme, + bool bias = false, + const infinicore::DataType &dtype = infinicore::DataType::F32, + const infinicore::Device &device = infinicore::Device(), + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, + infinicore::nn::QuantScheme quant_scheme, const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - std::optional quant_scheme = std::nullopt); + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); std::tuple forward_split(infinicore::Tensor &input); @@ -119,16 +157,12 @@ class GateUpParallelLinear : public infinicore::nn::ColumnParallelLinear { // ========================= QKV Quantization ================================== #define INFINILM_QKV_LINEAR_W8A8_INIT(name, q_name, k_name, v_name, ...) \ name##_ = std::make_shared(__VA_ARGS__); \ - /* 注册 Q 权重 */ \ this->register_parameter(std::string(q_name) + ".weight", name##_->get_q_weight()); \ this->register_parameter(std::string(q_name) + ".weight_scale", name##_->get_q_weight_scale()); \ - /* 注册 K 权重 */ \ this->register_parameter(std::string(k_name) + ".weight", name##_->get_k_weight()); \ this->register_parameter(std::string(k_name) + ".weight_scale", name##_->get_k_weight_scale()); \ - /* 注册 V 权重 */ \ this->register_parameter(std::string(v_name) + ".weight", name##_->get_v_weight()); \ this->register_parameter(std::string(v_name) + ".weight_scale", name##_->get_v_weight_scale()); \ - /* bias 保持原样 */ \ if (name##_->has_q_bias()) \ this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias()); \ if (name##_->has_k_bias()) \ diff --git a/csrc/models/llama/llama_attention.cpp b/csrc/models/llama/llama_attention.cpp index cdf80600..a4e82811 100644 --- a/csrc/models/llama/llama_attention.cpp +++ b/csrc/models/llama/llama_attention.cpp @@ -16,6 +16,62 @@ namespace infinilm::models::llama { +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
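The W8A8 init macro above registers, for each of q/k/v, both a weight view and a matching weight_scale view carved out of the fused tensors. Reduced to plain containers, the layout is an int8 weight of shape [q_out + k_out + v_out, in_features] plus one dequantization scale per output row, with each logical projection addressed by a row offset. The names here are illustrative, not the infinicore API:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// A "view" over rows [offset, offset + rows) of the fused, per-row-quantized weight.
struct RowView {
    std::size_t offset;
    std::size_t rows;
};

struct FusedW8Weight {
    std::size_t in_features;
    std::vector<int8_t> weight; // (q_out + k_out + v_out) * in_features, row-major
    std::vector<float> scale;   // one scale per output row, same row order as weight

    RowView q_view(std::size_t q_out) const { return {0, q_out}; }
    RowView k_view(std::size_t q_out, std::size_t k_out) const { return {q_out, k_out}; }
    RowView v_view(std::size_t q_out, std::size_t k_out, std::size_t v_out) const {
        return {q_out + k_out, v_out};
    }
};

int main() {
    std::size_t q_out = 2048, k_out = 512, v_out = 512, in_features = 4096;
    FusedW8Weight w{in_features,
                    std::vector<int8_t>((q_out + k_out + v_out) * in_features, 0),
                    std::vector<float>(q_out + k_out + v_out, 1.0f)};

    RowView v = w.v_view(q_out, k_out, v_out);
    std::cout << "v_proj occupies rows [" << v.offset << ", " << v.offset + v.rows
              << ") of both the fused weight and the fused scale\n";
}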
+ * Removal target: v0.2.0 (Q2 2026) + */ +LlamaAttention::LlamaAttention(const LlamaConfig &config, + const infinicore::Device &device, + size_t layer_idx, + engine::distributed::RankInfo rank_info) + : layer_idx_(layer_idx), + hidden_size_(config.hidden_size), + num_attention_heads_(config.num_attention_heads), + num_key_value_heads_(config.num_key_value_heads), + head_dim_(config.head_dim), + kv_dim_(config.kv_dim()), + use_bias_(config.attention_bias), + use_output_bias_(config.attention_output_bias), + use_qk_norm_(config.qk_norm), + max_position_embeddings_(config.max_position_embeddings), rank_info_(rank_info) { + const auto &dtype{config.dtype}; + + int tp_rank = rank_info.tp_rank; + int tp_size = rank_info.tp_size; + + int num_attention_heads = config.num_attention_heads; + int num_key_value_heads = config.num_key_value_heads; + + if ((num_key_value_heads >= tp_size) && (0 == (num_key_value_heads % tp_size))) { + this->num_attention_heads_ = num_attention_heads / tp_size; + this->num_key_value_heads_ = num_key_value_heads / tp_size; + } else { + throw std::runtime_error("num_attention_heads / tp_size error."); + } + scaling_ = 1.0f / std::sqrt(static_cast(head_dim_)); + + // Initialize projection layers + INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, config.num_attention_heads, config.num_key_value_heads, use_bias_, + dtype, device, rank_info); + // Output projection uses attention_output_bias (can be different from qkv) + INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads * head_dim_, hidden_size_, use_output_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); + + // Initialize qk RMSNorm + if (use_qk_norm_) { + INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, config.rms_norm_eps, dtype, device); + INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, config.rms_norm_eps, dtype, device); + } +} + LlamaAttention::LlamaAttention(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx, @@ -50,18 +106,18 @@ LlamaAttention::LlamaAttention(std::shared_ptr mo auto quant_scheme = this->model_config_->get_quant_scheme(); switch (quant_scheme) { case infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8: - INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), use_bias_, - dtype, device, rank_info, quant_scheme); + INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), quant_scheme, use_bias_, + dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, use_output_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm, quant_scheme); + INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, quant_scheme, use_output_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); break; default: - INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), use_bias_, + INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), quant_scheme, use_bias_, dtype, device, rank_info); - INFINICORE_NN_MODULE_INIT(o_proj, 
model_config_->get("num_attention_heads") * head_dim_, hidden_size_, use_output_bias_, + INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, quant_scheme, use_output_bias_, dtype, device, tp_rank, tp_size, rank_info.comm); break; } diff --git a/csrc/models/llama/llama_attention.hpp b/csrc/models/llama/llama_attention.hpp index 70e43bdc..45bca14a 100644 --- a/csrc/models/llama/llama_attention.hpp +++ b/csrc/models/llama/llama_attention.hpp @@ -37,6 +37,23 @@ class LlamaAttention : public infinicore::nn::Module { * @param layer_idx Layer index for cache access * @param dtype Optional data type for model parameters (defaults to F32) */ + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ + LlamaAttention(const LlamaConfig &config, + const infinicore::Device &device, + size_t layer_idx, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + LlamaAttention(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx, @@ -111,6 +128,7 @@ class LlamaAttention : public infinicore::nn::Module { size_t kv_dim_; bool use_bias_; // Bias for Q/K/V projections bool use_output_bias_; // Bias for output projection (o_proj) + bool use_qk_norm_; // Whether to use QK RMSNorm size_t max_position_embeddings_; // For cache initialization (deprecated, kept for compatibility) float scaling_; diff --git a/csrc/models/llama/llama_config.hpp b/csrc/models/llama/llama_config.hpp index fe5ba7e9..f2df38e5 100644 --- a/csrc/models/llama/llama_config.hpp +++ b/csrc/models/llama/llama_config.hpp @@ -6,8 +6,6 @@ #include #include "../infinilm_model.hpp" -#include "infinicore/nn/quantization.hpp" -#include "nlohmann/json.hpp" #include @@ -72,8 +70,7 @@ struct LlamaConfig : public InfinilmModel::Config { * @brief Compute key-value dimension for Grouped Query Attention (GQA) * @return The dimension for key/value projections */ - size_t - kv_dim() const { + size_t kv_dim() const { return hidden_size * num_key_value_heads / num_attention_heads; } @@ -95,4 +92,4 @@ struct LlamaConfig : public InfinilmModel::Config { } }; -} // namespace infinilm::models::llama +} // namespace infinilm::models::llama \ No newline at end of file diff --git a/csrc/models/llama/llama_decoder_layer.cpp b/csrc/models/llama/llama_decoder_layer.cpp index 9d03059d..208771d2 100644 --- a/csrc/models/llama/llama_decoder_layer.cpp +++ b/csrc/models/llama/llama_decoder_layer.cpp @@ -4,6 +4,34 @@ #include namespace infinilm::models::llama { +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
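Both the attention constructor above and the MLP constructor elsewhere in this series choose their projection layers with the same switch on the scheme read from the model config. Stripped of the init macros, the control flow is just:

#include <iostream>

// Simplified stand-in for infinicore::nn::QuantScheme.
enum class QuantScheme { NONE, COMPRESSED_TENSOR_W8A8I8 };

// Label of the init path that would run; in the real constructors the branches
// expand to the INFINILM_*_W8A8_INIT and INFINILM_*_INIT macro calls.
const char *select_init_path(QuantScheme scheme) {
    switch (scheme) {
    case QuantScheme::COMPRESSED_TENSOR_W8A8I8:
        return "w8a8 projections (int8 weight + per-row scale)";
    default:
        return "default full-precision projections";
    }
}

int main() {
    std::cout << select_init_path(QuantScheme::COMPRESSED_TENSOR_W8A8I8) << "\n";
    std::cout << select_init_path(QuantScheme::NONE) << "\n";
}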
+ * Removal target: v0.2.0 (Q2 2026) + */ +LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config, + const infinicore::Device &device, + size_t layer_idx, + engine::distributed::RankInfo rank_info) : layer_idx_(layer_idx), rank_info_(rank_info) { + const auto &dtype{config.dtype}; + + // Initialize layer normalization layers + INFINICORE_NN_MODULE_INIT(input_layernorm, config.hidden_size, config.rms_norm_eps, + dtype, device); + INFINICORE_NN_MODULE_INIT(post_attention_layernorm, config.hidden_size, config.rms_norm_eps, + dtype, device); + + // Initialize attention and MLP modules + INFINICORE_NN_MODULE_INIT(self_attn, config, device, layer_idx, rank_info_); + INFINICORE_NN_MODULE_INIT(mlp, config, device, rank_info_); +} LlamaDecoderLayer::LlamaDecoderLayer(std::shared_ptr model_config, const infinicore::Device &device, diff --git a/csrc/models/llama/llama_decoder_layer.hpp b/csrc/models/llama/llama_decoder_layer.hpp index 1ba58a30..a56aec03 100644 --- a/csrc/models/llama/llama_decoder_layer.hpp +++ b/csrc/models/llama/llama_decoder_layer.hpp @@ -33,6 +33,23 @@ class LlamaDecoderLayer : public infinicore::nn::Module { * @param layer_idx Layer index for cache management and debugging * @param dtype Optional data type for model parameters (defaults to F32) */ + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ + LlamaDecoderLayer(const LlamaConfig &config, + const infinicore::Device &device, + size_t layer_idx, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + LlamaDecoderLayer(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx, diff --git a/csrc/models/llama/llama_for_causal_lm.cpp b/csrc/models/llama/llama_for_causal_lm.cpp index 060737d1..cb386814 100644 --- a/csrc/models/llama/llama_for_causal_lm.cpp +++ b/csrc/models/llama/llama_for_causal_lm.cpp @@ -4,6 +4,36 @@ #include "infinicore/ops.hpp" namespace infinilm::models::llama { +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
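The decoder-layer constructor above only wires up the two RMSNorms plus the attention and MLP modules; the forward pass is not part of this diff. For a standard Llama-style layer the ordering is the pre-norm residual pattern, sketched shape-free below with placeholder types, and offered as an assumption rather than a description of the real forward():

#include <iostream>
#include <string>

// Placeholder "tensor": a label so the data flow is visible when run.
struct Tensor { std::string trace; };

struct RMSNorm   { Tensor operator()(const Tensor &x) const { return {"norm(" + x.trace + ")"}; } };
struct Attention { Tensor operator()(const Tensor &x) const { return {"attn(" + x.trace + ")"}; } };
struct MLP       { Tensor operator()(const Tensor &x) const { return {"mlp(" + x.trace + ")"}; } };

Tensor add(const Tensor &a, const Tensor &b) { return {a.trace + " + " + b.trace}; }

// Assumed pre-norm residual ordering of a Llama-style decoder layer.
Tensor decoder_layer(const Tensor &x,
                     const RMSNorm &input_ln, const Attention &attn,
                     const RMSNorm &post_ln, const MLP &mlp) {
    Tensor h = add(x, attn(input_ln(x))); // residual around attention
    return add(h, mlp(post_ln(h)));       // residual around the MLP
}

int main() {
    std::cout << decoder_layer(Tensor{"x"}, RMSNorm{}, Attention{}, RMSNorm{}, MLP{}).trace << "\n";
}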
+ * Removal target: v0.2.0 (Q2 2026) + */ +LlamaForCausalLM::LlamaForCausalLM(const LlamaConfig &config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) { + + // Initialize module's device_ member + device_ = device; + + const auto &dtype{config.dtype}; + + // Initialize base model + INFINICORE_NN_MODULE_INIT(model, config, device, rank_info); + + // Initialize language modeling head + // Note: If tie_word_embeddings is true, we would share weights with embed_tokens + // For now, we create a separate linear layer + INFINICORE_NN_MODULE_INIT(lm_head, config.hidden_size, config.vocab_size, false, + dtype, device); +} LlamaForCausalLM::LlamaForCausalLM(std::shared_ptr model_config, const infinicore::Device &device, diff --git a/csrc/models/llama/llama_for_causal_lm.hpp b/csrc/models/llama/llama_for_causal_lm.hpp index 59dac7ae..a6e078e7 100644 --- a/csrc/models/llama/llama_for_causal_lm.hpp +++ b/csrc/models/llama/llama_for_causal_lm.hpp @@ -28,6 +28,22 @@ class LlamaForCausalLM : public InfinilmModel { * @param config Model configuration * @param device Device to create tensors on */ + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ + LlamaForCausalLM(const LlamaConfig &config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + LlamaForCausalLM(std::shared_ptr model_config, const infinicore::Device &device, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); diff --git a/csrc/models/llama/llama_mlp.cpp b/csrc/models/llama/llama_mlp.cpp index f9601118..89866a16 100644 --- a/csrc/models/llama/llama_mlp.cpp +++ b/csrc/models/llama/llama_mlp.cpp @@ -3,6 +3,35 @@ #include "infinicore/ops.hpp" namespace infinilm::models::llama { +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
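The constructor comment above notes that with tie_word_embeddings the lm_head would share weights with embed_tokens instead of owning its own matrix. A minimal illustration of that choice using shared ownership; this sketches the concept only, not how infinicore ties parameters:

#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

using Matrix = std::vector<float>; // flattened [vocab_size, hidden_size]

struct LmHead {
    std::shared_ptr<Matrix> weight;
};

// If tying is requested, alias the embedding matrix; otherwise allocate a new one.
LmHead make_lm_head(bool tie_word_embeddings, const std::shared_ptr<Matrix> &embed_weight,
                    std::size_t vocab_size, std::size_t hidden_size) {
    if (tie_word_embeddings) {
        return {embed_weight};
    }
    return {std::make_shared<Matrix>(vocab_size * hidden_size, 0.0f)};
}

int main() {
    const std::size_t vocab_size = 1000, hidden_size = 64; // toy sizes
    auto embed = std::make_shared<Matrix>(vocab_size * hidden_size, 0.0f);

    std::cout << std::boolalpha
              << "tied shares storage: "
              << (make_lm_head(true, embed, vocab_size, hidden_size).weight == embed) << "\n"
              << "untied shares storage: "
              << (make_lm_head(false, embed, vocab_size, hidden_size).weight == embed) << "\n";
}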
+ * Removal target: v0.2.0 (Q2 2026) + */ +LlamaMLP::LlamaMLP(const LlamaConfig &config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : hidden_size_(config.hidden_size), + intermediate_size_(config.intermediate_size), + use_bias_(config.mlp_bias), rank_info_(rank_info) { + const auto &dtype{config.dtype}; + + int tp_rank = rank_info.tp_rank; + int tp_size = rank_info.tp_size; + + // Initialize projection layers + INFINILM_GATE_UP_LINEAR_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, use_bias_, + dtype, device, rank_info_); + INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, use_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); +} LlamaMLP::LlamaMLP(std::shared_ptr model_config, const infinicore::Device &device, @@ -20,16 +49,16 @@ LlamaMLP::LlamaMLP(std::shared_ptr model_config, auto quant_scheme = this->model_config_->get_quant_scheme(); switch (quant_scheme) { case infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8: - INFINILM_GATE_UP_LINEAR_W8A8_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, use_bias_, - dtype, device, rank_info_, quant_scheme); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, use_bias_, - dtype, device, tp_rank, tp_size, rank_info.comm, quant_scheme); + INFINILM_GATE_UP_LINEAR_W8A8_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, quant_scheme, use_bias_, + dtype, device, rank_info_); + INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, quant_scheme, use_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); break; default: - INFINILM_GATE_UP_LINEAR_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, use_bias_, + INFINILM_GATE_UP_LINEAR_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, quant_scheme, use_bias_, dtype, device, rank_info_); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, use_bias_, + INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, quant_scheme, use_bias_, dtype, device, tp_rank, tp_size, rank_info.comm); break; } diff --git a/csrc/models/llama/llama_mlp.hpp b/csrc/models/llama/llama_mlp.hpp index 45df91f5..179ea217 100644 --- a/csrc/models/llama/llama_mlp.hpp +++ b/csrc/models/llama/llama_mlp.hpp @@ -34,6 +34,22 @@ class LlamaMLP : public infinicore::nn::Module { * @param device Device to create tensors on * @param dtype Optional data type for model parameters (defaults to F32) */ + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
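This MLP owns gate, up and down projections; its forward pass is not shown in the diff, but the standard Llama formulation those three projections implement is down(silu(gate(x)) * up(x)) with an elementwise product. A scalar sketch of the combine step, with plain loops standing in for the fused gate_up matmul and the down projection left as a comment:

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// SiLU activation: x * sigmoid(x).
float silu(float x) { return x / (1.0f + std::exp(-x)); }

// Elementwise SwiGLU combine of already-projected gate and up activations.
// In the real module these would be the two halves returned by
// GateUpParallelLinear::forward_split, before down_proj is applied.
std::vector<float> swiglu(const std::vector<float> &gate, const std::vector<float> &up) {
    std::vector<float> out(gate.size());
    for (std::size_t i = 0; i < gate.size(); ++i) {
        out[i] = silu(gate[i]) * up[i];
    }
    return out;
}

int main() {
    std::vector<float> gate{1.0f, -2.0f, 0.5f};
    std::vector<float> up{0.5f, 1.0f, 2.0f};
    for (float v : swiglu(gate, up)) {
        std::cout << v << " ";
    }
    std::cout << "\n"; // this vector would then be fed through down_proj
}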
+ * Removal target: v0.2.0 (Q2 2026) + */ + LlamaMLP(const LlamaConfig &config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + LlamaMLP(std::shared_ptr model_config, const infinicore::Device &device, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); diff --git a/csrc/models/llama/llama_model.cpp b/csrc/models/llama/llama_model.cpp index 0771fdd7..e8360a87 100644 --- a/csrc/models/llama/llama_model.cpp +++ b/csrc/models/llama/llama_model.cpp @@ -3,8 +3,56 @@ #include "infinicore/nn/rmsnorm.hpp" #include "infinicore/nn/rope.hpp" #include "infinicore/ops.hpp" +#include namespace infinilm::models::llama { +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ +LlamaModel::LlamaModel(const LlamaConfig &config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : config_(config), rank_info_(rank_info) { + const auto &dtype{config.dtype}; + // Initialize token embeddings + INFINICORE_NN_MODULE_INIT(embed_tokens, config.vocab_size, config.hidden_size, + std::nullopt, dtype, device); + + // Initialize decoder layers with layer indices + // TODO: Update INFINICORE_NN_MODULE_VEC_INIT macro to support per-layer constructor arguments + // (e.g., via a factory function or lambda that receives the layer index) + // Currently, we can't use the macro because each layer needs a different layer_idx + layers_.reserve(config.num_hidden_layers); + for (size_t i = 0; i < config.num_hidden_layers; ++i) { + layers_.push_back(this->register_module( + "layers." + std::to_string(i), config, device, i, rank_info)); + } + + // Initialize final layer normalization + INFINICORE_NN_MODULE_INIT(norm, config.hidden_size, config.rms_norm_eps, + dtype, device); + + // Initialize Rotary Position Embeddings (shared across all layers) + // Use GPT-J-style inverse frequencies (default) and GPT_NEOX rotation pairing + INFINICORE_NN_MODULE_INIT(rotary_emb, config.head_dim, config.max_position_embeddings, + config.rope_theta, infinicore::nn::RoPE::Algo::GPT_NEOX, + dtype, device, config.rope_scaling); + + for (auto &layer : layers_) { + if (layer) { + layer->set_rotary_emb(rotary_emb_); + } + } +} LlamaModel::LlamaModel(std::shared_ptr model_config, const infinicore::Device &device, diff --git a/csrc/models/llama/llama_model.hpp b/csrc/models/llama/llama_model.hpp index 11a8547d..f293a97a 100644 --- a/csrc/models/llama/llama_model.hpp +++ b/csrc/models/llama/llama_model.hpp @@ -37,6 +37,22 @@ class LlamaModel : public infinicore::nn::Module { * @param device Device to create tensors on * @param dtype Optional data type for model parameters (defaults to F32) */ + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). 
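After building the decoder layers, the model constructor above creates one rotary-embedding module and hands the same instance to every layer through set_rotary_emb. The sharing pattern in isolation, with RotaryEmbedding and Layer as placeholders:

#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

struct RotaryEmbedding {
    explicit RotaryEmbedding(std::size_t head_dim) : head_dim(head_dim) {}
    std::size_t head_dim;
};

struct Layer {
    void set_rotary_emb(std::shared_ptr<RotaryEmbedding> emb) { rotary_emb = std::move(emb); }
    std::shared_ptr<RotaryEmbedding> rotary_emb;
};

int main() {
    const std::size_t num_layers = 4;
    std::vector<std::shared_ptr<Layer>> layers;
    layers.reserve(num_layers);
    for (std::size_t i = 0; i < num_layers; ++i) {
        layers.push_back(std::make_shared<Layer>());
    }

    // One rotary table for the whole model; every layer keeps a handle to it.
    auto rotary = std::make_shared<RotaryEmbedding>(128);
    for (auto &layer : layers) {
        if (layer) {
            layer->set_rotary_emb(rotary);
        }
    }

    std::cout << "rotary use_count: " << rotary.use_count() << "\n"; // 1 + num_layers
}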
+ * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ + LlamaModel(const LlamaConfig &config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + LlamaModel(std::shared_ptr model_config, const infinicore::Device &device, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); @@ -83,6 +99,8 @@ class LlamaModel : public infinicore::nn::Module { std::shared_ptr kv_cache_; private: + LlamaConfig config_; + std::shared_ptr model_config_; }; diff --git a/csrc/models/model_factory.cpp b/csrc/models/model_factory.cpp index 38d119cf..fa117227 100644 --- a/csrc/models/model_factory.cpp +++ b/csrc/models/model_factory.cpp @@ -2,15 +2,46 @@ #include "llama/llama.hpp" namespace infinilm { +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ +std::shared_ptr InfinilmModelFactory::createModel( + const InfinilmModel::Config &config, + engine::distributed::RankInfo rank_info, + const cache::CacheConfig *cache) { + + std::shared_ptr model; + if (const auto llama_config_ptr = dynamic_cast(&config)) { + const auto &llama_config = *llama_config_ptr; + model = std::make_shared( + llama_config, rank_info.device, rank_info); + } else { + throw std::invalid_argument("InfinilmModelFactory::createModel: Unsupported model config type"); + } + + if (cache) { + model->reset_cache(cache); + } + + return model; +} + std::shared_ptr InfinilmModelFactory::createModel( std::shared_ptr model_config, engine::distributed::RankInfo rank_info, const cache::CacheConfig *cache) { std::shared_ptr model; - //****************************NEED TO BE FIXED */ if (true) { - // const auto &llama_config = *llama_config_ptr; model = std::make_shared( model_config, rank_info.device, rank_info); } else { diff --git a/csrc/models/model_factory.hpp b/csrc/models/model_factory.hpp index 627d9447..02385029 100644 --- a/csrc/models/model_factory.hpp +++ b/csrc/models/model_factory.hpp @@ -8,6 +8,23 @@ namespace infinilm { class InfinilmModelFactory { public: + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
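The legacy createModel above picks the concrete model by dynamic_cast-ing the config reference and throws for unknown config types, while the new overload currently keeps an unconditional if (true) placeholder. The dispatch skeleton on its own, with placeholder types:

#include <iostream>
#include <memory>
#include <stdexcept>

// Placeholder config and model hierarchies.
struct ModelConfig { virtual ~ModelConfig() = default; };
struct LlamaConfig : ModelConfig {};

struct Model { virtual ~Model() = default; virtual const char *name() const = 0; };
struct LlamaModel : Model { const char *name() const override { return "llama"; } };

std::shared_ptr<Model> create_model(const ModelConfig &config) {
    if (dynamic_cast<const LlamaConfig *>(&config) != nullptr) {
        return std::make_shared<LlamaModel>();
    }
    throw std::invalid_argument("create_model: unsupported model config type");
}

int main() {
    LlamaConfig cfg;
    std::cout << create_model(cfg)->name() << "\n";
}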
+ * Removal target: v0.2.0 (Q2 2026) + */ + static std::shared_ptr createModel( + const InfinilmModel::Config &config, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + const cache::CacheConfig *cache = nullptr); + static std::shared_ptr createModel( std::shared_ptr model_config, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), diff --git a/csrc/pybind11/engine/engine.hpp b/csrc/pybind11/engine/engine.hpp index 3a0471ac..78af5daa 100644 --- a/csrc/pybind11/engine/engine.hpp +++ b/csrc/pybind11/engine/engine.hpp @@ -32,22 +32,62 @@ inline void bind_infer_engine(py::module &m) { py::class_> infer_engine(m, "InferEngine"); infer_engine .def(py::init([]( + const InfinilmModel::Config &cfg, const distributed::DistConfig &dist, infinicore::Device::Type dev, std::shared_ptr cache_cfg, - const std::string &model_path, bool enable_graph_compiling) { return std::make_shared( + cfg, dist, dev, cache_cfg ? cache_cfg.get() : nullptr, - model_path, enable_graph_compiling); }), + py::arg("config"), py::arg("distributed_config") = distributed::DistConfig(), py::arg("device_type") = infinicore::context::getDevice().getType(), py::arg("cache_config") = py::none(), + py::arg("enable_graph_compiling") = false) + .def("load_param", &InferEngine::load_param, + py::arg("name"), py::arg("param"), + "Load a parameter tensor into all workers (each worker picks its shard)") + .def("state_dict", [](InferEngine &self) { + py::list state_dict_tp_all; + for (const auto &state_dict_tp : self.state_dict()) { + py::dict result; + for (const auto &[name, param] : state_dict_tp) { + result[py::cast(name)] = infinicore::Tensor(param); + } + state_dict_tp_all.append(result); + } + return state_dict_tp_all; + }) + .def("forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments") + .def("reset_cache", [](InferEngine &self, std::shared_ptr cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none()) + .def("get_cache_config", [](const InferEngine &self) { + auto cfg = self.get_cache_config(); + return std::shared_ptr(std::move(cfg->unique_copy())); }) + .def("__repr__", [](const InferEngine &self) { return ""; }); + + infer_engine + .def(py::init([]( + const std::string &model_path, + const distributed::DistConfig &dist, + infinicore::Device::Type dev, + std::shared_ptr cache_cfg, + bool enable_graph_compiling) { + return std::make_shared( + model_path, + dist, + dev, + cache_cfg ? 
cache_cfg.get() : nullptr, + enable_graph_compiling); + }), py::arg("model_path") = "", + py::arg("distributed_config") = distributed::DistConfig(), + py::arg("device_type") = infinicore::context::getDevice().getType(), + py::arg("cache_config") = py::none(), py::arg("enable_graph_compiling") = false) .def("load_param", &InferEngine::load_param, py::arg("name"), py::arg("param"), diff --git a/examples/bench.py b/examples/bench.py index 2b968fa6..957f9215 100644 --- a/examples/bench.py +++ b/examples/bench.py @@ -272,6 +272,13 @@ def __init__( # 创建 tokenizer # ---------------------------------------------------------------------------- # tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + if tokenizer.pad_token is None: + if tokenizer.eos_token is not None: + tokenizer.pad_token = tokenizer.eos_token + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + tokenizer.add_special_tokens({'pad_token': '[PAD]'}) # ---------------------------------------------------------------------------- # # token编码 @@ -285,7 +292,16 @@ def __init__( ] # print(input_content, end="", flush=True) - input_ids_list = tokenizer.batch_encode_plus(input_content)["input_ids"] + # Support Transformers >= 5.0 for batch_encode_plus deprecation + encoding = tokenizer( + input_content, + padding=True, + truncation=True, + max_length=2048, + return_tensors="pt" + ) + + input_ids_list = encoding["input_ids"] self.model = model self.tokenizer = tokenizer diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py index e41a89c7..213e62ad 100644 --- a/python/infinilm/infer_engine.py +++ b/python/infinilm/infer_engine.py @@ -34,12 +34,19 @@ def __init__( if device is None: device = infinicore.device() + + # super().__init__( + # self.config, + # distributed_config._underlying, + # device._underlying.type, + # cache_config, + # enable_graph_compiling, + # ) super().__init__( - # self.config, + model_path, distributed_config._underlying, device._underlying.type, cache_config, - model_path, enable_graph_compiling, ) self.use_cache = False
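The bindings above register two py::init overloads on the same InferEngine class, so Python callers can construct it either from the legacy config object or from a model path. A self-contained pybind11 module showing the same two-overload pattern on a toy class; the module and class names are invented for the sketch:

#include <memory>
#include <string>

#include <pybind11/pybind11.h>

namespace py = pybind11;

// Toy class with a legacy and a path-based construction path.
struct ToyEngine {
    explicit ToyEngine(int hidden_size)
        : description("legacy, hidden_size=" + std::to_string(hidden_size)) {}
    explicit ToyEngine(const std::string &model_path)
        : description("path-based, " + model_path) {}
    std::string description;
};

PYBIND11_MODULE(toy_engine, m) {
    py::class_<ToyEngine, std::shared_ptr<ToyEngine>> cls(m, "ToyEngine");

    // Legacy overload, kept for backward compatibility.
    cls.def(py::init([](int hidden_size) {
                return std::make_shared<ToyEngine>(hidden_size);
            }),
            py::arg("hidden_size"));

    // New overload; pybind11 tries the registered overloads in order at call time.
    cls.def(py::init([](const std::string &model_path) {
                return std::make_shared<ToyEngine>(model_path);
            }),
            py::arg("model_path") = "");

    cls.def_readonly("description", &ToyEngine::description);
}

From Python, ToyEngine(4096) would hit the legacy overload and ToyEngine(model_path="/models/llama") the new one, mirroring how the two InferEngine constructors coexist after this patch.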