From 0947b0c46214a64ef2a749f521d7cde2bf808b6b Mon Sep 17 00:00:00 2001 From: luoxiaojian Date: Tue, 10 Mar 2026 14:34:12 +0800 Subject: [PATCH 1/5] enable batch compute for int8 l2 dist --- src/core/metric/quantized_integer_metric_batch.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/core/metric/quantized_integer_metric_batch.h b/src/core/metric/quantized_integer_metric_batch.h index e9e63cef9..ced11f323 100644 --- a/src/core/metric/quantized_integer_metric_batch.h +++ b/src/core/metric/quantized_integer_metric_batch.h @@ -55,6 +55,11 @@ struct BaseDistanceBatchWithScoreUnquantized { return CosineMinusInnerProductDistanceBatchWithScoreUnquantized< ValueType, BatchSize, PrefetchStep>::ComputeBatch(m, q, num, dim, out); + } else if constexpr (std::is_same_v, + SquaredEuclidean>) { + return SquaredEuclideanDistanceBatchWithScoreUnquantized< + ValueType, BatchSize, PrefetchStep>::ComputeBatch(m, q, num, dim, + out); } _ComputeBatch(m, q, num, dim, out); @@ -234,7 +239,7 @@ struct SquaredEuclideanDistanceBatchWithScoreUnquantized struct SquaredEuclideanDistanceBatchWithScoreUnquantized { - static void ComputeBatch(const int8_t **vecs, const int8_t *query, + static void ComputeBatch(const uint8_t **vecs, const uint8_t *query, size_t num_vecs, size_t dim, float *results) { const size_t original_dim = dim - 32; const size_t original_dim_in_uint8_array = original_dim >> 1; @@ -351,4 +356,4 @@ struct MipsSquaredEuclideanDistanceBatchWithScoreUnquantized Date: Tue, 10 Mar 2026 17:07:12 +0800 Subject: [PATCH 2/5] enable icelake --- cmake/option.cmake | 11 +++++++--- src/core/metric/quantized_integer_metric.cc | 8 ++++++-- .../metric/quantized_integer_metric_batch.h | 20 ++++++++++++------- .../metric/quantized_integer_metric_matrix.h | 8 ++++---- src/core/quantizer/cosine_converter.cc | 6 +++--- src/core/quantizer/cosine_reformer.cc | 2 +- .../quantizer/integer_quantizer_converter.cc | 4 +++- .../quantizer/integer_quantizer_reformer.cc | 2 +- src/core/quantizer/record_quantizer.h | 12 ++++++++--- 9 files changed, 48 insertions(+), 25 deletions(-) diff --git a/cmake/option.cmake b/cmake/option.cmake index 01388564e..f9c531702 100644 --- a/cmake/option.cmake +++ b/cmake/option.cmake @@ -9,6 +9,7 @@ option(ENABLE_HASWELL "Enable Intel Haswell CPU microarchitecture" OFF) option(ENABLE_BROADWELL "Enable Intel Broadwell CPU microarchitecture" OFF) option(ENABLE_SKYLAKE "Enable Intel Skylake CPU microarchitecture" OFF) option(ENABLE_SKYLAKE_AVX512 "Enable Intel Skylake Server CPU microarchitecture" OFF) +option(ENABLE_ICELAKE "Enable Intel Icelake CPU microarchitecture" OFF) option(ENABLE_SAPPHIRERAPIDS "Enable Intel Sapphire Rapids Server CPU microarchitecture" OFF) option(ENABLE_EMERALDRAPIDS "Enable Intel Emerald Rapids Server CPU microarchitecture" OFF) option(ENABLE_GRANITERAPIDS "Enable Intel Granite Rapids Server CPU microarchitecture" OFF) @@ -34,8 +35,8 @@ option(ENABLE_OPENMP "Enable OpenMP support" OFF) set(ARCH_OPTIONS ENABLE_NEHALEM ENABLE_SANDYBRIDGE ENABLE_HASWELL ENABLE_BROADWELL ENABLE_SKYLAKE - ENABLE_SKYLAKE_AVX512 ENABLE_SAPPHIRERAPIDS ENABLE_EMERALDRAPIDS ENABLE_GRANITERAPIDS - ENABLE_ZEN1 ENABLE_ZEN2 ENABLE_ZEN3 + ENABLE_SKYLAKE_AVX512 ENABLE_ICELAKE ENABLE_SAPPHIRERAPIDS ENABLE_EMERALDRAPIDS + ENABLE_GRANITERAPIDS ENABLE_ZEN1 ENABLE_ZEN2 ENABLE_ZEN3 ENABLE_ARMV8A ENABLE_ARMV8.1A ENABLE_ARMV8.2A ENABLE_ARMV8.3A ENABLE_ARMV8.4A ENABLE_ARMV8.5A ENABLE_ARMV8.6A ENABLE_NATIVE @@ -94,7 +95,7 @@ endfunction() function(_detect_x86_best) set(_x86_flags "graniterapids" "emeraldrapids" "sapphirerapids" - "skylake-avx512" "skylake" + "icelake-server" "skylake-avx512" "skylake" "broadwell" "haswell" "sandybridge" "nehalem" "znver3" "znver2" "znver1" ) @@ -162,6 +163,10 @@ if(NOT AUTO_DETECT_ARCH) add_arch_flag("-march=skylake" SKYLAKE ENABLE_SKYLAKE) endif() + if(ENABLE_ICELAKE) + add_arch_flag("-march=icelake-server" ICELAKE ENABLE_ICELAKE) + endif() + if(ENABLE_BROADWELL) add_arch_flag("-march=broadwell" BROADWELL ENABLE_BROADWELL) endif() diff --git a/src/core/metric/quantized_integer_metric.cc b/src/core/metric/quantized_integer_metric.cc index 56e95634b..43a534059 100644 --- a/src/core/metric/quantized_integer_metric.cc +++ b/src/core/metric/quantized_integer_metric.cc @@ -148,7 +148,7 @@ class QuantizedIntegerMetric : public IndexMetric { if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { return reinterpret_cast( BaseDistanceBatchWithScoreUnquantized::ComputeBatch); + 2, 2>::ComputeBatch); } if (meta_.data_type() == IndexMeta::DataType::DT_INT4) { return reinterpret_cast( @@ -266,6 +266,10 @@ class QuantizedIntegerMetric : public IndexMetric { meta_.data_type() == IndexMeta::DataType::DT_INT8) { return CosineMinusInnerProductDistanceBatchWithScoreUnquantized< int8_t, 1, 1>::GetQueryPreprocessFunc(); + } else if (origin_metric_type_ == MetricType::kSquaredEuclidean && + meta_.data_type() == IndexMeta::DataType::DT_INT8) { + return SquaredEuclideanDistanceBatchWithScoreUnquantized< + int8_t, 1, 1>::GetQueryPreprocessFunc(); } return nullptr; @@ -320,4 +324,4 @@ class QuantizedIntegerMetric : public IndexMetric { INDEX_FACTORY_REGISTER_METRIC_ALIAS(QuantizedInteger, QuantizedIntegerMetric); } // namespace core -} // namespace zvec \ No newline at end of file +} // namespace zvec diff --git a/src/core/metric/quantized_integer_metric_batch.h b/src/core/metric/quantized_integer_metric_batch.h index ced11f323..9790a7a49 100644 --- a/src/core/metric/quantized_integer_metric_batch.h +++ b/src/core/metric/quantized_integer_metric_batch.h @@ -139,7 +139,7 @@ struct MinusInnerProductDistanceBatchWithScoreUnquantized(m_tail)[3]; + int int_sum = reinterpret_cast(m_tail)[4]; result -= 128 * int_sum; } result = -(ma * qa * result + mb * qa * qs + qb * ma * ms + @@ -197,7 +197,7 @@ struct SquaredEuclideanDistanceBatchWithScoreUnquantized; static void ComputeBatch(const int8_t **vecs, const int8_t *query, size_t num_vecs, size_t dim, float *results) { - const size_t original_dim = dim - 16; + const size_t original_dim = dim - 20; ailego::DistanceBatch::InnerProductDistanceBatch< int8_t, BatchSize, PrefetchStep>::ComputeBatch(vecs, query, num_vecs, original_dim, results); @@ -218,10 +218,14 @@ struct SquaredEuclideanDistanceBatchWithScoreUnquantized(m_tail)[4]; + result -= 128 * int8_sum; + } + result = ma * ma * ms2 + sum2 - 2 * ma * qa * result + (mb - qb) * (mb - qb) * original_dim + 2 * (mb - qb) * (ms * ma - sum); - ++results; } } @@ -231,7 +235,9 @@ struct SquaredEuclideanDistanceBatchWithScoreUnquantized; static void ComputeBatch(const int8_t **vecs, const int8_t *query, size_t num_vecs, size_t dim, float *results) { - const size_t original_dim = dim - 16; + const size_t original_dim = dim - 20; ailego::DistanceBatch::InnerProductDistanceBatch< int8_t, BatchSize, PrefetchStep>::ComputeBatch(vecs, query, num_vecs, original_dim, results); @@ -315,7 +321,7 @@ struct MipsSquaredEuclideanDistanceBatchWithScoreUnquantized struct SquaredEuclidean { static void Compute(const int8_t *m, const int8_t *q, size_t dim, float *out) { - const size_t d = dim - 16; + const size_t d = dim - 20; ailego::InnerProductMatrix::Compute(m, q, d, out); for (size_t i = 0; i < N; ++i) { @@ -141,7 +141,7 @@ template struct MinusInnerProduct { static void Compute(const int8_t *m, const int8_t *q, size_t dim, float *out) { - const size_t origin_dim = dim - 16; + const size_t origin_dim = dim - 20; MinusInnerProductImplInt8(m, q, origin_dim, out); } }; @@ -195,7 +195,7 @@ template struct MipsSquaredEuclidean { static void Compute(const int8_t *m, const int8_t *q, size_t dim, float *out) { - const size_t d = dim - 16; + const size_t d = dim - 20; ailego::InnerProductMatrix::Compute(m, q, d, out); for (size_t i = 0; i < N; ++i) { @@ -251,4 +251,4 @@ struct MipsSquaredEuclidean { } }; -} // namespace zvec::core \ No newline at end of file +} // namespace zvec::core diff --git a/src/core/quantizer/cosine_converter.cc b/src/core/quantizer/cosine_converter.cc index dda76b01c..dd5cbbd0d 100644 --- a/src/core/quantizer/cosine_converter.cc +++ b/src/core/quantizer/cosine_converter.cc @@ -206,7 +206,7 @@ class CosineConverterHolder : public IndexHolder { if (type == IndexMeta::DataType::DT_INT4) return 40; // 5 * sizeof(float) / sizeof(FT_INT4) else if (type == IndexMeta::DataType::DT_INT8) - return 20; // 5 * sizeof(float) / sizeof(FT_INT8) + return 24; // (5 * sizeof(float) + sizeof(int)) / sizeof(FT_INT8) else if (type == IndexMeta::DataType::DT_FP16) return 2; // 2* sizeof(float) / sizeof(FT_FP16) else if (type == IndexMeta::DataType::DT_FP32) { @@ -362,7 +362,7 @@ class CosineConverter : public IndexConverter { if (type == IndexMeta::DataType::DT_INT4) return 40; // 5 * sizeof(float) / sizeof(FT_INT4) else if (type == IndexMeta::DataType::DT_INT8) - return 20; // 5 * sizeof(float) / sizeof(FT_INT8) + return 24; // (5 * sizeof(float) + sizeof(int)) / sizeof(FT_INT8) else if (type == IndexMeta::DataType::DT_FP16) return 2; // sizeof(float) / sizeof(FT_FP16) else if (type == IndexMeta::DataType::DT_FP32) { @@ -402,4 +402,4 @@ INDEX_FACTORY_REGISTER_CONVERTER_ALIAS(CosineHalfFloatConverter, IndexMeta::DataType::DT_FP16); } // namespace core -} // namespace zvec \ No newline at end of file +} // namespace zvec diff --git a/src/core/quantizer/cosine_reformer.cc b/src/core/quantizer/cosine_reformer.cc index 5823728dd..d6080b8d9 100644 --- a/src/core/quantizer/cosine_reformer.cc +++ b/src/core/quantizer/cosine_reformer.cc @@ -249,7 +249,7 @@ class CosineReformer : public IndexReformer { if (type == IndexMeta::DataType::DT_INT4) return 40; // 5 * sizeof(float) / sizeof(FT_INT4) else if (type == IndexMeta::DataType::DT_INT8) - return 20; // 5 * sizeof(float) / sizeof(FT_INT8) + return 24; // (5 * sizeof(float) + sizeof(int)) / sizeof(FT_INT8) else if (type == IndexMeta::DataType::DT_FP16) return 2; // sizeof(float) / sizeof(FT_FP16) else if (type == IndexMeta::DataType::DT_FP32) { diff --git a/src/core/quantizer/integer_quantizer_converter.cc b/src/core/quantizer/integer_quantizer_converter.cc index 91757a5d5..9439a1d0f 100644 --- a/src/core/quantizer/integer_quantizer_converter.cc +++ b/src/core/quantizer/integer_quantizer_converter.cc @@ -581,7 +581,9 @@ class IntegerStreamingConverter : public IndexConverter { static size_t ExtraDimension(IndexMeta::DataType type) { // The extra quantized params storage size to save for each vector constexpr size_t kExtraSize = 4 * sizeof(float); - return type == IndexMeta::DataType::DT_INT8 ? kExtraSize : kExtraSize * 2; + constexpr size_t kAdditionalInt32 = sizeof(int32_t); + return type == IndexMeta::DataType::DT_INT8 ? (kExtraSize + kAdditionalInt32) + : (kExtraSize * 2); } //! Members diff --git a/src/core/quantizer/integer_quantizer_reformer.cc b/src/core/quantizer/integer_quantizer_reformer.cc index 9c7410364..4228d0fda 100644 --- a/src/core/quantizer/integer_quantizer_reformer.cc +++ b/src/core/quantizer/integer_quantizer_reformer.cc @@ -279,7 +279,7 @@ class IntegerStreamingReformer : public IndexReformer { //! Constructor IntegerStreamingReformer(IndexMeta::DataType dst_type) : data_type_(dst_type), - extra_dimension_(data_type_ == IndexMeta::DataType::DT_INT8 ? 16 : 32) { + extra_dimension_(data_type_ == IndexMeta::DataType::DT_INT8 ? 20 : 32) { } //! Initialize Reformer diff --git a/src/core/quantizer/record_quantizer.h b/src/core/quantizer/record_quantizer.h index 06744f692..b1095a2ae 100644 --- a/src/core/quantizer/record_quantizer.h +++ b/src/core/quantizer/record_quantizer.h @@ -74,10 +74,16 @@ class RecordQuantizer { extras[0] = 1.0f / scale; extras[1] = -bias / scale; extras[2] = sum; - if (is_euclidean) { + + if (type == IndexMeta::DataType::DT_INT8) { extras[3] = squared_sum; + reinterpret_cast(extras + 4)[0] = int8_sum; } else { - reinterpret_cast(extras)[3] = int8_sum; + if (is_euclidean) { + extras[3] = squared_sum; + } else { + reinterpret_cast(extras)[3] = int8_sum; + } } } } @@ -128,4 +134,4 @@ class RecordQuantizer { }; } // namespace core -} // namespace zvec \ No newline at end of file +} // namespace zvec From 716dc23d116a918e825fd3cbe320951907c3faa6 Mon Sep 17 00:00:00 2001 From: luoxiaojian Date: Wed, 11 Mar 2026 15:01:05 +0800 Subject: [PATCH 3/5] fix for ip and cos --- src/core/metric/quantized_integer_metric_batch.h | 12 ++++++------ src/core/metric/quantized_integer_metric_matrix.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/core/metric/quantized_integer_metric_batch.h b/src/core/metric/quantized_integer_metric_batch.h index 9790a7a49..17917c346 100644 --- a/src/core/metric/quantized_integer_metric_batch.h +++ b/src/core/metric/quantized_integer_metric_batch.h @@ -56,7 +56,7 @@ struct BaseDistanceBatchWithScoreUnquantized { ValueType, BatchSize, PrefetchStep>::ComputeBatch(m, q, num, dim, out); } else if constexpr (std::is_same_v, - SquaredEuclidean>) { + SquaredEuclidean>) { return SquaredEuclideanDistanceBatchWithScoreUnquantized< ValueType, BatchSize, PrefetchStep>::ComputeBatch(m, q, num, dim, out); @@ -80,7 +80,7 @@ struct CosineMinusInnerProductDistanceBatchWithScoreUnquantized< static inline void ComputeBatch(const int8_t **vecs, const int8_t *query, size_t num_vecs, size_t dim, float *results) { - size_t original_dim = dim - 20; + size_t original_dim = dim - 24; ImplType::ComputeBatch(vecs, query, num_vecs, original_dim, results); } @@ -92,7 +92,7 @@ struct CosineMinusInnerProductDistanceBatchWithScoreUnquantized< static void QueryPreprocess(void *query, size_t dim) { if (auto func = ImplType::GetQueryPreprocessFunc(); func != nullptr) { - return func(query, dim - 20); + return func(query, dim - 24); } } }; @@ -218,14 +218,14 @@ struct SquaredEuclideanDistanceBatchWithScoreUnquantized(m_tail)[4]; result -= 128 * int8_sum; } result = ma * ma * ms2 + sum2 - 2 * ma * qa * result + - (mb - qb) * (mb - qb) * original_dim + - 2 * (mb - qb) * (ms * ma - sum); + (mb - qb) * (mb - qb) * original_dim + + 2 * (mb - qb) * (ms * ma - sum); } } diff --git a/src/core/metric/quantized_integer_metric_matrix.h b/src/core/metric/quantized_integer_metric_matrix.h index abce490fe..387097fcd 100644 --- a/src/core/metric/quantized_integer_metric_matrix.h +++ b/src/core/metric/quantized_integer_metric_matrix.h @@ -168,7 +168,7 @@ template struct CosineMinusInnerProduct { static void Compute(const int8_t *m, const int8_t *q, size_t dim, float *out) { - const size_t origin_dim = dim - 20; + const size_t origin_dim = dim - 24; MinusInnerProductImplInt8(m, q, origin_dim, out); } }; From 81ee36ae30af2ab1daaf9cc3bc274f001ad64c3f Mon Sep 17 00:00:00 2001 From: luoxiaojian Date: Wed, 11 Mar 2026 15:15:57 +0800 Subject: [PATCH 4/5] fixed null check and warnings --- .../inner_product_distance_batch_impl.h | 12 ++++---- .../inner_product_distance_batch_impl_fp16.h | 24 +++++++-------- .../inner_product_distance_batch_impl_int8.h | 30 +++++++++---------- src/core/metric/quantized_integer_metric.cc | 4 +-- .../metric/quantized_integer_metric_batch.h | 12 ++++---- 5 files changed, 42 insertions(+), 40 deletions(-) diff --git a/src/ailego/math_batch/inner_product_distance_batch_impl.h b/src/ailego/math_batch/inner_product_distance_batch_impl.h index d15a747e7..c318ceb6f 100644 --- a/src/ailego/math_batch/inner_product_distance_batch_impl.h +++ b/src/ailego/math_batch/inner_product_distance_batch_impl.h @@ -53,14 +53,14 @@ compute_one_to_many_avx2_fp32( const ValueType *query, const ValueType **ptrs, std::array &prefetch_ptrs, size_t dimensionality, float *results) { - std::array<__m256, dp_batch> accs; + __m256 accs[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { accs[i] = _mm256_setzero_ps(); } size_t dim = 0; for (; dim + 8 <= dimensionality; dim += 8) { __m256 q = _mm256_loadu_ps(query + dim); - std::array<__m256, dp_batch> data_regs; + __m256 data_regs[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { data_regs[i] = _mm256_loadu_ps(ptrs[i] + dim); } @@ -73,13 +73,13 @@ compute_one_to_many_avx2_fp32( accs[i] = _mm256_fnmadd_ps(q, data_regs[i], accs[i]); } } - std::array<__m128, dp_batch> sum128_regs; + __m128 sum128_regs[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { sum128_regs[i] = sum_top_bottom_avx(accs[i]); } if (dim + 4 <= dimensionality) { __m128 q = _mm_loadu_ps(query + dim); - std::array<__m128, dp_batch> data_regs; + __m128 data_regs[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { data_regs[i] = _mm_loadu_ps(ptrs[i] + dim); } @@ -95,7 +95,7 @@ compute_one_to_many_avx2_fp32( } if (dim + 2 <= dimensionality) { __m128 q = _mm_setzero_ps(); - std::array<__m128, dp_batch> data_regs; + __m128 data_regs[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { data_regs[i] = _mm_setzero_ps(); } @@ -126,4 +126,4 @@ compute_one_to_many_avx2_fp32( #endif -} // namespace zvec::ailego::DistanceBatch \ No newline at end of file +} // namespace zvec::ailego::DistanceBatch diff --git a/src/ailego/math_batch/inner_product_distance_batch_impl_fp16.h b/src/ailego/math_batch/inner_product_distance_batch_impl_fp16.h index 183369d7d..3a1e7699a 100644 --- a/src/ailego/math_batch/inner_product_distance_batch_impl_fp16.h +++ b/src/ailego/math_batch/inner_product_distance_batch_impl_fp16.h @@ -30,7 +30,7 @@ compute_one_to_many_avx512fp16_fp16( const ailego::Float16 *query, const ailego::Float16 **ptrs, std::array &prefetch_ptrs, size_t dimensionality, float *results) { - std::array<__m512h, dp_batch> accs; + __m512h accs[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { accs[i] = _mm512_setzero_ph(); @@ -40,7 +40,7 @@ compute_one_to_many_avx512fp16_fp16( for (; dim + 32 <= dimensionality; dim += 32) { __m512h q = _mm512_loadu_ph(query + dim); - std::array<__m512h, dp_batch> data_regs; + __m512h data_regs[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { data_regs[i] = _mm512_loadu_ph(ptrs[i] + dim); } @@ -86,7 +86,7 @@ compute_one_to_many_avx512f_fp16( const ailego::Float16 *query, const ailego::Float16 **ptrs, std::array &prefetch_ptrs, size_t dimensionality, float *results) { - std::array<__m512, dp_batch> accs; + __m512 accs[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { accs[i] = _mm512_setzero_ps(); @@ -100,8 +100,8 @@ compute_one_to_many_avx512f_fp16( __m512 q1 = _mm512_cvtph_ps(_mm512_castsi512_si256(q)); __m512 q2 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(q, 1)); - std::array<__m512, dp_batch> data_regs_1; - std::array<__m512, dp_batch> data_regs_2; + __m512 data_regs_1[dp_batch]; + __m512 data_regs_2[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { __m512i m = _mm512_loadu_si512(reinterpret_cast(ptrs[i] + dim)); @@ -126,7 +126,7 @@ compute_one_to_many_avx512f_fp16( __m512 q = _mm512_cvtph_ps( _mm256_loadu_si256(reinterpret_cast(query + dim))); - std::array<__m512, dp_batch> data_regs; + __m512 data_regs[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { data_regs[i] = _mm512_cvtph_ps( _mm256_loadu_si256(reinterpret_cast(ptrs[i] + dim))); @@ -136,7 +136,7 @@ compute_one_to_many_avx512f_fp16( dim += 16; } - std::array<__m256, dp_batch> acc_new; + __m256 acc_new[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { acc_new[i] = _mm256_add_ps( _mm512_castps512_ps256(accs[i]), @@ -176,7 +176,7 @@ compute_one_to_many_avx2_fp16( const ailego::Float16 *query, const ailego::Float16 **ptrs, std::array &prefetch_ptrs, size_t dimensionality, float *results) { - std::array<__m256, dp_batch> accs; + __m256 accs[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { accs[i] = _mm256_setzero_ps(); @@ -190,8 +190,8 @@ compute_one_to_many_avx2_fp16( __m256 q1 = _mm256_cvtph_ps(_mm256_castsi256_si128(q)); __m256 q2 = _mm256_cvtph_ps(_mm256_extractf128_si256(q, 1)); - std::array<__m256, dp_batch> data_regs_1; - std::array<__m256, dp_batch> data_regs_2; + __m256 data_regs_1[dp_batch]; + __m256 data_regs_2[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { __m256i m = _mm256_loadu_si256(reinterpret_cast(ptrs[i] + dim)); @@ -216,7 +216,7 @@ compute_one_to_many_avx2_fp16( __m256 q = _mm256_cvtph_ps( _mm_loadu_si128(reinterpret_cast(query + dim))); - std::array<__m256, dp_batch> data_regs; + __m256 data_regs[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { data_regs[i] = _mm256_cvtph_ps( _mm_loadu_si128(reinterpret_cast(ptrs[i] + dim))); @@ -240,4 +240,4 @@ compute_one_to_many_avx2_fp16( #endif -} // namespace zvec::ailego::DistanceBatch \ No newline at end of file +} // namespace zvec::ailego::DistanceBatch diff --git a/src/ailego/math_batch/inner_product_distance_batch_impl_int8.h b/src/ailego/math_batch/inner_product_distance_batch_impl_int8.h index 0e2366418..ec7034449 100644 --- a/src/ailego/math_batch/inner_product_distance_batch_impl_int8.h +++ b/src/ailego/math_batch/inner_product_distance_batch_impl_int8.h @@ -55,7 +55,7 @@ static void compute_one_to_many_avx512_vnni_int8( const int8_t *query, const int8_t **ptrs, std::array &prefetch_ptrs, size_t dimensionality, float *results) { - std::array<__m512i, dp_batch> accs; + __m512i accs[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { accs[i] = _mm512_setzero_si512(); } @@ -63,7 +63,7 @@ static void compute_one_to_many_avx512_vnni_int8( for (; dim + 64 <= dimensionality; dim += 64) { __m512i q = _mm512_loadu_si512(reinterpret_cast(query + dim)); - std::array<__m512i, dp_batch> data_regs; + __m512i data_regs[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { data_regs[i] = _mm512_loadu_si512(reinterpret_cast(ptrs[i] + dim)); @@ -100,12 +100,12 @@ static void compute_one_to_many_avx512_vnni_int8( // const int8_t *query, const int8_t **ptrs, // std::array &prefetch_ptrs, size_t // dimensionality, float *results) { -// std::array<__m512i, dp_batch> accs; +// __m512i accs[dp_batch]; // size_t dim = 0; // for (; dim + 64 <= dimensionality; dim += 64) { // __m512i q = // _mm512_loadu_si512(reinterpret_cast(query + dim)); -// std::array<__m512i, dp_batch> data_regs; +// __m512i data_regs[dp_batch]; // for (size_t i = 0; i < dp_batch; ++i) { // data_regs[i] = // _mm512_loadu_si512(reinterpret_cast(ptrs[i] + @@ -118,16 +118,16 @@ static void compute_one_to_many_avx512_vnni_int8( // } // __m512i q_lo = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(q, 0)); // __m512i q_hi = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(q, 1)); -// std::array<__m512i, dp_batch> data_lo; -// std::array<__m512i, dp_batch> data_hi; +// __m512i data_lo[dp_batch]; +// __m512i data_hi[dp_batch]; // for (size_t i = 0; i < dp_batch; ++i) { // data_lo[i] = // _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(data_regs[i], 0)); // data_hi[i] = // _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(data_regs[i], 1)); // } -// std::array<__m512i, dp_batch> prod_lo; -// std::array<__m512i, dp_batch> prod_hi; +// __m512i prod_lo[dp_batch]; +// __m512i prod_hi[dp_batch]; // for (size_t i = 0; i < dp_batch; ++i) { // prod_lo[i] = _mm512_madd_epi16(q_lo, data_lo[i]); // prod_hi[i] = _mm512_madd_epi16(q_hi, data_hi[i]); @@ -163,14 +163,14 @@ compute_one_to_many_avx2_int8( const int8_t *query, const int8_t **ptrs, std::array &prefetch_ptrs, size_t dimensionality, float *results) { - std::array<__m256i, dp_batch> accs; + __m256i accs[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { accs[i] = _mm256_setzero_si256(); } size_t dim = 0; for (; dim + 32 <= dimensionality; dim += 32) { __m256i q = _mm256_loadu_si256((const __m256i *)(query + dim)); - std::array<__m256i, dp_batch> data_regs; + __m256i data_regs[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { data_regs[i] = _mm256_loadu_si256((const __m256i *)(ptrs[i] + dim)); } @@ -181,15 +181,15 @@ compute_one_to_many_avx2_int8( } __m256i q_lo = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(q)); __m256i q_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(q, 1)); - std::array<__m256i, dp_batch> data_lo; - std::array<__m256i, dp_batch> data_hi; + __m256i data_lo[dp_batch]; + __m256i data_hi[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { data_lo[i] = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(data_regs[i])); data_hi[i] = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(data_regs[i], 1)); } - std::array<__m256i, dp_batch> prod_lo; - std::array<__m256i, dp_batch> prod_hi; + __m256i prod_lo[dp_batch]; + __m256i prod_hi[dp_batch]; for (size_t i = 0; i < dp_batch; ++i) { prod_lo[i] = _mm256_madd_epi16(q_lo, data_lo[i]); prod_hi[i] = _mm256_madd_epi16(q_hi, data_hi[i]); @@ -222,4 +222,4 @@ compute_one_to_many_avx2_int8( #endif -} // namespace zvec::ailego::DistanceBatch \ No newline at end of file +} // namespace zvec::ailego::DistanceBatch diff --git a/src/core/metric/quantized_integer_metric.cc b/src/core/metric/quantized_integer_metric.cc index 43a534059..2b4e757a2 100644 --- a/src/core/metric/quantized_integer_metric.cc +++ b/src/core/metric/quantized_integer_metric.cc @@ -148,7 +148,7 @@ class QuantizedIntegerMetric : public IndexMetric { if (meta_.data_type() == IndexMeta::DataType::DT_INT8) { return reinterpret_cast( BaseDistanceBatchWithScoreUnquantized::ComputeBatch); + 12, 2>::ComputeBatch); } if (meta_.data_type() == IndexMeta::DataType::DT_INT4) { return reinterpret_cast( @@ -267,7 +267,7 @@ class QuantizedIntegerMetric : public IndexMetric { return CosineMinusInnerProductDistanceBatchWithScoreUnquantized< int8_t, 1, 1>::GetQueryPreprocessFunc(); } else if (origin_metric_type_ == MetricType::kSquaredEuclidean && - meta_.data_type() == IndexMeta::DataType::DT_INT8) { + meta_.data_type() == IndexMeta::DataType::DT_INT8) { return SquaredEuclideanDistanceBatchWithScoreUnquantized< int8_t, 1, 1>::GetQueryPreprocessFunc(); } diff --git a/src/core/metric/quantized_integer_metric_batch.h b/src/core/metric/quantized_integer_metric_batch.h index 17917c346..33bbfa924 100644 --- a/src/core/metric/quantized_integer_metric_batch.h +++ b/src/core/metric/quantized_integer_metric_batch.h @@ -211,7 +211,7 @@ struct SquaredEuclideanDistanceBatchWithScoreUnquantized( reinterpret_cast(vecs[i]) + original_dim); float ma = m_tail[0]; @@ -262,7 +262,7 @@ struct SquaredEuclideanDistanceBatchWithScoreUnquantized( reinterpret_cast(vecs[i]) + original_dim_in_uint8_array); @@ -306,7 +306,7 @@ struct MipsSquaredEuclideanDistanceBatchWithScoreUnquantized( reinterpret_cast(vecs[i]) + original_dim); float ma = m_tail[0]; @@ -321,7 +321,9 @@ struct MipsSquaredEuclideanDistanceBatchWithScoreUnquantized( reinterpret_cast(vecs[i]) + original_dim_in_uint8_array); From 6a75f7b4d38513a2f4dfd252833c9b058e73add2 Mon Sep 17 00:00:00 2001 From: luoxiaojian Date: Wed, 11 Mar 2026 20:12:11 +0800 Subject: [PATCH 5/5] fix order --- cmake/option.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/option.cmake b/cmake/option.cmake index f9c531702..e93dff4d8 100644 --- a/cmake/option.cmake +++ b/cmake/option.cmake @@ -155,6 +155,10 @@ if(NOT AUTO_DETECT_ARCH) add_arch_flag("-march=sapphirerapids" SAPPHIRERAPIDS ENABLE_SAPPHIRERAPIDS) endif() + if(ENABLE_ICELAKE) + add_arch_flag("-march=icelake-server" ICELAKE ENABLE_ICELAKE) + endif() + if(ENABLE_SKYLAKE_AVX512) add_arch_flag("-march=skylake-avx512" SKYLAKE_AVX512 ENABLE_SKYLAKE_AVX512) endif() @@ -163,10 +167,6 @@ if(NOT AUTO_DETECT_ARCH) add_arch_flag("-march=skylake" SKYLAKE ENABLE_SKYLAKE) endif() - if(ENABLE_ICELAKE) - add_arch_flag("-march=icelake-server" ICELAKE ENABLE_ICELAKE) - endif() - if(ENABLE_BROADWELL) add_arch_flag("-march=broadwell" BROADWELL ENABLE_BROADWELL) endif()