Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions cmake/option.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ option(ENABLE_HASWELL "Enable Intel Haswell CPU microarchitecture" OFF)
option(ENABLE_BROADWELL "Enable Intel Broadwell CPU microarchitecture" OFF)
option(ENABLE_SKYLAKE "Enable Intel Skylake CPU microarchitecture" OFF)
option(ENABLE_SKYLAKE_AVX512 "Enable Intel Skylake Server CPU microarchitecture" OFF)
option(ENABLE_ICELAKE "Enable Intel Icelake CPU microarchitecture" OFF)
option(ENABLE_SAPPHIRERAPIDS "Enable Intel Sapphire Rapids Server CPU microarchitecture" OFF)
option(ENABLE_EMERALDRAPIDS "Enable Intel Emerald Rapids Server CPU microarchitecture" OFF)
option(ENABLE_GRANITERAPIDS "Enable Intel Granite Rapids Server CPU microarchitecture" OFF)
Expand All @@ -34,8 +35,8 @@ option(ENABLE_OPENMP "Enable OpenMP support" OFF)

set(ARCH_OPTIONS
ENABLE_NEHALEM ENABLE_SANDYBRIDGE ENABLE_HASWELL ENABLE_BROADWELL ENABLE_SKYLAKE
ENABLE_SKYLAKE_AVX512 ENABLE_SAPPHIRERAPIDS ENABLE_EMERALDRAPIDS ENABLE_GRANITERAPIDS
ENABLE_ZEN1 ENABLE_ZEN2 ENABLE_ZEN3
ENABLE_SKYLAKE_AVX512 ENABLE_ICELAKE ENABLE_SAPPHIRERAPIDS ENABLE_EMERALDRAPIDS
ENABLE_GRANITERAPIDS ENABLE_ZEN1 ENABLE_ZEN2 ENABLE_ZEN3
ENABLE_ARMV8A ENABLE_ARMV8.1A ENABLE_ARMV8.2A ENABLE_ARMV8.3A ENABLE_ARMV8.4A
ENABLE_ARMV8.5A ENABLE_ARMV8.6A
ENABLE_NATIVE
Expand Down Expand Up @@ -94,7 +95,7 @@ endfunction()
function(_detect_x86_best)
set(_x86_flags
"graniterapids" "emeraldrapids" "sapphirerapids"
"skylake-avx512" "skylake"
"icelake-server" "skylake-avx512" "skylake"
"broadwell" "haswell" "sandybridge" "nehalem"
"znver3" "znver2" "znver1"
)
Expand Down Expand Up @@ -154,6 +155,10 @@ if(NOT AUTO_DETECT_ARCH)
add_arch_flag("-march=sapphirerapids" SAPPHIRERAPIDS ENABLE_SAPPHIRERAPIDS)
endif()

if(ENABLE_ICELAKE)
add_arch_flag("-march=icelake-server" ICELAKE ENABLE_ICELAKE)
endif()

if(ENABLE_SKYLAKE_AVX512)
add_arch_flag("-march=skylake-avx512" SKYLAKE_AVX512 ENABLE_SKYLAKE_AVX512)
endif()
Expand Down
12 changes: 6 additions & 6 deletions src/ailego/math_batch/inner_product_distance_batch_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,14 @@ compute_one_to_many_avx2_fp32(
const ValueType *query, const ValueType **ptrs,
std::array<const ValueType *, dp_batch> &prefetch_ptrs,
size_t dimensionality, float *results) {
std::array<__m256, dp_batch> accs;
__m256 accs[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
accs[i] = _mm256_setzero_ps();
}
size_t dim = 0;
for (; dim + 8 <= dimensionality; dim += 8) {
__m256 q = _mm256_loadu_ps(query + dim);
std::array<__m256, dp_batch> data_regs;
__m256 data_regs[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
data_regs[i] = _mm256_loadu_ps(ptrs[i] + dim);
}
Expand All @@ -73,13 +73,13 @@ compute_one_to_many_avx2_fp32(
accs[i] = _mm256_fnmadd_ps(q, data_regs[i], accs[i]);
}
}
std::array<__m128, dp_batch> sum128_regs;
__m128 sum128_regs[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
sum128_regs[i] = sum_top_bottom_avx(accs[i]);
}
if (dim + 4 <= dimensionality) {
__m128 q = _mm_loadu_ps(query + dim);
std::array<__m128, dp_batch> data_regs;
__m128 data_regs[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
data_regs[i] = _mm_loadu_ps(ptrs[i] + dim);
}
Expand All @@ -95,7 +95,7 @@ compute_one_to_many_avx2_fp32(
}
if (dim + 2 <= dimensionality) {
__m128 q = _mm_setzero_ps();
std::array<__m128, dp_batch> data_regs;
__m128 data_regs[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
data_regs[i] = _mm_setzero_ps();
}
Expand Down Expand Up @@ -126,4 +126,4 @@ compute_one_to_many_avx2_fp32(
#endif


} // namespace zvec::ailego::DistanceBatch
} // namespace zvec::ailego::DistanceBatch
24 changes: 12 additions & 12 deletions src/ailego/math_batch/inner_product_distance_batch_impl_fp16.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ compute_one_to_many_avx512fp16_fp16(
const ailego::Float16 *query, const ailego::Float16 **ptrs,
std::array<const ailego::Float16 *, dp_batch> &prefetch_ptrs,
size_t dimensionality, float *results) {
std::array<__m512h, dp_batch> accs;
__m512h accs[dp_batch];

for (size_t i = 0; i < dp_batch; ++i) {
accs[i] = _mm512_setzero_ph();
Expand All @@ -40,7 +40,7 @@ compute_one_to_many_avx512fp16_fp16(
for (; dim + 32 <= dimensionality; dim += 32) {
__m512h q = _mm512_loadu_ph(query + dim);

std::array<__m512h, dp_batch> data_regs;
__m512h data_regs[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
data_regs[i] = _mm512_loadu_ph(ptrs[i] + dim);
}
Expand Down Expand Up @@ -86,7 +86,7 @@ compute_one_to_many_avx512f_fp16(
const ailego::Float16 *query, const ailego::Float16 **ptrs,
std::array<const ailego::Float16 *, dp_batch> &prefetch_ptrs,
size_t dimensionality, float *results) {
std::array<__m512, dp_batch> accs;
__m512 accs[dp_batch];

for (size_t i = 0; i < dp_batch; ++i) {
accs[i] = _mm512_setzero_ps();
Expand All @@ -100,8 +100,8 @@ compute_one_to_many_avx512f_fp16(
__m512 q1 = _mm512_cvtph_ps(_mm512_castsi512_si256(q));
__m512 q2 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(q, 1));

std::array<__m512, dp_batch> data_regs_1;
std::array<__m512, dp_batch> data_regs_2;
__m512 data_regs_1[dp_batch];
__m512 data_regs_2[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
__m512i m =
_mm512_loadu_si512(reinterpret_cast<const __m512i *>(ptrs[i] + dim));
Expand All @@ -126,7 +126,7 @@ compute_one_to_many_avx512f_fp16(
__m512 q = _mm512_cvtph_ps(
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(query + dim)));

std::array<__m512, dp_batch> data_regs;
__m512 data_regs[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
data_regs[i] = _mm512_cvtph_ps(
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptrs[i] + dim)));
Expand All @@ -136,7 +136,7 @@ compute_one_to_many_avx512f_fp16(
dim += 16;
}

std::array<__m256, dp_batch> acc_new;
__m256 acc_new[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
acc_new[i] = _mm256_add_ps(
_mm512_castps512_ps256(accs[i]),
Expand Down Expand Up @@ -176,7 +176,7 @@ compute_one_to_many_avx2_fp16(
const ailego::Float16 *query, const ailego::Float16 **ptrs,
std::array<const ailego::Float16 *, dp_batch> &prefetch_ptrs,
size_t dimensionality, float *results) {
std::array<__m256, dp_batch> accs;
__m256 accs[dp_batch];

for (size_t i = 0; i < dp_batch; ++i) {
accs[i] = _mm256_setzero_ps();
Expand All @@ -190,8 +190,8 @@ compute_one_to_many_avx2_fp16(
__m256 q1 = _mm256_cvtph_ps(_mm256_castsi256_si128(q));
__m256 q2 = _mm256_cvtph_ps(_mm256_extractf128_si256(q, 1));

std::array<__m256, dp_batch> data_regs_1;
std::array<__m256, dp_batch> data_regs_2;
__m256 data_regs_1[dp_batch];
__m256 data_regs_2[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
__m256i m =
_mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptrs[i] + dim));
Expand All @@ -216,7 +216,7 @@ compute_one_to_many_avx2_fp16(
__m256 q = _mm256_cvtph_ps(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(query + dim)));

std::array<__m256, dp_batch> data_regs;
__m256 data_regs[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
data_regs[i] = _mm256_cvtph_ps(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(ptrs[i] + dim)));
Expand All @@ -240,4 +240,4 @@ compute_one_to_many_avx2_fp16(
#endif


} // namespace zvec::ailego::DistanceBatch
} // namespace zvec::ailego::DistanceBatch
30 changes: 15 additions & 15 deletions src/ailego/math_batch/inner_product_distance_batch_impl_int8.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,15 @@ static void compute_one_to_many_avx512_vnni_int8(
const int8_t *query, const int8_t **ptrs,
std::array<const int8_t *, dp_batch> &prefetch_ptrs, size_t dimensionality,
float *results) {
std::array<__m512i, dp_batch> accs;
__m512i accs[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
accs[i] = _mm512_setzero_si512();
}
size_t dim = 0;
for (; dim + 64 <= dimensionality; dim += 64) {
__m512i q =
_mm512_loadu_si512(reinterpret_cast<const __m512i *>(query + dim));
std::array<__m512i, dp_batch> data_regs;
__m512i data_regs[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
data_regs[i] =
_mm512_loadu_si512(reinterpret_cast<const __m512i *>(ptrs[i] + dim));
Expand Down Expand Up @@ -100,12 +100,12 @@ static void compute_one_to_many_avx512_vnni_int8(
// const int8_t *query, const int8_t **ptrs,
// std::array<const int8_t *, dp_batch> &prefetch_ptrs, size_t
// dimensionality, float *results) {
// std::array<__m512i, dp_batch> accs;
// __m512i accs[dp_batch];
// size_t dim = 0;
// for (; dim + 64 <= dimensionality; dim += 64) {
// __m512i q =
// _mm512_loadu_si512(reinterpret_cast<const __m512i *>(query + dim));
// std::array<__m512i, dp_batch> data_regs;
// __m512i data_regs[dp_batch];
// for (size_t i = 0; i < dp_batch; ++i) {
// data_regs[i] =
// _mm512_loadu_si512(reinterpret_cast<const __m512i *>(ptrs[i] +
Expand All @@ -118,16 +118,16 @@ static void compute_one_to_many_avx512_vnni_int8(
// }
// __m512i q_lo = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(q, 0));
// __m512i q_hi = _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(q, 1));
// std::array<__m512i, dp_batch> data_lo;
// std::array<__m512i, dp_batch> data_hi;
// __m512i data_lo[dp_batch];
// __m512i data_hi[dp_batch];
// for (size_t i = 0; i < dp_batch; ++i) {
// data_lo[i] =
// _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(data_regs[i], 0));
// data_hi[i] =
// _mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(data_regs[i], 1));
// }
// std::array<__m512i, dp_batch> prod_lo;
// std::array<__m512i, dp_batch> prod_hi;
// __m512i prod_lo[dp_batch];
// __m512i prod_hi[dp_batch];
// for (size_t i = 0; i < dp_batch; ++i) {
// prod_lo[i] = _mm512_madd_epi16(q_lo, data_lo[i]);
// prod_hi[i] = _mm512_madd_epi16(q_hi, data_hi[i]);
Expand Down Expand Up @@ -163,14 +163,14 @@ compute_one_to_many_avx2_int8(
const int8_t *query, const int8_t **ptrs,
std::array<const int8_t *, dp_batch> &prefetch_ptrs, size_t dimensionality,
float *results) {
std::array<__m256i, dp_batch> accs;
__m256i accs[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
accs[i] = _mm256_setzero_si256();
}
size_t dim = 0;
for (; dim + 32 <= dimensionality; dim += 32) {
__m256i q = _mm256_loadu_si256((const __m256i *)(query + dim));
std::array<__m256i, dp_batch> data_regs;
__m256i data_regs[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
data_regs[i] = _mm256_loadu_si256((const __m256i *)(ptrs[i] + dim));
}
Expand All @@ -181,15 +181,15 @@ compute_one_to_many_avx2_int8(
}
__m256i q_lo = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(q));
__m256i q_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(q, 1));
std::array<__m256i, dp_batch> data_lo;
std::array<__m256i, dp_batch> data_hi;
__m256i data_lo[dp_batch];
__m256i data_hi[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
data_lo[i] = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(data_regs[i]));
data_hi[i] =
_mm256_cvtepi8_epi16(_mm256_extracti128_si256(data_regs[i], 1));
}
std::array<__m256i, dp_batch> prod_lo;
std::array<__m256i, dp_batch> prod_hi;
__m256i prod_lo[dp_batch];
__m256i prod_hi[dp_batch];
for (size_t i = 0; i < dp_batch; ++i) {
prod_lo[i] = _mm256_madd_epi16(q_lo, data_lo[i]);
prod_hi[i] = _mm256_madd_epi16(q_hi, data_hi[i]);
Expand Down Expand Up @@ -222,4 +222,4 @@ compute_one_to_many_avx2_int8(
#endif


} // namespace zvec::ailego::DistanceBatch
} // namespace zvec::ailego::DistanceBatch
6 changes: 5 additions & 1 deletion src/core/metric/quantized_integer_metric.cc
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,10 @@ class QuantizedIntegerMetric : public IndexMetric {
meta_.data_type() == IndexMeta::DataType::DT_INT8) {
return CosineMinusInnerProductDistanceBatchWithScoreUnquantized<
int8_t, 1, 1>::GetQueryPreprocessFunc();
} else if (origin_metric_type_ == MetricType::kSquaredEuclidean &&
meta_.data_type() == IndexMeta::DataType::DT_INT8) {
return SquaredEuclideanDistanceBatchWithScoreUnquantized<
int8_t, 1, 1>::GetQueryPreprocessFunc();
}

return nullptr;
Expand Down Expand Up @@ -320,4 +324,4 @@ class QuantizedIntegerMetric : public IndexMetric {
INDEX_FACTORY_REGISTER_METRIC_ALIAS(QuantizedInteger, QuantizedIntegerMetric);

} // namespace core
} // namespace zvec
} // namespace zvec
Loading
Loading