12 changes: 6 additions & 6 deletions kernels/optimized/cpu/op_native_layer_norm.cpp
@@ -76,18 +76,18 @@ void layer_norm(
const CTYPE* src_ptr = input_data + i * N;
CTYPE* dst_ptr = out_data + i * N;

- CTYPE mean_val;
- CTYPE rstd_val;
+ acc_t<CTYPE> mean_val;
+ acc_t<CTYPE> rstd_val;
std::tie(mean_val, rstd_val) = RowwiseMoments(src_ptr, N);
rstd_val = CTYPE(1) / std::sqrt(rstd_val + eps);

- const CTYPE scale = rstd_val;
- const CTYPE offset = -rstd_val * mean_val;
+ const acc_t<CTYPE> scale = rstd_val;
+ const acc_t<CTYPE> offset = -rstd_val * mean_val;

if (gamma_null || beta_null) {
for (size_t j = 0; j < N; ++j) {
- const CTYPE gamma_v = gamma_null ? CTYPE(1) : gamma_data[j];
- const CTYPE beta_v = beta_null ? CTYPE(0) : beta_data[j];
+ const acc_t<CTYPE> gamma_v = gamma_null ? CTYPE(1) : gamma_data[j];
+ const acc_t<CTYPE> beta_v = beta_null ? CTYPE(0) : beta_data[j];
dst_ptr[j] = (src_ptr[j] * scale + offset) * gamma_v + beta_v;
}
} else {
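The effect of the change above: when CTYPE is Half or BFloat16, the row statistics, the scale/offset coefficients, and the per-element gamma/beta values are now held in acc_t<CTYPE> (a float) and only the final store narrows back to CTYPE. Below is a minimal standalone sketch of that pattern, with acc_type as an illustrative stand-in for the kernel's acc_t trait and an explicit two-pass loop in place of RowwiseMoments.

```cpp
// Sketch only, not the ExecuTorch kernel: accumulate layer-norm statistics
// in a wider type so a Half/BFloat16 CTYPE neither loses precision nor
// overflows in the intermediate math.
#include <cmath>
#include <cstddef>
#include <type_traits>

// Illustrative stand-in for acc_t<CTYPE>: float unless CTYPE is already wider.
template <typename CTYPE>
using acc_type =
    std::conditional_t<std::is_same_v<CTYPE, double>, double, float>;

template <typename CTYPE>
void normalize_row(const CTYPE* src, CTYPE* dst, std::size_t N, float eps) {
  using ACC = acc_type<CTYPE>;

  ACC mean = 0;
  for (std::size_t j = 0; j < N; ++j) {
    mean += static_cast<ACC>(src[j]);
  }
  mean /= static_cast<ACC>(N);

  ACC var = 0;
  for (std::size_t j = 0; j < N; ++j) {
    const ACC d = static_cast<ACC>(src[j]) - mean;
    var += d * d;
  }
  var /= static_cast<ACC>(N);

  const ACC rstd = ACC(1) / std::sqrt(var + static_cast<ACC>(eps));
  const ACC scale = rstd;
  const ACC offset = -rstd * mean;
  for (std::size_t j = 0; j < N; ++j) {
    // Only the final store narrows back to CTYPE.
    dst[j] = static_cast<CTYPE>(static_cast<ACC>(src[j]) * scale + offset);
  }
}
```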
1 change: 1 addition & 0 deletions kernels/optimized/test/targets.bzl
@@ -25,6 +25,7 @@ def _lib_test_bin(name, extra_deps = [], in_cpu = False):
deps = [
"//executorch/test/utils:utils",
"//executorch/kernels/optimized{}:{}".format(cpu_path, lib_root),
"//executorch/runtime/core/portable_type:scalar_type",
] + extra_deps,
preprocessor_flags = get_vec_preprocessor_flags() + get_vec_cxx_preprocessor_flags(),
)
11 changes: 11 additions & 0 deletions kernels/optimized/utils/math_utils.h
@@ -11,6 +11,8 @@
#include <cstdint>

#include <executorch/kernels/optimized/utils/llvmMathExtras.h>
#include <executorch/runtime/core/portable_type/bfloat16.h>
#include <executorch/runtime/core/portable_type/half.h>

namespace executorch {
namespace utils {
@@ -37,6 +39,15 @@
template <>
struct ComputeDTypeTraits<int8_t> {
using type = int32_t;
};
// For 16 bit float types, ops should perform internal math in float32.
Member Author comment: This matches the ATen convention from OpMathType.h.

template <>
struct ComputeDTypeTraits<c10::BFloat16> {
using type = float;
};
template <>
struct ComputeDTypeTraits<c10::Half> {
using type = float;
};

template <typename T>
using compute_dtype = typename ComputeDTypeTraits<T>::type;
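As a quick compile-time illustration (a sketch, assuming the header paths used elsewhere in this PR), the new specializations make the 16-bit float types accumulate in float32, matching ATen's OpMathType convention, while the existing integer mappings are unchanged:

```cpp
#include <cstdint>
#include <type_traits>

#include <executorch/kernels/optimized/utils/math_utils.h>

// Half and BFloat16 now resolve to float; int8 keeps its int32 accumulator.
static_assert(
    std::is_same_v<executorch::utils::compute_dtype<c10::Half>, float>);
static_assert(
    std::is_same_v<executorch::utils::compute_dtype<c10::BFloat16>, float>);
static_assert(
    std::is_same_v<executorch::utils::compute_dtype<int8_t>, int32_t>);
```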
4 changes: 2 additions & 2 deletions kernels/portable/cpu/op_native_group_norm.cpp
@@ -77,8 +77,8 @@ void group_norm(
const CTYPE* x = input_data + i * inner_size;

// compute E[X] and Var[x] = E[x^2] - E[x]^2
- CTYPE sum = reduce_add(x, static_cast<CTYPE>(inner_size));
- CTYPE sq_sum = vec_powerf(x, static_cast<CTYPE>(inner_size));
+ float sum = reduce_add(x, inner_size);
@GregoryComer (Member Author), Jan 21, 2026:
Note: We should ideally use float64 if CTYPE is double, but we're already forcing float32 in the various utility functions - reduce_add and vec_powerf return floats and use floats internally. I'm just removing the implicit cast to CTYPE on the return value.
I'm inclined to leave this for a follow-up on an as-needed basis.

+ float sq_sum = vec_powerf(x, inner_size);
double mean_value =
static_cast<double>(sum) / static_cast<double>(inner_size);
double variance =
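A concrete illustration with the large-value group-norm test case added later in this PR: one group holds the six inputs 0, 1000, ..., 5000, so the accumulated statistics are

\[
\sum_j x_j = 15\,000, \qquad \sum_j x_j^2 = 5.5 \times 10^{7},
\]

and 5.5 x 10^7 is far beyond Half's largest finite value of 65504. With CTYPE = Half, the old declaration narrowed the float returned by vec_powerf back to Half, turning sq_sum into inf (the companion change in vec_ops.h below addresses the same overflow inside the loop itself). Keeping the accumulators as float sidesteps both.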
10 changes: 5 additions & 5 deletions kernels/portable/cpu/op_native_layer_norm.cpp
@@ -73,11 +73,11 @@ void layer_norm(
CTYPE* y = out_data + i * normalized;

// compute E[X] and Var[x] = E[x^2] - E[x]^2
- CTYPE sum = reduce_add(x, ct_normalized);
- CTYPE sq_sum = vec_powerf(x, ct_normalized);
- CTYPE mean_value = sum / ct_normalized;
- CTYPE variance = sq_sum / ct_normalized - mean_value * mean_value;
- CTYPE std = std::sqrt(variance + eps);
+ float sum = reduce_add(x, ct_normalized);
+ float sq_sum = vec_powerf(x, ct_normalized);
+ float mean_value = sum / ct_normalized;
+ float variance = sq_sum / ct_normalized - mean_value * mean_value;
+ float std = std::sqrt(variance + eps);

// Calculate the elements of output
for (const auto j : c10::irange(normalized)) {
2 changes: 1 addition & 1 deletion kernels/portable/cpu/vec_ops.h
@@ -179,7 +179,7 @@ template <typename T>
inline float vec_powerf(const T* x, size_t size) {
float sum = 0;
for (const auto i : c10::irange(size)) {
- sum += x[i] * x[i];
+ sum += static_cast<float>(x[i]) * x[i];
}
return sum;
}
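The cast matters even though the accumulator is already a float: for a 16-bit T, the product x[i] * x[i] is otherwise formed in T and rounded back to T before the add, so inputs around 1000 overflow on the spot. Below is a sketch of both behaviors, assuming c10::Half as provided by the portable_type header this PR adds to math_utils.h.

```cpp
#include <cstddef>

#include <executorch/runtime/core/portable_type/half.h>

float sum_of_squares_overflowing(const c10::Half* x, std::size_t size) {
  float sum = 0;
  for (std::size_t i = 0; i < size; ++i) {
    // Half * Half is rounded back to Half before the add, so 1000 * 1000
    // already becomes inf here even though `sum` is a float.
    sum += x[i] * x[i];
  }
  return sum;
}

float sum_of_squares_fixed(const c10::Half* x, std::size_t size) {
  float sum = 0;
  for (std::size_t i = 0; i < size; ++i) {
    // Promoting one operand first makes the multiply happen in float32,
    // where 1000 * 1000 == 1e6 is representable.
    sum += static_cast<float>(x[i]) * x[i];
  }
  return sum;
}
```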
36 changes: 36 additions & 0 deletions kernels/test/op_native_group_norm_test.cpp
@@ -243,6 +243,42 @@ class OpNativeGroupNormTest : public OperatorTest {
0.38038814,
0.75809801}, // expected_rstd_data
},
{
{1, 4, 3}, // sizes
{0.0,
1000.0,
2000.0,
3000.0,
4000.0,
5000.0,
6000.0,
7000.0,
8000.0,
9000.0,
10000.0,
11000.0}, // input_data
{1.0, 1.0, 1.0, 1.0}, // weight_data
{0.0, 0.0, 0.0, 0.0}, // bias_data
1, // N
4, // C
3, // HxW
2, // group
1e-5, // eps
{-1.46385,
-0.87831,
-0.29277,
0.29277,
0.87831,
1.46385,
-1.46385,
-0.87831,
-0.29277,
0.29277,
0.87831,
1.46385}, // expected_data
{2500.0, 8500.0}, // expected_mean_data
{0.00058554, 0.00058554}, // expected_rstd_data
},
};

run_test_cases(test_cases);
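A quick sanity check of the expected statistics in the new test case (group 0 covers the inputs 0 through 5000; group 1 is the same data shifted by 6000, so it gets mean 8500 and the same rstd):

\[
\mu_0 = 2500, \qquad
\sigma_0^2 = \frac{5.5 \times 10^{7}}{6} - 2500^2 \approx 2.9167 \times 10^{6}, \qquad
\frac{1}{\sqrt{\sigma_0^2 + 10^{-5}}} \approx 5.8554 \times 10^{-4}.
\]

For example, (0 - 2500) x 5.8554e-4 is approximately -1.46385, matching the first expected output value.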
21 changes: 21 additions & 0 deletions kernels/test/op_native_layer_norm_test.cpp
@@ -101,6 +101,12 @@ class OpNativeLayerNormTest : public OperatorTest {
expected,
1e-2,
executorch::runtime::testing::internal::kDefaultBFloat16Atol);
} else if constexpr (DTYPE == ScalarType::Half) {
EXPECT_TENSOR_CLOSE_WITH_TOL(
out0,
expected,
1e-3,
executorch::runtime::testing::internal::kDefaultHalfAtol);
} else {
EXPECT_TENSOR_CLOSE(out0, expected);
}
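The relative tolerances here roughly track each type's machine epsilon, which is one way to read the 1e-3 chosen for Half alongside the existing 1e-2 for BFloat16:

\[
\varepsilon_{\text{half}} = 2^{-10} \approx 9.8 \times 10^{-4}, \qquad
\varepsilon_{\text{bfloat16}} = 2^{-7} \approx 7.8 \times 10^{-3}.
\]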
@@ -235,6 +241,21 @@
1.38873,
-0.46291}, // expected_data
},
{
std::string(__func__) + ": Large variance",
{1, 2, 3}, // sizes
{0.0, 1000.0, 2000.0, 3000.0, 4000.0, 5000.0}, // input_data
{1, 2, 3}, // normalized shape
{1.0, 1.0, 1.0, 1.0, 1.0, 1.0}, // weights
{0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, // bias
1.0e-5, // eps
{-1.46385,
-0.87831,
-0.29277,
0.29277,
0.87831,
1.46385}, // expected_data
},
};

run_test_cases(test_cases);