diff --git a/include/core/detail/casting.hpp b/include/core/detail/casting.hpp
index 71168af4..25e5bf6f 100644
--- a/include/core/detail/casting.hpp
+++ b/include/core/detail/casting.hpp
@@ -22,11 +22,53 @@
 #pragma once
 
 #include <algorithm>
+#include <cmath>
 
 #include "core/detail/type_traits.hpp"
 
 namespace roccv::detail {
 
+/**
+ * @brief Rounds v to an integral value with rint, i.e. in the current rounding mode, which is IEEE
+ * half-to-even unless the floating-point environment was changed. The result keeps type U. Device
+ * code calls rintf for float and rint for other types so a double is never narrowed; host code calls
+ * std::rint. Intended to match __float2int_rn on device. Non-floating-point U fails the static_assert.
+ */
+template <typename U>
+__device__ __host__ inline U IEEERound(U v) {
+    static_assert(std::is_floating_point_v<U>, "IEEERound requires a floating-point input");
+#ifdef __HIP_DEVICE_COMPILE__
+    if constexpr (std::is_same_v<U, float>) {
+        return rintf(v);  // single-precision rounding for float inputs
+    } else {
+        return rint(v);  // keeps full precision for non-float inputs such as double
+    }
+#else
+    return std::rint(v);  // host: overload is selected by the argument type
+#endif
+}
+
+/**
+ * @brief Clamps v to [lo, hi]: fmed3f (float) or fmin/fmax on device, std::clamp on host.
+ * NOTE(review): NaN handling may differ between these paths; callers should assume lo <= hi.
+ * @param[in] v The value to clamp.
+ * @param[in] lo,hi The lower and upper bounds of the clamp.
+ * @return The value v clamped to [lo, hi].
+ */
+template <typename U>
+__device__ __host__ inline U FpClamp(U v, U lo, U hi) {
+    static_assert(std::is_floating_point_v<U>, "FpClamp requires a floating-point input");
+#ifdef __HIP_DEVICE_COMPILE__
+    if constexpr (std::is_same_v<U, float>) {
+        return __builtin_amdgcn_fmed3f(v, lo, hi);  // float-only builtin, hence the type check
+    } else {
+        return fmin(fmax(v, lo), hi);  // raise to lo first, then cap at hi
+    }
+#else
+    return std::clamp(v, lo, hi);
+#endif
+}
+
 /**
  * @brief ScalarSaturateCast is for implementation purposes only. Use SaturateCast directly.
  */
@@ -36,36 +78,55 @@ __device__ __host__ T ScalarSaturateCast(U v) {
     constexpr bool bigToSmall = !smallToBig;
 
     if constexpr (std::is_integral_v<T> && std::is_floating_point_v<U>) {
-        // Any float -> any integral
-        return static_cast<T>(std::clamp<U>(std::round(v), static_cast<U>(std::numeric_limits<T>::min()),
-                                            static_cast<U>(std::numeric_limits<T>::max())));
-    } else if constexpr (std::is_integral_v<T> && std::is_integral_v<U> && std::is_signed_v<U> &&
-                         std::is_unsigned_v<T> && smallToBig) {
-        // Any integral signed -> Any integral unsigned, small -> big or equal
-        return v <= 0 ? 0 : static_cast<T>(v);
-    } else if constexpr (std::is_integral_v<U> && std::is_integral_v<T> &&
-                         ((std::is_signed_v<U> && std::is_signed_v<T>) ||
-                          (std::is_unsigned_v<U> && std::is_unsigned_v<T>)) &&
-                         bigToSmall) {
-        // Any integral signed -> Any integral signed, big -> small
-        // Any integral unsigned -> Any integral unsigned, big -> small
-        return v <= std::numeric_limits<T>::min()
-                   ? std::numeric_limits<T>::min()
-                   : (v >= std::numeric_limits<T>::max() ? std::numeric_limits<T>::max() : static_cast<T>(v));
-    } else if constexpr (std::is_integral_v<U> && std::is_unsigned_v<U> && std::is_integral_v<T> &&
-                         std::is_signed_v<T>) {
-        // Any integral unsigned -> Any integral signed, small -> big or equal
-        return v >= std::numeric_limits<T>::max() ? std::numeric_limits<T>::max() : static_cast<T>(v);
-    } else if constexpr (std::is_integral_v<U> && std::is_signed_v<U> && std::is_integral_v<T> &&
-                         std::is_unsigned_v<T> && bigToSmall) {
-        // Any integral signed -> Any integral unsigned, big -> small
-        return v <= static_cast<U>(std::numeric_limits<T>::min())
-                   ? std::numeric_limits<T>::min()
-                   : (v >= static_cast<U>(std::numeric_limits<T>::max()) ? std::numeric_limits<T>::max()
-                                                                         : static_cast<T>(v));
-    } else {
-        // All other cases fall into this
-        return v;
+        // Float -> integral: saturate to [min, max] and round to nearest (IEEE half-to-even).
+        constexpr U minVal = static_cast<U>(std::numeric_limits<T>::lowest());
+        constexpr U maxVal = static_cast<U>(std::numeric_limits<T>::max());
+
+        if constexpr (sizeof(T) <= 2) {
+            // 8/16 bit integer cases. These can be represented exactly in floating point.
+            return static_cast<T>(IEEERound(FpClamp(v, minVal, maxVal)));
+        } else {
+            // 32/64 bit integer cases. maxVal may round up to an unrepresentable
+            // value when cast back, so compare against the rounded source.
+            const U rounded = IEEERound(v);
+            return rounded >= maxVal   ? std::numeric_limits<T>::max()
+                   : rounded <= minVal ? std::numeric_limits<T>::min()
+                                       : static_cast<T>(rounded);
+        }
+    }
+
+    else if constexpr (std::is_integral_v<T> && std::is_integral_v<U> && std::is_signed_v<U> && std::is_unsigned_v<T> &&
+                       smallToBig) {
+        // Signed -> unsigned, small to big: clamp negative to 0
+        // Branchless: max(v, 0) handles negative values
+        return static_cast<T>(max(v, U{0}));
+    }
+
+    else if constexpr (std::is_integral_v<U> && std::is_integral_v<T> &&
+                       ((std::is_signed_v<U> && std::is_signed_v<T>) ||
+                        (std::is_unsigned_v<U> && std::is_unsigned_v<T>)) &&
+                       bigToSmall) {
+        // Same signedness, big -> small: clamp to [min, max]
+        constexpr U minVal = static_cast<U>(std::numeric_limits<T>::min());
+        constexpr U maxVal = static_cast<U>(std::numeric_limits<T>::max());
+        return static_cast<T>(min(max(v, minVal), maxVal));
+    }
+
+    else if constexpr (std::is_integral_v<U> && std::is_unsigned_v<U> && std::is_integral_v<T> && std::is_signed_v<T>) {
+        // Unsigned -> signed: clamp to max (can't exceed min since unsigned)
+        constexpr U maxVal = static_cast<U>(std::numeric_limits<T>::max());
+        return static_cast<T>(min(v, maxVal));
+    }
+
+    else if constexpr (std::is_integral_v<U> && std::is_signed_v<U> && std::is_integral_v<T> && std::is_unsigned_v<T> &&
+                       bigToSmall) {
+        // Signed -> unsigned, big -> small: clamp to [0, max]
+        constexpr U maxVal = static_cast<U>(std::numeric_limits<T>::max());
+        return static_cast<T>(min(max(v, U{0}), maxVal));
+    }
+
+    else {
+        return static_cast<T>(v);
     }
 }
 
@@ -83,18 +144,21 @@ __device__ __host__ T ScalarSaturateCast(U v) {
 template <typename T, typename U,
           class = std::enable_if_t<(HasTypeTraits<T> && HasTypeTraits<U>) && (NumElements<T> <= NumElements<U>)>>
 __device__ __host__ T SaturateCast(U v) {
+    using B = BaseType<T>;
     if constexpr (std::is_same_v<T, U>) {
         return v;
+    } else if constexpr (NumElements<T> == 1) {
+        return T{ScalarSaturateCast<B>(GetElement(v, 0))};
+    } else if constexpr (NumElements<T> == 2) {
+        return T{ScalarSaturateCast<B>(GetElement(v, 0)), ScalarSaturateCast<B>(GetElement(v, 1))};
+    } else if constexpr (NumElements<T> == 3) {
+        return T{ScalarSaturateCast<B>(GetElement(v, 0)), ScalarSaturateCast<B>(GetElement(v, 1)),
+                 ScalarSaturateCast<B>(GetElement(v, 2))};
+    } else {
+        static_assert(NumElements<T> == 4, "SaturateCast supports up to 4-element vectors");
+        return T{ScalarSaturateCast<B>(GetElement(v, 0)), ScalarSaturateCast<B>(GetElement(v, 1)),
+                 ScalarSaturateCast<B>(GetElement(v, 2)), ScalarSaturateCast<B>(GetElement(v, 3))};
     }
-
-    T ret{};
-
-    GetElement(ret, 0) = ScalarSaturateCast<BaseType<T>>(GetElement(v, 0));
-    if constexpr (NumElements<T> >= 2) GetElement(ret, 1) = ScalarSaturateCast<BaseType<T>>(GetElement(v, 1));
-    if constexpr (NumElements<T> >= 3) GetElement(ret, 2) = ScalarSaturateCast<BaseType<T>>(GetElement(v, 2));
-    if constexpr (NumElements<T> >= 4) GetElement(ret, 3) = ScalarSaturateCast<BaseType<T>>(GetElement(v, 3));
-
-    return ret;
 }
 
 /**
@@ -109,17 +173,40 @@ __device__ __host__ T ScalarRangeCast(U v) {
     }
 
     else if constexpr (std::is_integral_v<T> && std::is_floating_point_v<U> && std::is_signed_v<T>) {
-        // Float to signed integers
-        return v >= T{1}    ? std::numeric_limits<T>::max()
-               : v <= T{-1} ? std::numeric_limits<T>::min()
-                            : static_cast<T>(std::round(static_cast<U>(std::numeric_limits<T>::max()) * v));
+        // Float to signed integer: scale by max with IEEE half-to-even rounding, saturating outside [-1, 1].
+        constexpr U scale = static_cast<U>(std::numeric_limits<T>::max());
+
+        if constexpr (sizeof(T) <= 2) {
+            // 8/16 bit signed cases: scale fits exactly in floating point, so clamp to [-1, 1]
+            // first, then scale and round. Note that -1 maps to -max (e.g. -127), not to min.
+            return static_cast<T>(IEEERound(FpClamp(v, U{-1}, U{1}) * scale));
+        } else {
+            // 32/64 bit signed cases.
+            return v >= U{1}    ? std::numeric_limits<T>::max()
+                   : v <= U{-1} ? std::numeric_limits<T>::min()
+                                : static_cast<T>(IEEERound(scale * v));
+        }
     }
 
     else if constexpr (std::is_integral_v<T> && std::is_floating_point_v<U> && std::is_unsigned_v<T>) {
         // float to unsigned integers
-        return v >= T{1}   ? std::numeric_limits<T>::max()
-               : v <= T{0} ? 0
-                           : static_cast<T>(lrintf(static_cast<U>(std::numeric_limits<T>::max()) * v));
+        constexpr U scale = static_cast<U>(std::numeric_limits<T>::max());
+
+        if constexpr (sizeof(T) <= 2) {
+            // 8/16 bit integer cases. These can be represented exactly in floating point.
+#ifdef __HIP_DEVICE_COMPILE__
+            if constexpr (std::is_same_v<U, float>) {
+                return static_cast<T>(__float2int_rn(__saturatef(v) * scale));
+            } else {
+                return static_cast<T>(IEEERound(FpClamp(v, U{0}, U{1}) * scale));
+            }
+#else
+            return static_cast<T>(IEEERound(FpClamp(v, U{0}, U{1}) * scale));
+#endif
+        } else {
+            // 32/64 bit integer cases.
+            return v >= U{1} ? std::numeric_limits<T>::max() : v <= U{0} ? T{0} : static_cast<T>(IEEERound(v * scale));
+        }
     }
 
     else if constexpr (std::is_floating_point_v<T> && std::is_integral_v<U> && std::is_signed_v<U>) {
@@ -162,18 +249,21 @@ __device__ __host__ T ScalarRangeCast(U v) {
 template <typename T, typename U,
           class = std::enable_if_t<(HasTypeTraits<T> && HasTypeTraits<U>) && NumElements<T> <= NumElements<U>>>
 __device__ __host__ T RangeCast(U v) {
+    using B = BaseType<T>;
     if constexpr (std::is_same_v<T, U>) {
         return v;
+    } else if constexpr (NumElements<T> == 1) {
+        return T{ScalarRangeCast<B>(GetElement(v, 0))};
+    } else if constexpr (NumElements<T> == 2) {
+        return T{ScalarRangeCast<B>(GetElement(v, 0)), ScalarRangeCast<B>(GetElement(v, 1))};
+    } else if constexpr (NumElements<T> == 3) {
+        return T{ScalarRangeCast<B>(GetElement(v, 0)), ScalarRangeCast<B>(GetElement(v, 1)),
+                 ScalarRangeCast<B>(GetElement(v, 2))};
+    } else {
+        static_assert(NumElements<T> == 4, "RangeCast supports up to 4-element vectors");
+        return T{ScalarRangeCast<B>(GetElement(v, 0)), ScalarRangeCast<B>(GetElement(v, 1)),
+                 ScalarRangeCast<B>(GetElement(v, 2)), ScalarRangeCast<B>(GetElement(v, 3))};
     }
-
-    T ret{};
-
-    GetElement(ret, 0) = ScalarRangeCast<BaseType<T>>(GetElement(v, 0));
-    if constexpr (NumElements<T> >= 2) GetElement(ret, 1) = ScalarRangeCast<BaseType<T>>(GetElement(v, 1));
-    if constexpr (NumElements<T> >= 3) GetElement(ret, 2) = ScalarRangeCast<BaseType<T>>(GetElement(v, 2));
-    if constexpr (NumElements<T> >= 4) GetElement(ret, 3) = ScalarRangeCast<BaseType<T>>(GetElement(v, 3));
-
-    return ret;
 }
 
 /**
@@ -187,21 +277,23 @@ __device__ __host__ T RangeCast(U v) {
 template <typename T, typename U,
           class = std::enable_if_t<(HasTypeTraits<T> && HasTypeTraits<U>) && NumElements<T> <= NumElements<U>>>
 __device__ __host__ T StaticCast(U v) {
+    using B = BaseType<T>;
     if constexpr (std::is_same_v<T, U>) {
         // Both same type, just return the value.
         return v;
     } else if constexpr (!IsCompound<T> && !IsCompound<U>) {
         // Both scalar values. Reduces to a standard static cast.
         return static_cast<T>(v);
+    } else if constexpr (NumElements<T> == 1) {
+        return T{StaticCast<B>(GetElement(v, 0))};
+    } else if constexpr (NumElements<T> == 2) {
+        return T{StaticCast<B>(GetElement(v, 0)), StaticCast<B>(GetElement(v, 1))};
+    } else if constexpr (NumElements<T> == 3) {
+        return T{StaticCast<B>(GetElement(v, 0)), StaticCast<B>(GetElement(v, 1)), StaticCast<B>(GetElement(v, 2))};
     } else {
-        // Vector types. Perform casting on each element.
-        T ret{};
-        GetElement(ret, 0) = StaticCast<BaseType<T>>(GetElement(v, 0));
-        if constexpr (NumElements<T> >= 2) GetElement(ret, 1) = StaticCast<BaseType<T>>(GetElement(v, 1));
-        if constexpr (NumElements<T> >= 3) GetElement(ret, 2) = StaticCast<BaseType<T>>(GetElement(v, 2));
-        if constexpr (NumElements<T> >= 4) GetElement(ret, 3) = StaticCast<BaseType<T>>(GetElement(v, 3));
-
-        return ret;
+        static_assert(NumElements<T> == 4, "StaticCast supports up to 4-element vectors");
+        return T{StaticCast<B>(GetElement(v, 0)), StaticCast<B>(GetElement(v, 1)), StaticCast<B>(GetElement(v, 2)),
+                 StaticCast<B>(GetElement(v, 3))};
     }
 }
 }  // namespace roccv::detail
\ No newline at end of file
diff --git a/include/core/detail/type_traits.hpp b/include/core/detail/type_traits.hpp
index dcf77eb0..32f14d58 100644
--- a/include/core/detail/type_traits.hpp
+++ b/include/core/detail/type_traits.hpp
@@ -20,6 +20,7 @@
  */
 
 #include <hip/hip_runtime.h>
+
 #include <cassert>
 
 #pragma once
@@ -83,6 +84,8 @@ DEFINE_TYPE_TRAITS_0_TO_4(int, signed int);
 DEFINE_TYPE_TRAITS_0_TO_4(short, signed short);
 DEFINE_TYPE_TRAITS_0_TO_4(ushort, unsigned short);
 DEFINE_TYPE_TRAITS_0_TO_4(double, double);
+DEFINE_TYPE_TRAITS_0_TO_4(long, signed long);
+DEFINE_TYPE_TRAITS_0_TO_4(ulong, unsigned long);
 
 /**
  * @brief Returns the number of elements in a HIP vectorized type. For example: uchar3 will return 3, int2 will
diff --git a/tests/roccv/cpp/src/tests/core/detail/test_range_cast.cpp b/tests/roccv/cpp/src/tests/core/detail/test_range_cast.cpp
index c284dc87..35fc843e 100644
--- a/tests/roccv/cpp/src/tests/core/detail/test_range_cast.cpp
+++ b/tests/roccv/cpp/src/tests/core/detail/test_range_cast.cpp
@@ -39,6 +39,8 @@ int main(int argc, char **argv) {
     TEST_CASE(EXPECT_EQ(RangeCast<int>(-1.0f), std::numeric_limits<int>::min()));
     TEST_CASE(EXPECT_EQ(RangeCast<uint>(1.0f), std::numeric_limits<uint>::max()));
     TEST_CASE(EXPECT_EQ(RangeCast<uint>(-1.0f), 0));
+    TEST_CASE(EXPECT_EQ(RangeCast<uint>(0.0f), 0));
+
 
     // Test unsigned/signed integer -> float casting
     TEST_CASE(EXPECT_EQ(RangeCast<float>(std::numeric_limits<int>::max()), 1.0f));
@@ -58,6 +60,71 @@ int main(int argc, char **argv) {
     TEST_CASE(EXPECT_EQ(RangeCast<double>(std::numeric_limits<uint>::max()), 1.0f));
     TEST_CASE(EXPECT_EQ(RangeCast<double>(0), 0.0f));
 
+    // ----- 8/16-bit signed fast path -----
+    TEST_CASE(EXPECT_EQ(RangeCast<int8_t>(1.0f), 127));
+    TEST_CASE(EXPECT_EQ(RangeCast<int8_t>(-1.0f), -127));
+    TEST_CASE(EXPECT_EQ(RangeCast<int8_t>(0.0f), 0));
+    TEST_CASE(EXPECT_EQ(RangeCast<int8_t>(2.0f), 127));     // out-of-range positive clamps
+    TEST_CASE(EXPECT_EQ(RangeCast<int8_t>(-2.0f), -127));   // out-of-range negative clamps
+    TEST_CASE(EXPECT_EQ(RangeCast<int16_t>(1.0f), 32767));
+    TEST_CASE(EXPECT_EQ(RangeCast<int16_t>(-1.0f), -32767));
+    TEST_CASE(EXPECT_EQ(RangeCast<int16_t>(2.0f), 32767));
+    TEST_CASE(EXPECT_EQ(RangeCast<int16_t>(-2.0f), -32767));
+
+    // ----- 8/16-bit unsigned fast path -----
+    TEST_CASE(EXPECT_EQ(RangeCast<uint8_t>(1.0f), 255));
+    TEST_CASE(EXPECT_EQ(RangeCast<uint8_t>(0.0f), 0));
+    TEST_CASE(EXPECT_EQ(RangeCast<uint8_t>(2.0f), 255));    // clamp positive
+    TEST_CASE(EXPECT_EQ(RangeCast<uint8_t>(-0.5f), 0));     // clamp negative
+    TEST_CASE(EXPECT_EQ(RangeCast<uint16_t>(1.0f), 65535));
+    TEST_CASE(EXPECT_EQ(RangeCast<uint16_t>(-1.0f), 0));
+
+    // ----- Rounding mode: must be IEEE half-to-even -----
+    TEST_CASE(EXPECT_EQ(RangeCast<uint8_t>(0.5f / 255.0f), 0));   // would be 1 with std::round
+    TEST_CASE(EXPECT_EQ(RangeCast<uint8_t>(1.5f / 255.0f), 2));   // round half to even
+    TEST_CASE(EXPECT_EQ(RangeCast<uint8_t>(2.5f / 255.0f), 2));   // round half to even (down)
+    TEST_CASE(EXPECT_EQ(RangeCast<int8_t>(0.5f / 127.0f), 0));    // signed: same rounding rule
+    TEST_CASE(EXPECT_EQ(RangeCast<int8_t>(-0.5f / 127.0f), 0));
+    TEST_CASE(EXPECT_EQ(RangeCast<int8_t>(-1.5f / 127.0f), -2));
+
+    // ----- Double precision in float -> int -----
+    TEST_CASE(EXPECT_EQ(RangeCast<int>(0.5), std::numeric_limits<int>::max() / 2 + 1));
+    TEST_CASE(EXPECT_EQ(RangeCast<int>(0.0), 0));
+    TEST_CASE(EXPECT_EQ(RangeCast<uint>(0.0), 0u));
+
+    // ----- int -> float clamping: signed min hits the -1.008... clamp -----
+    // numeric_limits<int8_t>::min() / max() = -128 / 127 = -1.0078..., must clamp to -1.
+    TEST_CASE(EXPECT_EQ(RangeCast<float>(int8_t{-128}), -1.0f));
+    TEST_CASE(EXPECT_EQ(RangeCast<float>(int8_t{127}), 1.0f));
+    TEST_CASE(EXPECT_EQ(RangeCast<float>(int8_t{0}), 0.0f));
+    TEST_CASE(EXPECT_EQ(RangeCast<float>(int16_t{-32768}), -1.0f));
+    TEST_CASE(EXPECT_EQ(RangeCast<float>(int16_t{32767}), 1.0f));
+
+    // ----- uint -> float -----
+    TEST_CASE(EXPECT_EQ(RangeCast<float>(uint8_t{255}), 1.0f));
+    TEST_CASE(EXPECT_EQ(RangeCast<float>(uint8_t{0}), 0.0f));
+    TEST_CASE(EXPECT_EQ(RangeCast<float>(uint16_t{65535}), 1.0f));
+
+    // ----- Integer -> integer falls back to SaturateCast -----
+    TEST_CASE(EXPECT_EQ(RangeCast<int8_t>(int32_t{300}), 127));
+    TEST_CASE(EXPECT_EQ(RangeCast<uint8_t>(int32_t{-1}), 0));
+
+    // ----- Vector types -----
+    // float -> uchar4: 0.0 -> 0, 0.5 -> 128, 1.0 -> 255 (with banker's rounding at half)
+    TEST_CASE(EXPECT_TRUE(
+        (RangeCast<uchar4>(float4{0.0f, 0.5f, 1.0f, -0.5f}) == uchar4{0, 128, 255, 0})));
+    // uchar4 -> float4: 0 -> 0.0, 255 -> 1.0
+    {
+        float4 result = RangeCast<float4>(uchar4{0, 128, 255, 64});
+        TEST_CASE(EXPECT_EQ(result.x, 0.0f));
+        TEST_CASE(EXPECT_EQ(result.z, 1.0f));
+        TEST_CASE(EXPECT_TRUE(std::abs(result.y - (128.0f / 255.0f)) < 1e-6f));
+        TEST_CASE(EXPECT_TRUE(std::abs(result.w - (64.0f / 255.0f)) < 1e-6f));
+    }
+    // 2- and 3-element vectors
+    TEST_CASE(EXPECT_TRUE((RangeCast<uchar2>(float2{0.5f, -10.0f}) == uchar2{128, 0})));
+    TEST_CASE(EXPECT_TRUE((RangeCast<char3>(float3{1.0f, -1.0f, 0.0f}) == char3{127, -127, 0})));
+
     // clang-format on
 
     TEST_CASES_END();
diff --git a/tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp b/tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp
new file mode 100644
index 00000000..eca96d44
--- /dev/null
+++ b/tests/roccv/cpp/src/tests/core/detail/test_saturate_cast.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <core/detail/casting.hpp>
+
+#include "test_helpers.hpp"
+
+using namespace roccv::detail;
+using namespace roccv::tests;
+using namespace roccv;
+
+int main(int argc, char **argv) {  // Host-side SaturateCast checks; argc/argv are unused.
+    (void)argc;
+    (void)argv;
+
+    TEST_CASES_BEGIN();
+
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(1.0f), 1));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(-1.0f), -1));
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint>(1.0f), 1));
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint>(-1.0f), 0));
+    TEST_CASE(EXPECT_EQ(SaturateCast<float>(1), 1.0f));
+    TEST_CASE(EXPECT_EQ(SaturateCast<float>(-1), -1.0f));
+    TEST_CASE(EXPECT_EQ(SaturateCast<double>(1), 1.0));
+    TEST_CASE(EXPECT_EQ(SaturateCast<double>(-1), -1.0));
+
+    // Extreme float inputs saturate to the destination type's numeric limits
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(std::numeric_limits<float>::max()), std::numeric_limits<int>::max()));
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint8_t>(std::numeric_limits<float>::max()), std::numeric_limits<uint8_t>::max()));
+    TEST_CASE(EXPECT_EQ(SaturateCast<long>(std::numeric_limits<float>::max()), std::numeric_limits<long>::max()));
+    TEST_CASE(EXPECT_EQ(SaturateCast<ulong>(std::numeric_limits<float>::lowest()), 0UL));
+
+    // Vectorized integer -> float: every element converts exactly, nothing to clamp
+    TEST_CASE(EXPECT_TRUE((SaturateCast<float4>(uchar4{255, 128, 0, 255}) == float4{255.0f, 128.0f, 0.0f, 255.0f})));
+    TEST_CASE(EXPECT_TRUE(
+        (SaturateCast<float4>(char4{-128, -128, -128, -128}) == float4{-128.0f, -128.0f, -128.0f, -128.0f})));
+
+    // ----- Rounding mode: must be IEEE half-to-even (banker's rounding) -----
+    // These regression-guard against accidentally switching back to std::round
+    // (half-away-from-zero), which would diverge from the device fast-paths.
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(0.5f), 0));  // halfway -> nearest even (0)
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(1.5f), 2));  // halfway -> nearest even (2)
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(2.5f), 2));  // halfway -> nearest even (2)
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(-0.5f), 0));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(-1.5f), -2));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(-2.5f), -2));
+    // Same rounding rules in the 8/16-bit clamp-then-round path.
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint8_t>(0.5f), 0));
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint8_t>(1.5f), 2));
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint8_t>(2.5f), 2));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int16_t>(-1.5f), -2));
+    // Non-half values should still round to nearest as expected.
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(1.4f), 1));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(1.6f), 2));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(-1.4f), -1));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(-1.6f), -2));
+
+    // ----- Double precision: must NOT be silently truncated to float -----
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(1234567890.7), 1234567891));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(-1234567890.7), -1234567891));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(16777217.0), 16777217));          // 2^24+1, not exact in float
+    TEST_CASE(EXPECT_EQ(SaturateCast<int64_t>(1234567890.5), 1234567890));  // half-to-even
+
+    // ----- Float clamping: out-of-range floats clamp to numeric limits -----
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint8_t>(300.0f), 255));
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint8_t>(-1.0f), 0));
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint8_t>(-100.0f), 0));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int8_t>(200.0f), 127));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int8_t>(-200.0f), -128));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int16_t>(40000.0f), 32767));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int16_t>(-40000.0f), -32768));
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint16_t>(70000.0f), 65535));
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint16_t>(-1.0f), 0));
+
+    // ----- Integer narrowing: same signedness -----
+    TEST_CASE(EXPECT_EQ(SaturateCast<int8_t>(int32_t{300}), 127));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int8_t>(int32_t{-300}), -128));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int8_t>(int32_t{42}), 42));  // in-range, passthrough
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint8_t>(uint32_t{300}), 255));
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint8_t>(uint32_t{42}), 42));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int16_t>(int64_t{-100000}), -32768));
+
+    // ----- Integer cross-signedness narrowing -----
+    // Signed -> unsigned, big -> small: clamp negatives to 0, big to max
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint8_t>(int32_t{-1}), 0));
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint8_t>(int32_t{300}), 255));
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint8_t>(int32_t{42}), 42));
+    // Unsigned -> signed: clamp values exceeding signed max
+    TEST_CASE(EXPECT_EQ(SaturateCast<int8_t>(uint32_t{300}), 127));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int8_t>(uint32_t{42}), 42));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int16_t>(uint32_t{70000}), 32767));
+
+    // ----- Integer cross-signedness widening -----
+    // Signed -> unsigned, small to big: clamp negatives to 0
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint32_t>(int8_t{-1}), 0u));
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint32_t>(int8_t{-128}), 0u));
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint32_t>(int8_t{42}), 42u));
+    // Unsigned -> signed widening: always representable, no clamping
+    TEST_CASE(EXPECT_EQ(SaturateCast<int32_t>(uint8_t{255}), 255));
+    TEST_CASE(EXPECT_EQ(SaturateCast<int32_t>(uint8_t{0}), 0));
+
+    // ----- Same-type early-return path -----
+    TEST_CASE(EXPECT_EQ(SaturateCast<int>(int{42}), 42));
+    TEST_CASE(EXPECT_EQ(SaturateCast<float>(1.5f), 1.5f));
+    TEST_CASE(EXPECT_EQ(SaturateCast<uint8_t>(uint8_t{200}), 200));
+
+    // ----- Additional vector coverage: 2-, 3- and 4-element types, integer and float narrowing -----
+    TEST_CASE(EXPECT_TRUE((SaturateCast<uchar2>(int2{300, -50}) == uchar2{255, 0})));
+    TEST_CASE(EXPECT_TRUE(
+        (SaturateCast<uchar3>(float3{300.0f, -10.0f, 127.5f}) == uchar3{255, 0, 128})));  // 127.5 rounds to even (128)
+    TEST_CASE(EXPECT_TRUE((SaturateCast<char4>(float4{200.0f, -200.0f, 0.5f, -0.5f}) == char4{127, -128, 0, 0})));
+
+    TEST_CASES_END();
+}
\ No newline at end of file
diff --git a/tests/roccv/cpp/src/tests/core/detail/test_static_cast.cpp b/tests/roccv/cpp/src/tests/core/detail/test_static_cast.cpp
new file mode 100644
index 00000000..c7f2d3a9
--- /dev/null
+++ b/tests/roccv/cpp/src/tests/core/detail/test_static_cast.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <core/detail/casting.hpp>
+
+#include "test_helpers.hpp"
+
+using namespace roccv::detail;
+using namespace roccv::tests;
+using namespace roccv;
+
+int main(int argc, char **argv) {  // Host-side StaticCast checks; argc/argv are unused.
+    (void)argc;
+    (void)argv;
+    TEST_CASES_BEGIN();
+
+    // ----- Scalar same-type early return -----
+    TEST_CASE(EXPECT_EQ(StaticCast<int>(int{42}), 42));
+    TEST_CASE(EXPECT_EQ(StaticCast<float>(1.5f), 1.5f));
+    TEST_CASE(EXPECT_EQ(StaticCast<double>(2.5), 2.5));
+
+    // ----- Scalar conversions: behave exactly like static_cast -----
+    // Float -> int: truncates toward zero, no clamping or rounding.
+    TEST_CASE(EXPECT_EQ(StaticCast<int>(3.7f), 3));
+    TEST_CASE(EXPECT_EQ(StaticCast<int>(-3.7f), -3));
+    TEST_CASE(EXPECT_EQ(StaticCast<int>(0.999f), 0));
+    // int -> float: exact for small values.
+    TEST_CASE(EXPECT_EQ(StaticCast<float>(int{42}), 42.0f));
+    TEST_CASE(EXPECT_EQ(StaticCast<float>(int{-42}), -42.0f));
+    // Widening / narrowing integer conversions follow C++ rules (no clamping).
+    TEST_CASE(EXPECT_EQ(StaticCast<int32_t>(int8_t{-1}), -1));
+    TEST_CASE(EXPECT_EQ(StaticCast<uint8_t>(int32_t{300}), static_cast<uint8_t>(300)));
+    // double -> float
+    TEST_CASE(EXPECT_EQ(StaticCast<float>(1.5), 1.5f));
+
+    // ----- Vector same-type early return -----
+    TEST_CASE(EXPECT_TRUE((StaticCast<float4>(float4{1.0f, 2.0f, 3.0f, 4.0f}) == float4{1.0f, 2.0f, 3.0f, 4.0f})));
+    TEST_CASE(EXPECT_TRUE((StaticCast<uchar4>(uchar4{1, 2, 3, 4}) == uchar4{1, 2, 3, 4})));
+
+    // ----- Vector conversions across base types (same arity) -----
+    TEST_CASE(EXPECT_TRUE((StaticCast<float4>(uchar4{1, 2, 3, 4}) == float4{1.0f, 2.0f, 3.0f, 4.0f})));
+    TEST_CASE(EXPECT_TRUE((StaticCast<int4>(float4{1.7f, -2.7f, 3.3f, -3.3f}) == int4{1, -2, 3, -3})));
+    TEST_CASE(EXPECT_TRUE((StaticCast<float3>(uchar3{10, 20, 30}) == float3{10.0f, 20.0f, 30.0f})));
+    TEST_CASE(EXPECT_TRUE((StaticCast<float2>(int2{-5, 5}) == float2{-5.0f, 5.0f})));
+
+    // ----- Partial-element extraction (NumElements<T> < NumElements<U>) -----
+    // Per the enable_if (NumElements<T> <= NumElements<U>), narrower vectors are allowed.
+    TEST_CASE(EXPECT_TRUE((StaticCast<float2>(float4{1.0f, 2.0f, 3.0f, 4.0f}) == float2{1.0f, 2.0f})));
+    TEST_CASE(EXPECT_TRUE((StaticCast<float3>(float4{1.0f, 2.0f, 3.0f, 4.0f}) == float3{1.0f, 2.0f, 3.0f})));
+    TEST_CASE(EXPECT_TRUE((StaticCast<uchar2>(uchar4{10, 20, 30, 40}) == uchar2{10, 20})));
+    // Cross-type partial extraction
+    TEST_CASE(EXPECT_TRUE((StaticCast<int2>(float4{1.7f, -2.7f, 3.3f, -3.3f}) == int2{1, -2})));
+
+    // ----- Scalar destination from compound source -----
+    // NumElements<T> == 1 with compound U: takes element 0 only.
+    TEST_CASE(EXPECT_EQ(StaticCast<float>(float4{7.0f, 1.0f, 2.0f, 3.0f}), 7.0f));
+    TEST_CASE(EXPECT_EQ(StaticCast<int>(float2{4.7f, 9.0f}), 4));
+
+    // ----- No clamping on overflow (this is what distinguishes StaticCast from SaturateCast) -----
+    // int32 -> uint8 with an out-of-range input: a plain static_cast wraps modulo 2^8
+    // (300 -> 44) where SaturateCast would clamp to 255.
+    // We only assert that the values DIFFER from the saturate-cast behaviour to lock
+    // in StaticCast's pass-through semantics.
+    TEST_CASE(EXPECT_NE(static_cast<int>(StaticCast<uint8_t>(int32_t{300})),
+                        static_cast<int>(SaturateCast<uint8_t>(int32_t{300}))));
+
+    TEST_CASES_END();
+}