vlantonov · vladiant · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,40 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [1.2.0] - 2026-03-29
+
+### Added
+
+- In-memory TTL cache for image metadata lookups (`get_by_id`), avoiding redundant
+  database hits on repeated reads. Implemented as a `CachedImageRepository` decorator
+  wrapping the existing `PostgresImageRepository`.
+- New `IMG_CACHE_TTL_SECONDS` (default 60) and `IMG_CACHE_MAX_SIZE` (default 1024)
+  configuration settings for cache tuning.
+
+### Changed
+
+- Health endpoint (`/health`) now reads version from `importlib.metadata` instead of
+  hardcoding `"1.0.0"`, and verifies database connectivity (`SELECT 1`) and storage
+  directory existence. Response includes a `checks` map with per-component status;
+  overall status reports `"degraded"` if any check fails.
+- `list_images()` and `get_expired()` now use server-side cursors
+  (`session.stream_scalars`) instead of buffered `execute` + `all()`, reducing peak
+  memory usage for large result sets.
+- C++ `bilinear_resize` uses SSE2 SIMD intrinsics on x86-64 to interpolate all
+  channels per pixel in parallel, with a scalar fallback for other architectures.
+  Arithmetic switched from `double` to `float` for better vectorization throughput.
+- C++ `bilinear_resize` now accepts and returns NumPy `uint8` arrays
+  (`py::array_t<uint8_t>`) instead of `std::vector<uint8_t>`, eliminating the
+  per-element copy between Python lists and C++ vectors. CMake builds with
+  `-march=native` to enable host-optimal SIMD.
+
+### Fixed
+
+- C++ `fast_resize.cpp` now passes `clang-tidy` with `bugprone-*`, `readability-*`,
+  `performance-*`, and `modernize-*` checks: renamed short identifiers, added explicit
+  `static_cast`, extracted magic numbers to constants, used uppercase float literal
+  suffixes, added parentheses for clarity, and passed NumPy array by `const&`.
+
 ## [1.1.0] - 2026-03-28
 
 ### Added

diff --git a/PROJECT_DESCRIPTION.md b/PROJECT_DESCRIPTION.md
@@ -144,7 +144,7 @@ For particularly performance-critical image processing scenarios, the project in
 
 ## Testing Strategy
 
-The project includes **29 tests** covering all architectural layers, all passing without requiring external services:
+The project includes a **test suite** covering all architectural layers; every test passes without requiring external services:
 
 | Layer | Tests | Strategy |
 |-------|-------|----------|

diff --git a/README.md b/README.md
@@ -98,7 +98,7 @@ src/
 cpp/                                   # Optional C++ resize module (pybind11)
 k8s/                                   # Kubernetes manifests (Deployment, HPA, PVC, …)
 minikube/                              # Local K8s demo scripts
-tests/                                 # 29 tests across all architecture layers
+tests/                                 # Test suite spanning all architecture layers
 ```
 
 ## Testing
@@ -107,5 +107,5 @@ tests/                                 # 29 tests across all architecture layers
 pytest tests/ -v
 ```
 
-All 29 tests pass without external services — domain tests are pure unit tests, application tests use mocked ports, infrastructure tests use real Pillow/filesystem I/O, and API tests use FastAPI `TestClient` with dependency overrides.
+All tests pass without external services — domain tests are pure unit tests, application tests use mocked ports, infrastructure tests use real Pillow/filesystem I/O, and API tests use FastAPI `TestClient` with dependency overrides.
 
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -8,6 +8,6 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 find_package(pybind11 REQUIRED)
 
 pybind11_add_module(fast_resize fast_resize.cpp)
-target_compile_options(fast_resize PRIVATE -O3 -Wall -Wextra)
+target_compile_options(fast_resize PRIVATE -O3 -Wall -Wextra -march=native)  # NOTE(review): -march=native targets the build host's CPU; confirm the module is always built on the same CPU family that runs it
 
 install(TARGETS fast_resize DESTINATION .)
diff --git a/cpp/fast_resize.cpp b/cpp/fast_resize.cpp
@@ -5,38 +5,190 @@
  * for bilinear interpolation resize, significantly faster than pure-Python
  * for large images in tight loops.
  *
+ * Features:
+ *   - SSE2 SIMD for parallel per-pixel channel interpolation (x86-64)
+ *   - NumPy array I/O without Python list conversion (input copied only if it needs converting)
+ *   - Scalar fallback when SSE2 is unavailable (e.g. non-x86 architectures)
+ *
  * Build:
- *   pip install pybind11
- *   c++ -O3 -Wall -shared -std=c++17 -fPIC \
+ *   pip install pybind11 numpy
+ *   c++ -O3 -Wall -shared -std=c++17 -fPIC -march=native \
  *       $(python3 -m pybind11 --includes) \
  *       fast_resize.cpp -o fast_resize$(python3-config --extension-suffix)
  */
 
 #include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
 #include <pybind11/stl.h>
 #include <algorithm>
 #include <cmath>
+#include <cstddef>
 #include <cstdint>
+#include <cstring>
 #include <stdexcept>
-#include <vector>
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
 
 namespace py = pybind11;
 
 namespace fast_resize {
 
+constexpr float MAX_CHANNEL_VALUE = 255.0F;
+
+// ── SSE2 SIMD path ─────────────────────────────────────────────────────────
+
+#ifdef __SSE2__
+
+/// Unpack the low 4 bytes of a 128-bit int register to 4 floats (zero-extend, then convert).
+static inline __m128 unpack_u8x4_to_ps(__m128i packed) {
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i i16 = _mm_unpacklo_epi8(packed, zero);   // interleave with zero bytes: u8 -> u16
+    const __m128i i32 = _mm_unpacklo_epi16(i16, zero);     // interleave with zero words: u16 -> u32
+    return _mm_cvtepi32_ps(i32);                           // four int32 lanes -> four float lanes
+}
+
+/// Pack 4 floats (clamped to [0,255]) to the low 4 bytes of a 128-bit int register.
+static inline __m128i pack_ps_to_u8x4(__m128 values) {
+    values = _mm_max_ps(  // clamp: min against 255 first, then max against 0
+        _mm_min_ps(values, _mm_set1_ps(MAX_CHANNEL_VALUE)), _mm_setzero_ps());
+    const __m128i i32 = _mm_cvttps_epi32(values);  // truncating conversion: fraction is discarded, not rounded
+    const __m128i i16 = _mm_packs_epi32(i32, i32);  // narrow int32 -> int16; both halves carry the same 4 values
+    return _mm_packus_epi16(i16, i16);  // narrow int16 -> uint8; the 4 channel bytes end up in the low 32 bits
+}
+
+/// Load num_channels bytes (3 or 4) from ptr into the low bytes of an __m128i; the remaining bytes are zero.
+static inline __m128i load_pixel(const uint8_t* ptr, int num_channels) {
+    int32_t val = 0;  // zero-initialised so bytes beyond num_channels stay 0
+    std::memcpy(&val, ptr, static_cast<size_t>(num_channels));  // copies exactly num_channels bytes; must not exceed sizeof(val)
+    return _mm_cvtsi32_si128(val);  // val goes to the low 32 bits, the upper bits are zeroed
+}
+
+static void bilinear_resize_sse2(  // SSE2 path: blends all channels of one output pixel per iteration
+    const uint8_t* src, int src_w,  // src is indexed as (row * src_w + col) * num_channels
+    int src_h,  // NOLINT(bugprone-easily-swappable-parameters)
+    int num_channels,  // bytes per pixel; load_pixel packs them into one 4-byte word
+    uint8_t* dst, int dst_w, int dst_h)  // dst is written as (row * dst_w + col) * num_channels
+{
+    const auto col_stride = static_cast<ptrdiff_t>(num_channels);  // bytes per pixel, widened for pointer math
+    const float x_ratio = static_cast<float>(src_w) / static_cast<float>(dst_w);  // source columns per output column
+    const float y_ratio = static_cast<float>(src_h) / static_cast<float>(dst_h);  // source rows per output row
+    // NOTE(review): row_top / col_left are never clamped; this relies on dst * ratio staying below the source size in float - confirm.
+    for (int dst_y = 0; dst_y < dst_h; ++dst_y) {
+        const float src_y = static_cast<float>(dst_y) * y_ratio;
+        const int row_top = static_cast<int>(src_y);  // truncation; same as floor when src_y is non-negative
+        const int row_bot = std::min(row_top + 1, src_h - 1);  // last row blends with itself
+        const float frac_y = src_y - static_cast<float>(row_top);  // vertical weight of the bottom row
+        const __m128 frac_y_v = _mm_set1_ps(frac_y);  // broadcast to all four lanes
+        const __m128 one_minus_fy = _mm_set1_ps(1.0F - frac_y);
+        // Row offsets are counted in pixels and widened to ptrdiff_t before the multiply.
+        const auto top_row_off = static_cast<ptrdiff_t>(row_top) * src_w;
+        const auto bot_row_off = static_cast<ptrdiff_t>(row_bot) * src_w;
+
+        for (int dst_x = 0; dst_x < dst_w; ++dst_x) {
+            const float src_x = static_cast<float>(dst_x) * x_ratio;
+            const int col_left = static_cast<int>(src_x);  // truncation, as for row_top
+            const int col_right = std::min(col_left + 1, src_w - 1);  // last column blends with itself
+            const float frac_x = src_x - static_cast<float>(col_left);  // horizontal weight of the right column
+            const __m128 frac_x_v = _mm_set1_ps(frac_x);
+            const __m128 one_minus_fx = _mm_set1_ps(1.0F - frac_x);
+
+            // Load four corner pixels and unpack to float
+            const __m128 top_left = unpack_u8x4_to_ps(
+                load_pixel(src + ((top_row_off + col_left) * col_stride), num_channels));
+            const __m128 top_right = unpack_u8x4_to_ps(
+                load_pixel(src + ((top_row_off + col_right) * col_stride), num_channels));
+            const __m128 bot_left = unpack_u8x4_to_ps(
+                load_pixel(src + ((bot_row_off + col_left) * col_stride), num_channels));
+            const __m128 bot_right = unpack_u8x4_to_ps(
+                load_pixel(src + ((bot_row_off + col_right) * col_stride), num_channels));
+
+            // Bilinear interpolation on all channels simultaneously
+            const __m128 top = _mm_add_ps(  // horizontal blend along the top row
+                _mm_mul_ps(top_left, one_minus_fx), _mm_mul_ps(top_right, frac_x_v));
+            const __m128 bottom = _mm_add_ps(  // horizontal blend along the bottom row
+                _mm_mul_ps(bot_left, one_minus_fx), _mm_mul_ps(bot_right, frac_x_v));
+            const __m128 result = _mm_add_ps(  // vertical blend of the two rows
+                _mm_mul_ps(top, one_minus_fy), _mm_mul_ps(bottom, frac_y_v));
+
+            // Pack back to uint8 and store
+            const int32_t pixel_packed = _mm_cvtsi128_si32(pack_ps_to_u8x4(result));  // low 32 bits = packed channel bytes
+            const auto dst_off =
+                (static_cast<ptrdiff_t>(dst_y) * dst_w + dst_x) * col_stride;
+            std::memcpy(dst + dst_off, &pixel_packed, static_cast<size_t>(num_channels));  // store only num_channels bytes
+        }
+    }
+}
+
+#endif  // __SSE2__
+
+// ── Scalar fallback ─────────────────────────────────────────────────────────
+
+#ifndef __SSE2__
+
+static void bilinear_resize_scalar(  // Portable per-channel bilinear resize; built only when __SSE2__ is undefined
+    const uint8_t* src, int src_w,  // src is indexed as (row * src_w + col) * num_channels + chan
+    int src_h,  // NOLINT(bugprone-easily-swappable-parameters)
+    int num_channels,  // interleaved channels per pixel
+    uint8_t* dst, int dst_w, int dst_h)  // dst is written as (row * dst_w + col) * num_channels + chan
+{
+    const float x_ratio = static_cast<float>(src_w) / static_cast<float>(dst_w);  // source columns per output column
+    const float y_ratio = static_cast<float>(src_h) / static_cast<float>(dst_h);  // source rows per output row
+
+    for (int dst_y = 0; dst_y < dst_h; ++dst_y) {
+        const float src_y = static_cast<float>(dst_y) * y_ratio;
+        const int row_top = static_cast<int>(src_y);  // truncation; same as floor when src_y is non-negative
+        const int row_bot = std::min(row_top + 1, src_h - 1);  // last row blends with itself
+        const float frac_y = src_y - static_cast<float>(row_top);  // vertical weight of the bottom row
+
+        for (int dst_x = 0; dst_x < dst_w; ++dst_x) {
+            const float src_x = static_cast<float>(dst_x) * x_ratio;
+            const int col_left = static_cast<int>(src_x);  // truncation, as for row_top
+            const int col_right = std::min(col_left + 1, src_w - 1);  // last column blends with itself
+            const float frac_x = src_x - static_cast<float>(col_left);  // horizontal weight of the right column
+            // NOTE(review): the indices below are computed in int (the SSE2 path widens to ptrdiff_t); confirm buffers stay under INT_MAX bytes.
+            for (int chan = 0; chan < num_channels; ++chan) {
+                const float top_left =
+                    src[(row_top * src_w + col_left) * num_channels + chan];  // uint8 sample converted implicitly to float
+                const float top_right =
+                    src[(row_top * src_w + col_right) * num_channels + chan];
+                const float bot_left =
+                    src[(row_bot * src_w + col_left) * num_channels + chan];
+                const float bot_right =
+                    src[(row_bot * src_w + col_right) * num_channels + chan];
+
+                const float top    = top_left + frac_x * (top_right - top_left);  // horizontal blend along the top row
+                const float bottom = bot_left + frac_x * (bot_right - bot_left);  // horizontal blend along the bottom row
+                const float value  = top + frac_y * (bottom - top);  // vertical blend of the two rows
+
+                dst[(dst_y * dst_w + dst_x) * num_channels + chan] =
+                    static_cast<uint8_t>(std::clamp(value, 0.0F, MAX_CHANNEL_VALUE));  // clamp to [0,255], then truncate
+            }
+        }
+    }
+}
+
+#endif  // !__SSE2__
+
+// ── Public API ──────────────────────────────────────────────────────────────
+
 /**
  * Bilinear interpolation resize for 8-bit RGB/RGBA images.
  *
+ * Accepts and returns NumPy uint8 arrays (no Python list overhead). The input is read
+ * in place unless pybind11 has to convert it to a C-contiguous uint8 array first.
+ * Uses SSE2 intrinsics on x86-64 to interpolate all channels in parallel.
+ *
  * @param src       Flat pixel buffer (row-major, channels interleaved)
  * @param src_w     Source width
  * @param src_h     Source height
  * @param channels  3 (RGB) or 4 (RGBA)
  * @param dst_w     Target width
  * @param dst_h     Target height
- * @return          Resized flat pixel buffer
+ * @return          Resized flat pixel buffer as NumPy uint8 array
  */
-std::vector<uint8_t> bilinear_resize(
-    const std::vector<uint8_t>& src,
+py::array_t<uint8_t> bilinear_resize(
+    const py::array_t<uint8_t, py::array::c_style | py::array::forcecast>& src,
     int src_w, int src_h, int channels,
     int dst_w, int dst_h)
 {
@@ -46,42 +198,26 @@ std::vector<uint8_t> bilinear_resize(
     if (src_w <= 0 || src_h <= 0 || dst_w <= 0 || dst_h <= 0) {
         throw std::invalid_argument("dimensions must be positive");
     }
-    if (static_cast<size_t>(src_w) * src_h * channels != src.size()) {
+
+    const auto buf = src.request();
+    const auto expected = static_cast<py::ssize_t>(src_w) * src_h * channels;
+    if (buf.size != expected) {
         throw std::invalid_argument("src buffer size mismatch");
     }
 
-    std::vector<uint8_t> dst(static_cast<size_t>(dst_w) * dst_h * channels);
-
-    const double x_ratio = static_cast<double>(src_w) / dst_w;
-    const double y_ratio = static_cast<double>(src_h) / dst_h;
-
-    for (int dy = 0; dy < dst_h; ++dy) {
-        const double src_y = dy * y_ratio;
-        const int y0 = static_cast<int>(std::floor(src_y));
-        const int y1 = std::min(y0 + 1, src_h - 1);
-        const double fy = src_y - y0;
+    const auto* src_data = static_cast<const uint8_t*>(buf.ptr);
 
-        for (int dx = 0; dx < dst_w; ++dx) {
-            const double src_x = dx * x_ratio;
-            const int x0 = static_cast<int>(std::floor(src_x));
-            const int x1 = std::min(x0 + 1, src_w - 1);
-            const double fx = src_x - x0;
+    // Allocate output NumPy array — caller receives it with zero copy
+    const auto dst_size = static_cast<py::ssize_t>(dst_w) * dst_h * channels;
+    py::array_t<uint8_t> dst(dst_size);
+    auto* dst_data = static_cast<uint8_t*>(dst.request().ptr);
 
-            for (int c = 0; c < channels; ++c) {
-                const double top_left     = src[(y0 * src_w + x0) * channels + c];
-                const double top_right    = src[(y0 * src_w + x1) * channels + c];
-                const double bottom_left  = src[(y1 * src_w + x0) * channels + c];
-                const double bottom_right = src[(y1 * src_w + x1) * channels + c];
+#ifdef __SSE2__
+    bilinear_resize_sse2(src_data, src_w, src_h, channels, dst_data, dst_w, dst_h);
+#else
+    bilinear_resize_scalar(src_data, src_w, src_h, channels, dst_data, dst_w, dst_h);
+#endif
 
-                const double top    = top_left    + fx * (top_right    - top_left);
-                const double bottom = bottom_left + fx * (bottom_right - bottom_left);
-                const double value  = top + fy * (bottom - top);
-
-                dst[(dy * dst_w + dx) * channels + c] =
-                    static_cast<uint8_t>(std::clamp(value, 0.0, 255.0));
-            }
-        }
-    }
     return dst;
 }
 
@@ -106,14 +242,15 @@ std::pair<int, int> fit_dimensions(int src_w, int src_h, int max_w, int max_h)
 
 }  // namespace fast_resize
 
-
+// NOLINTNEXTLINE(readability-identifier-length)
 PYBIND11_MODULE(fast_resize, m) {
     m.doc() = "Performance-critical image resize operations in C++";
 
     m.def("bilinear_resize", &fast_resize::bilinear_resize,
           py::arg("src"), py::arg("src_w"), py::arg("src_h"),
           py::arg("channels"), py::arg("dst_w"), py::arg("dst_h"),
-          "Bilinear interpolation resize for 8-bit RGB/RGBA pixel buffers.");
+          "Bilinear interpolation resize for 8-bit RGB/RGBA pixel buffers.\n\n"
+          "Accepts and returns NumPy uint8 arrays for zero-copy I/O.");
 
     m.def("fit_dimensions", &fast_resize::fit_dimensions,
           py::arg("src_w"), py::arg("src_h"),

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "image-processing-service"
-version = "1.1.0"
+version = "1.2.0"
 description = "High-performance image processing microservice with Clean Architecture"
 requires-python = ">=3.11"
 dependencies = [

diff --git a/src/config.py b/src/config.py
@@ -22,6 +22,10 @@ class Settings(BaseSettings):
     processing_max_workers: int = 4
     thumbnail_max_size: int = 256
 
+    # ── Cache ─────────────────────────────────────────────────────────
+    cache_ttl_seconds: int = 60
+    cache_max_size: int = 1024
+
     # ── Retention ────────────────────────────────────────────────────────
     retention_batch_size: int = 100
 

diff --git a/src/infrastructure/cache/__init__.py b/src/infrastructure/cache/__init__.py