Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,40 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [1.2.0] - 2026-03-29

### Added

- In-memory TTL cache for image metadata lookups (`get_by_id`), avoiding redundant
database hits on repeated reads. Implemented as a `CachedImageRepository` decorator
wrapping the existing `PostgresImageRepository`.
- New `IMG_CACHE_TTL_SECONDS` (default 60) and `IMG_CACHE_MAX_SIZE` (default 1024)
configuration settings for cache tuning.

### Changed

- Health endpoint (`/health`) now reads version from `importlib.metadata` instead of
hardcoding `"1.0.0"`, and verifies database connectivity (`SELECT 1`) and storage
directory existence. Response includes a `checks` map with per-component status;
overall status reports `"degraded"` if any check fails.
- `list_images()` and `get_expired()` now use server-side cursors
(`session.stream_scalars`) instead of buffered `execute` + `all()`, reducing peak
memory usage for large result sets.
- C++ `bilinear_resize` uses SSE2 SIMD intrinsics on x86-64 to interpolate all
channels per pixel in parallel, with a scalar fallback for other architectures.
Arithmetic switched from `double` to `float` for better vectorization throughput.
- C++ `bilinear_resize` now accepts and returns NumPy `uint8` arrays
(`py::array_t<uint8_t>`) instead of `std::vector<uint8_t>`, eliminating the
per-element copy between Python lists and C++ vectors. CMake builds with
`-march=native` to enable host-optimal SIMD.

### Fixed

- C++ `fast_resize.cpp` now passes `clang-tidy` with `bugprone-*`, `readability-*`,
`performance-*`, and `modernize-*` checks: renamed short identifiers, added explicit
`static_cast`, extracted magic numbers to constants, used uppercase float literal
suffixes, added parentheses for clarity, and passed NumPy array by `const&`.

## [1.1.0] - 2026-03-28

### Added
Expand Down
2 changes: 1 addition & 1 deletion PROJECT_DESCRIPTION.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ For particularly performance-critical image processing scenarios, the project in

## Testing Strategy

The project includes **29 tests** covering all architectural layers, all passing without requiring external services:
The project includes a **test suite** covering all architectural layers; every test passes without requiring external services:

| Layer | Tests | Strategy |
|-------|-------|----------|
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ src/
cpp/ # Optional C++ resize module (pybind11)
k8s/ # Kubernetes manifests (Deployment, HPA, PVC, …)
minikube/ # Local K8s demo scripts
tests/ # 29 tests across all architecture layers
tests/ # tests across all architecture layers
```

## Testing
Expand All @@ -107,5 +107,5 @@ tests/ # 29 tests across all architecture layers
pytest tests/ -v
```

All 29 tests pass without external services — domain tests are pure unit tests, application tests use mocked ports, infrastructure tests use real Pillow/filesystem I/O, and API tests use FastAPI `TestClient` with dependency overrides.
All tests pass without external services — domain tests are pure unit tests, application tests use mocked ports, infrastructure tests use real Pillow/filesystem I/O, and API tests use FastAPI `TestClient` with dependency overrides.

2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
find_package(pybind11 REQUIRED)

pybind11_add_module(fast_resize fast_resize.cpp)
target_compile_options(fast_resize PRIVATE -O3 -Wall -Wextra)
target_compile_options(fast_resize PRIVATE -O3 -Wall -Wextra -march=native)

install(TARGETS fast_resize DESTINATION .)
213 changes: 175 additions & 38 deletions cpp/fast_resize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,38 +5,190 @@
* for bilinear interpolation resize, significantly faster than pure-Python
* for large images in tight loops.
*
* Features:
* - SSE2 SIMD for parallel per-pixel channel interpolation (x86-64)
* - Zero-copy I/O via NumPy arrays (no Python list conversion overhead)
* - Scalar fallback for non-x86 architectures
*
* Build:
* pip install pybind11
* c++ -O3 -Wall -shared -std=c++17 -fPIC \
* pip install pybind11 numpy
* c++ -O3 -Wall -shared -std=c++17 -fPIC -march=native \
* $(python3 -m pybind11 --includes) \
* fast_resize.cpp -o fast_resize$(python3-config --extension-suffix)
*/

#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <vector>

#ifdef __SSE2__
#include <emmintrin.h>
#endif

namespace py = pybind11;

namespace fast_resize {

constexpr float MAX_CHANNEL_VALUE = 255.0F;

// ── SSE2 SIMD path ─────────────────────────────────────────────────────────

#ifdef __SSE2__

/// Widen the four lowest unsigned bytes of `packed` into four single-precision
/// floats (lane 0 holds byte 0, ..., lane 3 holds byte 3). The remaining
/// twelve bytes of the register do not contribute to the result.
static inline __m128 unpack_u8x4_to_ps(__m128i packed) {
    const __m128i zeros = _mm_setzero_si128();
    // Interleaving with zero bytes, then zero words, zero-extends u8 -> u16 -> u32.
    const __m128i widened_u16 = _mm_unpacklo_epi8(packed, zeros);
    const __m128i widened_u32 = _mm_unpacklo_epi16(widened_u16, zeros);
    return _mm_cvtepi32_ps(widened_u32);
}

/// Convert four floats to four unsigned bytes held in the low 32 bits of the
/// returned register. Each lane is first clamped to [0, MAX_CHANNEL_VALUE] and
/// then converted with truncation (no rounding to nearest).
static inline __m128i pack_ps_to_u8x4(__m128 values) {
    const __m128 upper_bound = _mm_set1_ps(MAX_CHANNEL_VALUE);
    const __m128 lower_bound = _mm_setzero_ps();
    const __m128 clamped = _mm_max_ps(_mm_min_ps(values, upper_bound), lower_bound);

    // Narrow in two saturating steps: i32 -> i16 -> u8. The same register is
    // passed as both operands because only the low four lanes are consumed.
    const __m128i as_i32 = _mm_cvttps_epi32(clamped);
    const __m128i as_i16 = _mm_packs_epi32(as_i32, as_i32);
    return _mm_packus_epi16(as_i16, as_i16);
}

/// Copy the first `num_channels` bytes (3 or 4) found at `ptr` into the low
/// bytes of an __m128i. Every byte of the register that is not copied is zero.
static inline __m128i load_pixel(const uint8_t* ptr, int num_channels) {
    // Staging through a zero-initialised 32-bit integer means a 3-channel
    // pixel is read as exactly 3 bytes, never touching a 4th byte that may
    // lie past the end of the source buffer.
    int32_t staged{0};
    const auto byte_count = static_cast<size_t>(num_channels);
    std::memcpy(&staged, ptr, byte_count);
    return _mm_cvtsi32_si128(staged);
}

/// Bilinear-interpolation resize of an interleaved 8-bit image using SSE2.
///
/// All channels of one destination pixel are interpolated together in a single
/// 4-lane float register, so `num_channels` must be 3 or 4 (load_pixel copies
/// exactly that many bytes per pixel and the result is stored the same way).
///
/// @param src           Source pixels, row-major, channels interleaved.
/// @param src_w         Source width in pixels; must be positive.
/// @param src_h         Source height in pixels; must be positive.
/// @param num_channels  Bytes per pixel (3 or 4).
/// @param dst           Destination buffer, written row-major with the same
///                      channel layout; must hold dst_w * dst_h * num_channels bytes.
/// @param dst_w         Destination width in pixels; must be positive.
/// @param dst_h         Destination height in pixels; must be positive.
static void bilinear_resize_sse2(
    const uint8_t* src, int src_w,
    int src_h,  // NOLINT(bugprone-easily-swappable-parameters)
    int num_channels,
    uint8_t* dst, int dst_w, int dst_h)
{
    const auto col_stride = static_cast<ptrdiff_t>(num_channels);
    const auto pixel_bytes = static_cast<size_t>(num_channels);
    const int last_row = src_h - 1;
    const int last_col = src_w - 1;
    const float x_ratio = static_cast<float>(src_w) / static_cast<float>(dst_w);
    const float y_ratio = static_cast<float>(src_h) / static_cast<float>(dst_h);

    for (int dst_y = 0; dst_y < dst_h; ++dst_y) {
        const float src_y = static_cast<float>(dst_y) * y_ratio;
        // Single-precision rounding can push dst_y * y_ratio up to src_h for
        // very large destinations, so the top row is clamped as well as the
        // bottom one; otherwise the loads below would read past the buffer.
        const int row_top = std::min(static_cast<int>(src_y), last_row);
        const int row_bot = std::min(row_top + 1, last_row);
        const float frac_y = src_y - static_cast<float>(row_top);
        const __m128 frac_y_v = _mm_set1_ps(frac_y);
        const __m128 one_minus_fy = _mm_set1_ps(1.0F - frac_y);

        // Row offsets are computed in ptrdiff_t so large images do not
        // overflow int arithmetic.
        const auto top_row_off = static_cast<ptrdiff_t>(row_top) * src_w;
        const auto bot_row_off = static_cast<ptrdiff_t>(row_bot) * src_w;

        for (int dst_x = 0; dst_x < dst_w; ++dst_x) {
            const float src_x = static_cast<float>(dst_x) * x_ratio;
            // Same clamp as for rows: keep the left column inside the image.
            const int col_left = std::min(static_cast<int>(src_x), last_col);
            const int col_right = std::min(col_left + 1, last_col);
            const float frac_x = src_x - static_cast<float>(col_left);
            const __m128 frac_x_v = _mm_set1_ps(frac_x);
            const __m128 one_minus_fx = _mm_set1_ps(1.0F - frac_x);

            // Load four corner pixels and unpack to float
            const __m128 top_left = unpack_u8x4_to_ps(
                load_pixel(src + ((top_row_off + col_left) * col_stride), num_channels));
            const __m128 top_right = unpack_u8x4_to_ps(
                load_pixel(src + ((top_row_off + col_right) * col_stride), num_channels));
            const __m128 bot_left = unpack_u8x4_to_ps(
                load_pixel(src + ((bot_row_off + col_left) * col_stride), num_channels));
            const __m128 bot_right = unpack_u8x4_to_ps(
                load_pixel(src + ((bot_row_off + col_right) * col_stride), num_channels));

            // Bilinear interpolation on all channels simultaneously:
            // blend horizontally along each row, then vertically between rows.
            const __m128 top = _mm_add_ps(
                _mm_mul_ps(top_left, one_minus_fx), _mm_mul_ps(top_right, frac_x_v));
            const __m128 bottom = _mm_add_ps(
                _mm_mul_ps(bot_left, one_minus_fx), _mm_mul_ps(bot_right, frac_x_v));
            const __m128 result = _mm_add_ps(
                _mm_mul_ps(top, one_minus_fy), _mm_mul_ps(bottom, frac_y_v));

            // Pack back to uint8 and store only num_channels bytes, so a
            // 3-channel pixel never overwrites the first byte of its neighbour.
            const int32_t pixel_packed = _mm_cvtsi128_si32(pack_ps_to_u8x4(result));
            const auto dst_off =
                (static_cast<ptrdiff_t>(dst_y) * dst_w + dst_x) * col_stride;
            std::memcpy(dst + dst_off, &pixel_packed, pixel_bytes);
        }
    }
}

#endif // __SSE2__

// ── Scalar fallback ─────────────────────────────────────────────────────────

#ifndef __SSE2__

/// Portable bilinear-interpolation resize of an interleaved 8-bit image,
/// interpolating one channel at a time.
///
/// @param src           Source pixels, row-major, channels interleaved.
/// @param src_w         Source width in pixels; must be positive.
/// @param src_h         Source height in pixels; must be positive.
/// @param num_channels  Bytes per pixel.
/// @param dst           Destination buffer, written row-major with the same
///                      channel layout; must hold dst_w * dst_h * num_channels bytes.
/// @param dst_w         Destination width in pixels; must be positive.
/// @param dst_h         Destination height in pixels; must be positive.
static void bilinear_resize_scalar(
    const uint8_t* src, int src_w,
    int src_h,  // NOLINT(bugprone-easily-swappable-parameters)
    int num_channels,
    uint8_t* dst, int dst_w, int dst_h)
{
    // All buffer offsets are computed in ptrdiff_t: w * h * channels exceeds
    // INT_MAX for large images, and signed int overflow is undefined behaviour.
    const auto col_stride = static_cast<ptrdiff_t>(num_channels);
    const int last_row = src_h - 1;
    const int last_col = src_w - 1;
    const float x_ratio = static_cast<float>(src_w) / static_cast<float>(dst_w);
    const float y_ratio = static_cast<float>(src_h) / static_cast<float>(dst_h);

    for (int dst_y = 0; dst_y < dst_h; ++dst_y) {
        const float src_y = static_cast<float>(dst_y) * y_ratio;
        // Single-precision rounding can push dst_y * y_ratio up to src_h for
        // very large destinations, so clamp the top row as well as the bottom.
        const int row_top = std::min(static_cast<int>(src_y), last_row);
        const int row_bot = std::min(row_top + 1, last_row);
        const float frac_y = src_y - static_cast<float>(row_top);

        const auto top_row_off = static_cast<ptrdiff_t>(row_top) * src_w;
        const auto bot_row_off = static_cast<ptrdiff_t>(row_bot) * src_w;
        const auto dst_row_off = static_cast<ptrdiff_t>(dst_y) * dst_w;

        for (int dst_x = 0; dst_x < dst_w; ++dst_x) {
            const float src_x = static_cast<float>(dst_x) * x_ratio;
            // Same clamp as for rows: keep the left column inside the image.
            const int col_left = std::min(static_cast<int>(src_x), last_col);
            const int col_right = std::min(col_left + 1, last_col);
            const float frac_x = src_x - static_cast<float>(col_left);

            // Byte offsets of the four neighbouring source pixels and of the
            // destination pixel (channel 0 of each).
            const uint8_t* top_left_px = src + ((top_row_off + col_left) * col_stride);
            const uint8_t* top_right_px = src + ((top_row_off + col_right) * col_stride);
            const uint8_t* bot_left_px = src + ((bot_row_off + col_left) * col_stride);
            const uint8_t* bot_right_px = src + ((bot_row_off + col_right) * col_stride);
            uint8_t* dst_px = dst + ((dst_row_off + dst_x) * col_stride);

            for (int chan = 0; chan < num_channels; ++chan) {
                const float top_left = top_left_px[chan];
                const float top_right = top_right_px[chan];
                const float bot_left = bot_left_px[chan];
                const float bot_right = bot_right_px[chan];

                // Blend horizontally along each row, then vertically between rows.
                const float top = top_left + frac_x * (top_right - top_left);
                const float bottom = bot_left + frac_x * (bot_right - bot_left);
                const float value = top + frac_y * (bottom - top);

                // Clamp before the narrowing cast; the cast truncates.
                dst_px[chan] =
                    static_cast<uint8_t>(std::clamp(value, 0.0F, MAX_CHANNEL_VALUE));
            }
        }
    }
}

#endif // !__SSE2__

// ── Public API ──────────────────────────────────────────────────────────────

/**
* Bilinear interpolation resize for 8-bit RGB/RGBA images.
*
* Accepts and returns NumPy uint8 arrays (zero-copy, no Python list overhead).
* Uses SSE2 intrinsics on x86-64 to interpolate all channels in parallel.
*
* @param src Flat pixel buffer (row-major, channels interleaved)
* @param src_w Source width
* @param src_h Source height
* @param channels 3 (RGB) or 4 (RGBA)
* @param dst_w Target width
* @param dst_h Target height
* @return Resized flat pixel buffer
* @return Resized flat pixel buffer as NumPy uint8 array
*/
std::vector<uint8_t> bilinear_resize(
const std::vector<uint8_t>& src,
py::array_t<uint8_t> bilinear_resize(
const py::array_t<uint8_t, py::array::c_style | py::array::forcecast>& src,
int src_w, int src_h, int channels,
int dst_w, int dst_h)
{
Expand All @@ -46,42 +198,26 @@ std::vector<uint8_t> bilinear_resize(
if (src_w <= 0 || src_h <= 0 || dst_w <= 0 || dst_h <= 0) {
throw std::invalid_argument("dimensions must be positive");
}
if (static_cast<size_t>(src_w) * src_h * channels != src.size()) {

const auto buf = src.request();
const auto expected = static_cast<py::ssize_t>(src_w) * src_h * channels;
if (buf.size != expected) {
throw std::invalid_argument("src buffer size mismatch");
}

std::vector<uint8_t> dst(static_cast<size_t>(dst_w) * dst_h * channels);

const double x_ratio = static_cast<double>(src_w) / dst_w;
const double y_ratio = static_cast<double>(src_h) / dst_h;

for (int dy = 0; dy < dst_h; ++dy) {
const double src_y = dy * y_ratio;
const int y0 = static_cast<int>(std::floor(src_y));
const int y1 = std::min(y0 + 1, src_h - 1);
const double fy = src_y - y0;
const auto* src_data = static_cast<const uint8_t*>(buf.ptr);

for (int dx = 0; dx < dst_w; ++dx) {
const double src_x = dx * x_ratio;
const int x0 = static_cast<int>(std::floor(src_x));
const int x1 = std::min(x0 + 1, src_w - 1);
const double fx = src_x - x0;
// Allocate output NumPy array — caller receives it with zero copy
const auto dst_size = static_cast<py::ssize_t>(dst_w) * dst_h * channels;
py::array_t<uint8_t> dst(dst_size);
auto* dst_data = static_cast<uint8_t*>(dst.request().ptr);

for (int c = 0; c < channels; ++c) {
const double top_left = src[(y0 * src_w + x0) * channels + c];
const double top_right = src[(y0 * src_w + x1) * channels + c];
const double bottom_left = src[(y1 * src_w + x0) * channels + c];
const double bottom_right = src[(y1 * src_w + x1) * channels + c];
#ifdef __SSE2__
bilinear_resize_sse2(src_data, src_w, src_h, channels, dst_data, dst_w, dst_h);
#else
bilinear_resize_scalar(src_data, src_w, src_h, channels, dst_data, dst_w, dst_h);
#endif

const double top = top_left + fx * (top_right - top_left);
const double bottom = bottom_left + fx * (bottom_right - bottom_left);
const double value = top + fy * (bottom - top);

dst[(dy * dst_w + dx) * channels + c] =
static_cast<uint8_t>(std::clamp(value, 0.0, 255.0));
}
}
}
return dst;
}

Expand All @@ -106,14 +242,15 @@ std::pair<int, int> fit_dimensions(int src_w, int src_h, int max_w, int max_h)

} // namespace fast_resize


// NOLINTNEXTLINE(readability-identifier-length)
PYBIND11_MODULE(fast_resize, m) {
m.doc() = "Performance-critical image resize operations in C++";

m.def("bilinear_resize", &fast_resize::bilinear_resize,
py::arg("src"), py::arg("src_w"), py::arg("src_h"),
py::arg("channels"), py::arg("dst_w"), py::arg("dst_h"),
"Bilinear interpolation resize for 8-bit RGB/RGBA pixel buffers.");
"Bilinear interpolation resize for 8-bit RGB/RGBA pixel buffers.\n\n"
"Accepts and returns NumPy uint8 arrays for zero-copy I/O.");

m.def("fit_dimensions", &fast_resize::fit_dimensions,
py::arg("src_w"), py::arg("src_h"),
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "image-processing-service"
version = "1.1.0"
version = "1.2.0"
description = "High-performance image processing microservice with Clean Architecture"
requires-python = ">=3.11"
dependencies = [
Expand Down
4 changes: 4 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ class Settings(BaseSettings):
processing_max_workers: int = 4
thumbnail_max_size: int = 256

# ── Cache ─────────────────────────────────────────────────────────
cache_ttl_seconds: int = 60
cache_max_size: int = 1024

# ── Retention ────────────────────────────────────────────────────────
retention_batch_size: int = 100

Expand Down
Empty file.
Loading
Loading