From 9fe502a83378139a5aa3422b8ccb2648fecdc718 Mon Sep 17 00:00:00 2001 From: David Gidwani Date: Tue, 10 Feb 2026 22:59:06 -0500 Subject: [PATCH 1/6] fix(build): only enable SIMDE_BACKEND for non-x86 architectures (#253) - SIMDE_BACKEND was unconditionally enabled for all vectorscan builds, which disables native x86 CPU feature detection and caps performance at SSE2 level - on x86-64, this caused a ~2.5-13x throughput regression vs v0.7.21 because vectorscan's runtime dispatch to SSE4.2/AVX2/AVX512 code paths was completely bypassed - now only enables SIMDE_BACKEND on ARM and other non-x86 architectures where vectorscan genuinely needs the SIMD emulation layer - add benchmark script for reproducing and validating the regression --- CMakeLists.txt | 16 +++- tools/bench_regression.py | 195 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 206 insertions(+), 5 deletions(-) create mode 100644 tools/bench_regression.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 69dc629..0177afe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -261,16 +261,22 @@ if(HS_BUILD_REQUIRED) else() set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") - # Architecture-specific compiler flags + # Architecture-specific compiler flags and SIMDE_BACKEND selection. + # SIMDE_BACKEND is only enabled for non-x86 architectures (ARM, etc.) + # where vectorscan has no native SIMD support. On x86-64, the native + # backend provides runtime CPU feature detection (SSE4.2/AVX2/AVX512) + # which is critical for performance. Enabling SIMDE_BACKEND on x86-64 + # disables all higher ISA code paths and caps performance at SSE2 + # level (~10-15x slower). See: https://github.com/darvid/python-hyperscan/issues/253 + set(HS_USE_SIMDE_BACKEND OFF) if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|aarch64|arm64)") - # ARM architecture - use conservative flags with SIMDE_BACKEND set(HS_CMAKE_COMMON_FLAGS "-fPIC") + set(HS_USE_SIMDE_BACKEND ON) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86|X86|amd64|AMD64|x86_64|i[3-6]86)") - # x86/x86_64 architecture - use compatible x86-64 baseline set(HS_CMAKE_COMMON_FLAGS "-march=x86-64 -fPIC") else() - # Other architectures - rely on SIMDE_BACKEND for portability set(HS_CMAKE_COMMON_FLAGS "-fPIC") + set(HS_USE_SIMDE_BACKEND ON) endif() @@ -447,7 +453,7 @@ if(HS_BUILD_REQUIRED) if(USE_VECTORSCAN) list( APPEND HS_CMAKE_ARGS - -DSIMDE_BACKEND=ON + -DSIMDE_BACKEND=${HS_USE_SIMDE_BACKEND} -DRAGEL=${RAGEL_EXECUTABLE} -DPCRE_BUILD_SOURCE=ON -DBUILD_STATIC_LIBS=ON diff --git a/tools/bench_regression.py b/tools/bench_regression.py new file mode 100644 index 0000000..e614f1b --- /dev/null +++ b/tools/bench_regression.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python +"""Benchmark to reproduce performance regression reported in #253. + +Simulates the reporter's workload: 50 patterns scanning 500KB documents +in block mode. Reports throughput (MB/s) and average time per scan. + +Usage: + python tools/bench_regression.py + python tools/bench_regression.py --patterns 100 --doc-size 1048576 +""" + +import argparse +import os +import random +import statistics +import string +import time + +import hyperscan + + +def generate_patterns(count): + """Generate realistic regex patterns for benchmarking.""" + templates = [ + rb"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", + rb"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", + rb"\b(https?|ftp)://[^\s/$.?#].[^\s]*\b", + rb"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b", + rb"\b[A-Z][a-z]+\s[A-Z][a-z]+\b", + rb"[0-9a-fA-F]{32}", + rb"\b(error|warning|critical|fatal)\b", + rb"\b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\b", + rb"<[^>]+>", + rb"\$\d+[\.,]?\d*", + ] + keyword_bases = [ + b"password", b"secret", b"token", b"api.key", + b"authorization", b"credential", b"private", + b"admin", b"root", b"config", b"database", + b"server", b"client", b"session", b"cookie", + b"header", b"payload", b"request", b"response", + b"encrypt", b"decrypt", b"hash", b"salt", + b"certificate", b"key.file", b"login", b"logout", + b"access", b"permission", b"role", b"user", + b"account", b"profile", b"setting", b"option", + b"enable", b"disable", b"start", b"stop", + b"create", b"delete", b"update", b"select", + ] + + patterns = [] + for i in range(count): + if i < len(templates): + patterns.append(templates[i]) + else: + base = keyword_bases[i % len(keyword_bases)] + suffix = str(i).encode() + patterns.append(rb"\b" + base + suffix + rb"\b") + return patterns + + +def generate_document(size): + """Generate a synthetic document of approximately the given size.""" + words = [ + "the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog", + "server", "error", "warning", "request", "response", "data", + "user", "admin", "config", "session", "token", "password", + "https://example.com/path", "192.168.1.100", "test@email.com", + "2025-01-15T10:30:00", "$1,234.56", "abcdef1234567890abcdef", + "authorization", "credential", "certificate", "encrypted", + ] + rng = random.Random(42) + chunks = [] + total = 0 + while total < size: + line_words = rng.choices(words, k=rng.randint(5, 20)) + line = " ".join(line_words) + "\n" + chunks.append(line) + total += len(line) + return "".join(chunks)[:size].encode("utf-8") + + +def run_benchmark(db, document, num_scans, warmup=3): + """Run the benchmark and return per-scan times.""" + match_count = 0 + + def on_match(id, start, end, flags, ctx): + nonlocal match_count + match_count += 1 + + # warmup + for _ in range(warmup): + db.scan(document, match_event_handler=on_match) + + match_count = 0 + times = [] + for _ in range(num_scans): + t0 = time.perf_counter() + db.scan(document, match_event_handler=on_match) + t1 = time.perf_counter() + times.append(t1 - t0) + + return times, match_count + + +def main(): + parser = argparse.ArgumentParser( + description="Benchmark for hyperscan regression #253" + ) + parser.add_argument( + "--patterns", type=int, default=50, + help="Number of regex patterns (default: 50)", + ) + parser.add_argument( + "--doc-size", type=int, default=500_000, + help="Document size in bytes (default: 500000)", + ) + parser.add_argument( + "--scans", type=int, default=100, + help="Number of scans to perform (default: 100)", + ) + parser.add_argument( + "--warmup", type=int, default=5, + help="Number of warmup scans (default: 5)", + ) + args = parser.parse_args() + + print("=" * 60) + print("hyperscan regression benchmark (#253)") + print("=" * 60) + + db_info = hyperscan.Database(mode=hyperscan.HS_MODE_BLOCK) + patterns = generate_patterns(args.patterns) + db_info.compile( + expressions=patterns, + ids=list(range(len(patterns))), + flags=[hyperscan.HS_FLAG_CASELESS | hyperscan.HS_FLAG_SINGLEMATCH] + * len(patterns), + ) + + print(f"engine info: {db_info.info().decode()}") + print(f"database size: {db_info.size():,} bytes") + print(f"pattern count: {args.patterns}") + print(f"document size: {args.doc_size:,} bytes") + print(f"scan iterations: {args.scans}") + print(f"warmup scans: {args.warmup}") + print() + + document = generate_document(args.doc_size) + + print("running benchmark...") + times, match_count = run_benchmark( + db_info, document, args.scans, args.warmup + ) + + avg_time = statistics.mean(times) + median_time = statistics.median(times) + stdev_time = statistics.stdev(times) if len(times) > 1 else 0 + min_time = min(times) + max_time = max(times) + doc_mb = args.doc_size / (1024 * 1024) + throughput_avg = doc_mb / avg_time if avg_time > 0 else float("inf") + throughput_median = ( + doc_mb / median_time if median_time > 0 else float("inf") + ) + + print() + print("-" * 60) + print("results") + print("-" * 60) + print(f"total matches: {match_count:,}") + print(f"avg time/scan: {avg_time * 1000:.3f} ms") + print(f"median time/scan: {median_time * 1000:.3f} ms") + print(f"min time/scan: {min_time * 1000:.3f} ms") + print(f"max time/scan: {max_time * 1000:.3f} ms") + print(f"stdev: {stdev_time * 1000:.3f} ms") + print(f"throughput (avg): {throughput_avg:.1f} MB/s") + print(f"throughput (median):{throughput_median:.1f} MB/s") + print() + + if avg_time * 1000 > 10: + print("!! REGRESSION DETECTED !!") + print( + f"avg scan time {avg_time*1000:.1f}ms is way above the " + f"expected ~3ms baseline from v0.7.21" + ) + print( + "likely cause: SIMDE_BACKEND=ON forcing SSE2-only code " + "paths on x86-64" + ) + else: + print("performance looks healthy") + + +if __name__ == "__main__": + main() From 3b9a0d2e32c7c7034285e5b7cce9ce7456167533 Mon Sep 17 00:00:00 2001 From: David Gidwani Date: Tue, 10 Feb 2026 23:01:48 -0500 Subject: [PATCH 2/6] ci(build): replace deprecated macos-13 runners with macos-15 - GitHub deprecated macos-13 (Intel) runners - macOS x86_64 wheels are now cross-compiled on ARM runners via Rosetta 2, which cibuildwheel handles natively --- .github/workflows/build.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 24dd812..536b0c9 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -337,28 +337,28 @@ jobs: python_id: cp314t platform_id: musllinux_aarch64 - # 🍎 macOS x86_64 - - os: macos-13 + # 🍎 macOS x86_64 (cross-compiled on ARM runner via Rosetta 2) + - os: macos-15 host_python: "3.12" python_id: cp310 platform_id: macosx_x86_64 - - os: macos-13 + - os: macos-15 host_python: "3.12" python_id: cp311 platform_id: macosx_x86_64 - - os: macos-13 + - os: macos-15 host_python: "3.12" python_id: cp312 platform_id: macosx_x86_64 - - os: macos-13 + - os: macos-15 host_python: "3.12" python_id: cp313 platform_id: macosx_x86_64 - - os: macos-13 + - os: macos-15 host_python: "3.12" python_id: cp314 platform_id: macosx_x86_64 - - os: macos-13 + - os: macos-15 host_python: "3.12" python_id: cp314t platform_id: macosx_x86_64 From ecfa54350ef607af37bebec99c347b33832d6bef Mon Sep 17 00:00:00 2001 From: David Gidwani Date: Tue, 10 Feb 2026 23:14:19 -0500 Subject: [PATCH 3/6] build: patch vectorscan x86-64-v2 march for older GCC compat - vectorscan 5.4.12 uses -march=x86-64-v2 in cflags-x86.cmake and archdetect.cmake, but GCC <11 (manylinux2014 devtoolset) does not recognize this value - patch source at build time to use -march=nehalem which provides the same SSE4.2 baseline and is supported by all GCC versions - only applied when using native x86 backend (not SIMDE_BACKEND) --- CMakeLists.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0177afe..e03b50f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -478,6 +478,20 @@ if(HS_BUILD_REQUIRED) set(HS_TARGETS --target hs --target hs_runtime --target chimera --target pcre) endif() + # Vectorscan 5.4.12 uses -march=x86-64-v2 in cflags-x86.cmake and + # archdetect.cmake, but GCC <11 (e.g. manylinux2014 devtoolset) does + # not recognize this value. Patch it to use "nehalem" which provides + # the same SSE4.2 baseline and is supported by all GCC versions. + if(USE_VECTORSCAN AND NOT HS_USE_SIMDE_BACKEND) + set(HS_PATCH_COMMAND + sed -i "s/x86-64-v2/nehalem/g" + ${hyperscan_SOURCE_DIR}/cmake/cflags-x86.cmake + ${hyperscan_SOURCE_DIR}/cmake/archdetect.cmake + ) + else() + set(HS_PATCH_COMMAND "") + endif() + ExternalProject_Add( libhs GIT_REPOSITORY ${HYPERSCAN_REPO} @@ -489,6 +503,7 @@ if(HS_BUILD_REQUIRED) SOURCE_DIR ${hyperscan_SOURCE_DIR} BINARY_DIR ${hyperscan_BINARY_DIR} STAMP_DIR ${hyperscan_STAMP_DIR} + PATCH_COMMAND ${HS_PATCH_COMMAND} INSTALL_COMMAND "" CMAKE_GENERATOR ${HS_GENERATOR} CMAKE_ARGS ${HS_CMAKE_ARGS} -Wno-dev From 1e4045cc4f7538898c867652ed4069737430e98d Mon Sep 17 00:00:00 2001 From: David Gidwani Date: Tue, 10 Feb 2026 23:45:39 -0500 Subject: [PATCH 4/6] build: fix macOS x86_64 cross-compilation on ARM runners MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - use CMAKE_OSX_ARCHITECTURES (target arch) instead of CMAKE_SYSTEM_PROCESSOR (host arch) for SIMDE_BACKEND decision on macOS, so cross-compiling x86_64 on ARM correctly disables SIMDE and builds native x86 vectorscan - forward CMAKE_OSX_ARCHITECTURES to ExternalProject_Add so vectorscan builds for the correct target architecture - handle BSD sed -i syntax difference on macOS for the x86-64-v2 → nehalem patch --- CMakeLists.txt | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e03b50f..d14dc2c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -268,11 +268,19 @@ if(HS_BUILD_REQUIRED) # which is critical for performance. Enabling SIMDE_BACKEND on x86-64 # disables all higher ISA code paths and caps performance at SSE2 # level (~10-15x slower). See: https://github.com/darvid/python-hyperscan/issues/253 + # + # For macOS cross-compilation (e.g. building x86_64 on ARM runner), + # CMAKE_OSX_ARCHITECTURES reflects the TARGET arch and takes priority + # over CMAKE_SYSTEM_PROCESSOR (which reflects the HOST). set(HS_USE_SIMDE_BACKEND OFF) - if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|aarch64|arm64)") + set(_HS_TARGET_ARCH "${CMAKE_SYSTEM_PROCESSOR}") + if(APPLE AND CMAKE_OSX_ARCHITECTURES) + set(_HS_TARGET_ARCH "${CMAKE_OSX_ARCHITECTURES}") + endif() + if(_HS_TARGET_ARCH MATCHES "(arm|aarch64|arm64)") set(HS_CMAKE_COMMON_FLAGS "-fPIC") set(HS_USE_SIMDE_BACKEND ON) - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86|X86|amd64|AMD64|x86_64|i[3-6]86)") + elseif(_HS_TARGET_ARCH MATCHES "(x86|X86|amd64|AMD64|x86_64|i[3-6]86)") set(HS_CMAKE_COMMON_FLAGS "-march=x86-64 -fPIC") else() set(HS_CMAKE_COMMON_FLAGS "-fPIC") @@ -450,6 +458,11 @@ if(HS_BUILD_REQUIRED) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HS_CMAKE_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HS_CMAKE_CXX_FLAGS}") + # Forward CMAKE_OSX_ARCHITECTURES to ExternalProject for cross-compilation + if(APPLE AND CMAKE_OSX_ARCHITECTURES) + list(APPEND HS_CMAKE_ARGS -DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}) + endif() + if(USE_VECTORSCAN) list( APPEND HS_CMAKE_ARGS @@ -483,11 +496,20 @@ if(HS_BUILD_REQUIRED) # not recognize this value. Patch it to use "nehalem" which provides # the same SSE4.2 baseline and is supported by all GCC versions. if(USE_VECTORSCAN AND NOT HS_USE_SIMDE_BACKEND) - set(HS_PATCH_COMMAND - sed -i "s/x86-64-v2/nehalem/g" - ${hyperscan_SOURCE_DIR}/cmake/cflags-x86.cmake - ${hyperscan_SOURCE_DIR}/cmake/archdetect.cmake - ) + if(APPLE) + # BSD sed requires -i '' for in-place editing + set(HS_PATCH_COMMAND + sed -i "" "s/x86-64-v2/nehalem/g" + ${hyperscan_SOURCE_DIR}/cmake/cflags-x86.cmake + ${hyperscan_SOURCE_DIR}/cmake/archdetect.cmake + ) + else() + set(HS_PATCH_COMMAND + sed -i "s/x86-64-v2/nehalem/g" + ${hyperscan_SOURCE_DIR}/cmake/cflags-x86.cmake + ${hyperscan_SOURCE_DIR}/cmake/archdetect.cmake + ) + endif() else() set(HS_PATCH_COMMAND "") endif() From 5a81c77df09adb725dc4c4e5e46be5e0977cab01 Mon Sep 17 00:00:00 2001 From: David Gidwani Date: Wed, 11 Feb 2026 00:04:20 -0500 Subject: [PATCH 5/6] build: use perl for x86-64-v2 patch to fix macOS sed compat - CMake's list handling drops empty string in sed -i "" causing BSD sed to fail with "rename(): No such file or directory" - perl -pi -e works identically on Linux and macOS --- CMakeLists.txt | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d14dc2c..44b3948 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -495,21 +495,13 @@ if(HS_BUILD_REQUIRED) # archdetect.cmake, but GCC <11 (e.g. manylinux2014 devtoolset) does # not recognize this value. Patch it to use "nehalem" which provides # the same SSE4.2 baseline and is supported by all GCC versions. + # Uses perl instead of sed to avoid BSD/GNU sed -i syntax differences. if(USE_VECTORSCAN AND NOT HS_USE_SIMDE_BACKEND) - if(APPLE) - # BSD sed requires -i '' for in-place editing - set(HS_PATCH_COMMAND - sed -i "" "s/x86-64-v2/nehalem/g" - ${hyperscan_SOURCE_DIR}/cmake/cflags-x86.cmake - ${hyperscan_SOURCE_DIR}/cmake/archdetect.cmake - ) - else() - set(HS_PATCH_COMMAND - sed -i "s/x86-64-v2/nehalem/g" - ${hyperscan_SOURCE_DIR}/cmake/cflags-x86.cmake - ${hyperscan_SOURCE_DIR}/cmake/archdetect.cmake - ) - endif() + set(HS_PATCH_COMMAND + perl -pi -e "s/x86-64-v2/nehalem/g" + ${hyperscan_SOURCE_DIR}/cmake/cflags-x86.cmake + ${hyperscan_SOURCE_DIR}/cmake/archdetect.cmake + ) else() set(HS_PATCH_COMMAND "") endif() From 3edefbd7b87757706c7baaad74e4f5458e530c75 Mon Sep 17 00:00:00 2001 From: David Gidwani Date: Wed, 11 Feb 2026 10:25:43 -0500 Subject: [PATCH 6/6] ci(build): pin uv to 0.9.x to fix Windows build failures - uv 0.10.2 leaks host Python 3.12 stdlib into cibuildwheel venvs on Windows, causing SRE module mismatch and import errors for non-3.12 Python targets --- .github/workflows/build.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 536b0c9..ed97f29 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -188,6 +188,7 @@ jobs: uses: astral-sh/setup-uv@v5 with: enable-cache: true + version: "0.9.x" - name: Remove project venv run: python -c "import pathlib, shutil; p = pathlib.Path('.venv'); shutil.rmtree(p) if p.exists() else None" @@ -425,6 +426,7 @@ jobs: uses: astral-sh/setup-uv@v5 with: enable-cache: true + version: "0.9.x" - name: Remove project venv run: python -c "import pathlib, shutil; p = pathlib.Path('.venv'); shutil.rmtree(p) if p.exists() else None"