From 06d61136980cff202493cfe13c7573b241d192be Mon Sep 17 00:00:00 2001 From: dmaman <22967202+thedonmon@users.noreply.github.com> Date: Sun, 8 Mar 2026 04:52:47 -0600 Subject: [PATCH 1/4] fix(cmake): enable FP16 NEON intrinsics on ARM64 GCC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ailego math kernels use FP16 NEON intrinsics (vfmaq_f16, vsubq_f16, vld1q_f16, etc.) which require the +fp16 architecture extension on GCC. Apple Clang enables FP16 by default on ARM64, but GCC does not — it requires explicit -march=armv8.X-a+fp16. This patch: - Updates _detect_armv8_best() to probe for +fp16 support and append it to the detected march flag when available - Updates all explicit ENABLE_ARMV8.X options to include +fp16 Without this fix, building on Linux ARM64 with GCC fails with: "target specific option mismatch" for FP16 NEON intrinsics in src/ailego/math/*_fp16.cc --- cmake/option.cmake | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/cmake/option.cmake b/cmake/option.cmake index 01388564..2fe73ce8 100644 --- a/cmake/option.cmake +++ b/cmake/option.cmake @@ -81,8 +81,19 @@ function(_detect_armv8_best) foreach(_ver IN LISTS _arm_flags) check_c_compiler_flag("-march=${_ver}" _COMP_SUPP_${_ver}) if(_COMP_SUPP_${_ver}) - _AppendFlags(CMAKE_C_FLAGS "-march=${_ver}") - _AppendFlags(CMAKE_CXX_FLAGS "-march=${_ver}") + # Check if compiler supports +fp16 extension (required for FP16 NEON + # intrinsics used in ailego math kernels). Apple Clang enables FP16 by + # default on ARM64; GCC requires the explicit +fp16 flag. + check_c_compiler_flag("-march=${_ver}+fp16" _COMP_SUPP_${_ver}_fp16) + if(_COMP_SUPP_${_ver}_fp16) + set(_march_flag "-march=${_ver}+fp16") + message(STATUS "ARM64: using ${_march_flag} (FP16 NEON enabled)") + else() + set(_march_flag "-march=${_ver}") + message(STATUS "ARM64: using ${_march_flag} (FP16 not supported)") + endif() + _AppendFlags(CMAKE_C_FLAGS "${_march_flag}") + _AppendFlags(CMAKE_CXX_FLAGS "${_march_flag}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" PARENT_SCOPE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" PARENT_SCOPE) return() @@ -179,27 +190,30 @@ if(NOT AUTO_DETECT_ARCH) endif() # ARM (newest first — allow multiple? usually only one) - # But GCC allows only one -march=, so honor highest enabled + # But GCC allows only one -march=, so honor highest enabled. + # Append +fp16 when supported — required for FP16 NEON intrinsics + # (vfmaq_f16, etc.) used in ailego math kernels. Apple Clang enables + # FP16 by default on ARM64; GCC requires the explicit +fp16 flag. if(ENABLE_ARMV8.6A) - add_arch_flag("-march=armv8.6-a" ARMV86A ENABLE_ARMV8.6A) + add_arch_flag("-march=armv8.6-a+fp16" ARMV86A ENABLE_ARMV8.6A) endif() if(ENABLE_ARMV8.5A) - add_arch_flag("-march=armv8.5-a" ARMV85A ENABLE_ARMV8.5A) + add_arch_flag("-march=armv8.5-a+fp16" ARMV85A ENABLE_ARMV8.5A) endif() if(ENABLE_ARMV8.4A) - add_arch_flag("-march=armv8.4-a" ARMV84A ENABLE_ARMV8.4A) + add_arch_flag("-march=armv8.4-a+fp16" ARMV84A ENABLE_ARMV8.4A) endif() if(ENABLE_ARMV8.3A) - add_arch_flag("-march=armv8.3-a" ARMV83A ENABLE_ARMV8.3A) + add_arch_flag("-march=armv8.3-a+fp16" ARMV83A ENABLE_ARMV8.3A) endif() if(ENABLE_ARMV8.2A) - add_arch_flag("-march=armv8.2-a" ARMV82A ENABLE_ARMV8.2A) + add_arch_flag("-march=armv8.2-a+fp16" ARMV82A ENABLE_ARMV8.2A) endif() if(ENABLE_ARMV8.1A) - add_arch_flag("-march=armv8.1-a" ARMV81A ENABLE_ARMV8.1A) + add_arch_flag("-march=armv8.1-a+fp16" ARMV81A ENABLE_ARMV8.1A) endif() if(ENABLE_ARMV8A) - add_arch_flag("-march=armv8-a" ARMV8A ENABLE_ARMV8A) + add_arch_flag("-march=armv8-a+fp16" ARMV8A ENABLE_ARMV8A) endif() else() From 68558feae94a0e487f89e2bb9669c36952af13f7 Mon Sep 17 00:00:00 2001 From: dmaman <22967202+thedonmon@users.noreply.github.com> Date: Sun, 8 Mar 2026 04:57:28 -0600 Subject: [PATCH 2/4] ci: add workflow to test ARM64 FP16 fix on all platforms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests the cmake +fp16 fix on: - linux-arm64 (ubuntu-24.04-arm) — the platform that was broken - linux-x64 (ubuntu-24.04) — regression check - macos-arm64 (macos-15) — regression check --- .github/workflows/test-arm64-fp16.yml | 138 ++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 .github/workflows/test-arm64-fp16.yml diff --git a/.github/workflows/test-arm64-fp16.yml b/.github/workflows/test-arm64-fp16.yml new file mode 100644 index 00000000..a0fcb341 --- /dev/null +++ b/.github/workflows/test-arm64-fp16.yml @@ -0,0 +1,138 @@ +name: Test ARM64 FP16 Fix + +on: + push: + branches: [fix/arm64-fp16-neon] + workflow_dispatch: + +permissions: + contents: read + +jobs: + build-arm64: + name: Build & Test (linux-arm64) + runs-on: ubuntu-24.04-arm + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install build tools + run: | + python -m pip install --upgrade pip \ + pybind11==3.0 \ + cmake==3.30.0 \ + ninja==1.11.1 \ + pytest \ + scikit-build-core \ + setuptools_scm + + - name: Build from source + run: | + NPROC=$(nproc) + echo "Building with $NPROC parallel jobs" + + CMAKE_GENERATOR="Unix Makefiles" \ + CMAKE_BUILD_PARALLEL_LEVEL="$NPROC" \ + python -m pip install -v . --no-build-isolation + + - name: Verify FP16 NEON compiled + run: | + echo "Checking that FP16 NEON objects were compiled..." + find build/ -name "*fp16*.o" -type f | head -20 + echo "Build completed successfully on ARM64 with FP16 NEON support" + + - name: Run C++ Tests + run: | + cd build + make unittest -j$(nproc) + + - name: Run Python Tests + run: python -m pytest python/tests/ + + build-x64: + name: Build & Test (linux-x64) + runs-on: ubuntu-24.04 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install build tools + run: | + python -m pip install --upgrade pip \ + pybind11==3.0 \ + cmake==3.30.0 \ + ninja==1.11.1 \ + pytest \ + scikit-build-core \ + setuptools_scm + + - name: Build from source + run: | + NPROC=$(nproc) + CMAKE_GENERATOR="Unix Makefiles" \ + CMAKE_BUILD_PARALLEL_LEVEL="$NPROC" \ + python -m pip install -v . --no-build-isolation + + - name: Run C++ Tests + run: | + cd build + make unittest -j$(nproc) + + - name: Run Python Tests + run: python -m pytest python/tests/ + + build-macos: + name: Build & Test (macos-arm64) + runs-on: macos-15 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install build tools + run: | + python -m pip install --upgrade pip \ + pybind11==3.0 \ + cmake==3.30.0 \ + ninja==1.11.1 \ + pytest \ + scikit-build-core \ + setuptools_scm + + - name: Build from source + run: | + NPROC=$(sysctl -n hw.ncpu) + CMAKE_GENERATOR="Unix Makefiles" \ + CMAKE_BUILD_PARALLEL_LEVEL="$NPROC" \ + python -m pip install -v . --no-build-isolation + + - name: Run C++ Tests + run: | + cd build + make unittest -j$(sysctl -n hw.ncpu) + + - name: Run Python Tests + run: python -m pytest python/tests/ From 2b5eb87e7cdc05467047f2182ec644d8b8135962 Mon Sep 17 00:00:00 2001 From: dmaman <22967202+thedonmon@users.noreply.github.com> Date: Sun, 8 Mar 2026 14:01:57 -0600 Subject: [PATCH 3/4] chore: remove test workflow (verification complete) --- .github/workflows/test-arm64-fp16.yml | 138 -------------------------- 1 file changed, 138 deletions(-) delete mode 100644 .github/workflows/test-arm64-fp16.yml diff --git a/.github/workflows/test-arm64-fp16.yml b/.github/workflows/test-arm64-fp16.yml deleted file mode 100644 index a0fcb341..00000000 --- a/.github/workflows/test-arm64-fp16.yml +++ /dev/null @@ -1,138 +0,0 @@ -name: Test ARM64 FP16 Fix - -on: - push: - branches: [fix/arm64-fp16-neon] - workflow_dispatch: - -permissions: - contents: read - -jobs: - build-arm64: - name: Build & Test (linux-arm64) - runs-on: ubuntu-24.04-arm - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - - name: Install build tools - run: | - python -m pip install --upgrade pip \ - pybind11==3.0 \ - cmake==3.30.0 \ - ninja==1.11.1 \ - pytest \ - scikit-build-core \ - setuptools_scm - - - name: Build from source - run: | - NPROC=$(nproc) - echo "Building with $NPROC parallel jobs" - - CMAKE_GENERATOR="Unix Makefiles" \ - CMAKE_BUILD_PARALLEL_LEVEL="$NPROC" \ - python -m pip install -v . --no-build-isolation - - - name: Verify FP16 NEON compiled - run: | - echo "Checking that FP16 NEON objects were compiled..." - find build/ -name "*fp16*.o" -type f | head -20 - echo "Build completed successfully on ARM64 with FP16 NEON support" - - - name: Run C++ Tests - run: | - cd build - make unittest -j$(nproc) - - - name: Run Python Tests - run: python -m pytest python/tests/ - - build-x64: - name: Build & Test (linux-x64) - runs-on: ubuntu-24.04 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - - name: Install build tools - run: | - python -m pip install --upgrade pip \ - pybind11==3.0 \ - cmake==3.30.0 \ - ninja==1.11.1 \ - pytest \ - scikit-build-core \ - setuptools_scm - - - name: Build from source - run: | - NPROC=$(nproc) - CMAKE_GENERATOR="Unix Makefiles" \ - CMAKE_BUILD_PARALLEL_LEVEL="$NPROC" \ - python -m pip install -v . --no-build-isolation - - - name: Run C++ Tests - run: | - cd build - make unittest -j$(nproc) - - - name: Run Python Tests - run: python -m pytest python/tests/ - - build-macos: - name: Build & Test (macos-arm64) - runs-on: macos-15 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - - name: Install build tools - run: | - python -m pip install --upgrade pip \ - pybind11==3.0 \ - cmake==3.30.0 \ - ninja==1.11.1 \ - pytest \ - scikit-build-core \ - setuptools_scm - - - name: Build from source - run: | - NPROC=$(sysctl -n hw.ncpu) - CMAKE_GENERATOR="Unix Makefiles" \ - CMAKE_BUILD_PARALLEL_LEVEL="$NPROC" \ - python -m pip install -v . --no-build-isolation - - - name: Run C++ Tests - run: | - cd build - make unittest -j$(sysctl -n hw.ncpu) - - - name: Run Python Tests - run: python -m pytest python/tests/ From 2071b7f3768d845cc19cc8e576c85da8441a310f Mon Sep 17 00:00:00 2001 From: dmaman <22967202+thedonmon@users.noreply.github.com> Date: Sun, 8 Mar 2026 14:14:27 -0600 Subject: [PATCH 4/4] fix(cmake): add probe-then-fallback for manual ARM +fp16 options MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review feedback: the manual ENABLE_ARMV8.X options were hardcoding +fp16 without a fallback, which would cause FATAL_ERROR on compilers that support the base arch but not +fp16 (older GCC, cross-compilation toolchains). Adds add_arch_flag_with_fp16 macro that probes for +fp16 support first, then falls back to the base -march flag — matching the same pattern used in _detect_armv8_best() for auto-detection. --- cmake/option.cmake | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/cmake/option.cmake b/cmake/option.cmake index 2fe73ce8..35393510 100644 --- a/cmake/option.cmake +++ b/cmake/option.cmake @@ -74,6 +74,20 @@ macro(add_arch_flag FLAG VAR_NAME OPTION_NAME) endif() endmacro() +# Like add_arch_flag but tries -march=BASE+fp16 first, falls back to -march=BASE. +# FP16 NEON intrinsics (vfmaq_f16, etc.) in ailego math kernels require +fp16 +# on GCC. Apple Clang enables FP16 by default on ARM64. +macro(add_arch_flag_with_fp16 BASE_ARCH VAR_NAME OPTION_NAME) + check_c_compiler_flag("-march=${BASE_ARCH}+fp16" COMPILER_SUPPORT_${VAR_NAME}_FP16) + if(COMPILER_SUPPORT_${VAR_NAME}_FP16) + add_arch_flag("-march=${BASE_ARCH}+fp16" ${VAR_NAME} ${OPTION_NAME}) + message(STATUS "ARM64: using -march=${BASE_ARCH}+fp16 (FP16 NEON enabled)") + else() + add_arch_flag("-march=${BASE_ARCH}" ${VAR_NAME} ${OPTION_NAME}) + message(STATUS "ARM64: using -march=${BASE_ARCH} (FP16 not supported by compiler)") + endif() +endmacro() + function(_detect_armv8_best) set(_arm_flags "armv8.6-a" "armv8.5-a" "armv8.4-a" "armv8.3-a" "armv8.2-a" "armv8.1-a" "armv8-a" "armv8" @@ -191,29 +205,28 @@ if(NOT AUTO_DETECT_ARCH) # ARM (newest first — allow multiple? usually only one) # But GCC allows only one -march=, so honor highest enabled. - # Append +fp16 when supported — required for FP16 NEON intrinsics - # (vfmaq_f16, etc.) used in ailego math kernels. Apple Clang enables - # FP16 by default on ARM64; GCC requires the explicit +fp16 flag. + # Try +fp16 first (required for FP16 NEON intrinsics in ailego math + # kernels), fall back to base flag if compiler doesn't support it. if(ENABLE_ARMV8.6A) - add_arch_flag("-march=armv8.6-a+fp16" ARMV86A ENABLE_ARMV8.6A) + add_arch_flag_with_fp16("armv8.6-a" ARMV86A ENABLE_ARMV8.6A) endif() if(ENABLE_ARMV8.5A) - add_arch_flag("-march=armv8.5-a+fp16" ARMV85A ENABLE_ARMV8.5A) + add_arch_flag_with_fp16("armv8.5-a" ARMV85A ENABLE_ARMV8.5A) endif() if(ENABLE_ARMV8.4A) - add_arch_flag("-march=armv8.4-a+fp16" ARMV84A ENABLE_ARMV8.4A) + add_arch_flag_with_fp16("armv8.4-a" ARMV84A ENABLE_ARMV8.4A) endif() if(ENABLE_ARMV8.3A) - add_arch_flag("-march=armv8.3-a+fp16" ARMV83A ENABLE_ARMV8.3A) + add_arch_flag_with_fp16("armv8.3-a" ARMV83A ENABLE_ARMV8.3A) endif() if(ENABLE_ARMV8.2A) - add_arch_flag("-march=armv8.2-a+fp16" ARMV82A ENABLE_ARMV8.2A) + add_arch_flag_with_fp16("armv8.2-a" ARMV82A ENABLE_ARMV8.2A) endif() if(ENABLE_ARMV8.1A) - add_arch_flag("-march=armv8.1-a+fp16" ARMV81A ENABLE_ARMV8.1A) + add_arch_flag_with_fp16("armv8.1-a" ARMV81A ENABLE_ARMV8.1A) endif() if(ENABLE_ARMV8A) - add_arch_flag("-march=armv8-a+fp16" ARMV8A ENABLE_ARMV8A) + add_arch_flag_with_fp16("armv8-a" ARMV8A ENABLE_ARMV8A) endif() else()