microsoft · edgchen1 · Jan 27, 2026 · Jan 27, 2026 · Jan 28, 2026 · Jan 28, 2026
diff --git a/.github/workflows/linux_minimal_build.yml b/.github/workflows/linux_minimal_build.yml
@@ -530,93 +530,43 @@ jobs:
             --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF
 
   # Job 7: Extended minimal build with NNAPI EP for Android(arm64-v8a) and skip tests.
-  # NOTE: Keeping this as direct docker run due to custom volume mounts needed for Android SDK/NDK
   build_extended_minimal_android:
     name: 7. Build Extended Minimal (Android NNAPI)
-    needs: build_full_ort # Depends on Job 1 for test data
     runs-on: [
         "self-hosted",
         "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU",
         "JobId=build_extended_minimal_android-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}"
         ]
-    permissions: # Permissions needed for build-docker-image
-      contents: read
-      packages: write
-      id-token: write # If using OIDC for ACR login
     steps:
       - name: Checkout repository
         uses: actions/checkout@v6
         with:
           submodules: false
-      - uses: actions/setup-node@v6
-        with:
-          node-version: 20
-      - name: Download Test Data Artifact
-        uses: actions/download-artifact@v7
-        with:
-          name: test_data
-          path: ${{ runner.temp }}/.test_data/
-
-      - name: Get Docker Image using Action
-        uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9
-        id: build_docker_image_step
-        with:
-          dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
-          image-name: ghcr.io/microsoft/onnxruntime/onnxruntimecpubuildcix64
-          push: true
-          azure-container-registry-name: onnxruntimebuildcache
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Setup Android NDK
         uses: ./.github/actions/setup-android-ndk
         with:
           ndk-version: 28.0.13004108
           # Use default android-sdk-root if not specified
 
-      - name: Run Build 7 (Using docker run)
+      - name: Run Build 7
         shell: bash
         run: |
-          # Create the target dir for build output inside the runner's temp dir first
-          mkdir -p ${{ runner.temp }}/7
-
-          # Ensure ANDROID_NDK_HOME is available and get its real path
-          if [ -z "$ANDROID_NDK_HOME" ]; then
-            echo "ANDROID_NDK_HOME is not set."
-            exit 1
-          fi
-          NDK_HOME_REALPATH=$(realpath $ANDROID_NDK_HOME)
-
-          # Ensure ANDROID_HOME is available
-          if [ -z "$ANDROID_HOME" ]; then
-            echo "ANDROID_HOME is not set. Using default /usr/local/lib/android/sdk"
-            export ANDROID_HOME=/usr/local/lib/android/sdk
-          fi
-
-          docker run --rm \
-            --volume ${{ env.BUILD_SOURCES_DIRECTORY }}:/onnxruntime_src \
-            --volume ${{ runner.temp }}:/build \
-            --volume $ANDROID_HOME:/android_home \
-            --volume $NDK_HOME_REALPATH:/ndk_home \
-            -e ALLOW_RELEASED_ONNX_OPSET_ONLY=1 \
-            -e NIGHTLY_BUILD=1 -e RUNNER_TEMP=/build \
-            ${{ steps.build_docker_image_step.outputs.full-image-name }} \
-            bash -c "python3 -m pip install -r /onnxruntime_src/tools/ci_build/requirements/pybind/requirements.txt \
-            && python3 /onnxruntime_src/tools/ci_build/build.py \
-              --build_dir /build/7 \
+          python3 ./tools/ci_build/build.py \
+              --build_dir ./build.extended_minimal.nnapi \
               --cmake_generator Ninja \
               --config MinSizeRel \
               --skip_submodule_sync \
               --parallel --use_binskim_compliant_compile_flags \
               --android \
-              --android_sdk_path /android_home \
-              --android_ndk_path /ndk_home \
+              --android_sdk_path "$ANDROID_HOME" \
+              --android_ndk_path "$ANDROID_NDK_HOME" \
               --android_abi=arm64-v8a \
               --android_api=29 \
               --use_nnapi \
               --minimal_build extended \
               --build_shared_lib \
               --disable_ml_ops \
               --disable_exceptions \
-              --skip_tests"
+              --skip_tests
         working-directory: ${{ env.BUILD_SOURCES_DIRECTORY }}
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -10,6 +10,24 @@ cmake_policy(SET CMP0104 OLD)
 # Project
 project(onnxruntime C CXX ASM)
 
+# Set C/C++ standard versions
+if (NOT CMAKE_C_STANDARD)
+  # Needed for Java
+  set(CMAKE_C_STANDARD 99)
+endif()
+
+if (NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 20)
+endif()
+
+# We don't use C++20 modules yet.
+# There are some known issues to address first:
+# - Android builds from Linux Docker containers have trouble finding clang-scan-deps.
+# - The MSVC /permissive option is needed for compiling some of the CUDA EP code which uses CUTLASS.
+#   This option is not compatible with C++20 modules.
+# So we will skip module scanning for now.
+set(CMAKE_CXX_SCAN_FOR_MODULES OFF)
+
 # Disable fast-math for Intel oneAPI compiler
 if("${CMAKE_CXX_COMPILER_ID}" MATCHES "IntelLLVM")
   if("${CMAKE_CXX_COMPILER_ID}" MATCHES "MSVC-like")
@@ -21,11 +39,6 @@ if("${CMAKE_CXX_COMPILER_ID}" MATCHES "IntelLLVM")
   endif()
 endif()
 
-# Needed for Java
-if (NOT CMAKE_CXX_STANDARD)
-  set(CMAKE_C_STANDARD 99)
-endif()
-
 include(CheckCXXCompilerFlag)
 include(CheckLanguage)
 include(CMakeDependentOption)
@@ -34,15 +47,6 @@ include(CheckFunctionExists)
 include(CheckSymbolExists)
 include(GNUInstallDirs) # onnxruntime_providers_* require CMAKE_INSTALL_* variables
 
-if (NOT CMAKE_CXX_STANDARD)
-  # TODO: update this once all system adapt c++20
-  if (CMAKE_SYSTEM_NAME STREQUAL "Darwin")
-    set(CMAKE_CXX_STANDARD 20)
-  else()
-    set(CMAKE_CXX_STANDARD 17)
-  endif()
-endif()
-
 if (MSVC)
   #  Make sure Visual Studio sets __cplusplus macro correctly: https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus")
@@ -1476,7 +1480,7 @@ configure_file(onnxruntime_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime_c
 get_property(onnxruntime_GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
 
 if (onnxruntime_USE_CUDA)
-  set(CMAKE_CUDA_STANDARD 17)
+  set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
   if(onnxruntime_CUDA_HOME)
     file(TO_CMAKE_PATH ${onnxruntime_CUDA_HOME} CUDAToolkit_ROOT)
   endif()

diff --git a/cmake/external/cutlass.cmake b/cmake/external/cutlass.cmake
@@ -4,7 +4,7 @@ onnxruntime_fetchcontent_declare(
   URL ${DEP_URL_cutlass}
   URL_HASH SHA1=${DEP_SHA1_cutlass}
   EXCLUDE_FROM_ALL
-PATCH_COMMAND ${Patch_EXECUTABLE} --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/cutlass/cutlass_4.2.1.patch
+  PATCH_COMMAND ${Patch_EXECUTABLE} --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/cutlass/cutlass_4.2.1.patch
 )
 
 FetchContent_GetProperties(cutlass)

diff --git a/cmake/onnxruntime_config.h.in b/cmake/onnxruntime_config.h.in
@@ -20,6 +20,7 @@
 #cmakedefine HAS_PARENTHESES
 #cmakedefine HAS_REALLOCARRAY
 #cmakedefine HAS_SHORTEN_64_TO_32
+#cmakedefine HAS_STRINGOP_OVERFLOW
 #cmakedefine HAS_TAUTOLOGICAL_POINTER_COMPARE
 #cmakedefine HAS_UNUSED_BUT_SET_PARAMETER
 #cmakedefine HAS_UNUSED_BUT_SET_VARIABLE

diff --git a/cmake/onnxruntime_fuzz_test.cmake b/cmake/onnxruntime_fuzz_test.cmake
@@ -60,7 +60,7 @@ if (onnxruntime_FUZZ_ENABLED)
     # compile the executables
     onnxruntime_add_executable(onnxruntime_security_fuzz ${SEC_FUZ_SRC})
 
-    # compile with c++17
+    # compile with at least c++17
     target_compile_features(onnxruntime_security_fuzz PUBLIC cxx_std_17)
 
     # Security fuzzing engine header file reference

diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
@@ -149,6 +149,17 @@
     onnxruntime_add_shared_library_module(onnxruntime_providers_cuda ${onnxruntime_providers_cuda_all_srcs})
   endif()
 
+  if (MSVC)
+    # Use /permissive to work around compilation error from CUTLASS header cute/tensor.hpp:
+    #   cutlass-src\include\cute\stride.hpp(299,46): error C3545: 'Ints': parameter pack expects a non-type
+    #     template argument
+    # See https://github.com/NVIDIA/cutlass/issues/3065
+    target_compile_options(onnxruntime_providers_cuda PRIVATE
+      "$<$<COMPILE_LANGUAGE:CXX>:/permissive>"
+      "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /permissive>"
+    )
+  endif()
+
   if(WIN32)
     # FILE_NAME preprocessor definition is used in onnxruntime_providers_cuda.rc
     target_compile_definitions(onnxruntime_providers_cuda PRIVATE FILE_NAME=\"onnxruntime_providers_cuda.dll\")
@@ -180,6 +191,11 @@
       target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">")
     endif()
 
+    # suppress warnings like this:
+    #   cutlass-src\include\cute/arch/mma_sm120.hpp(3128): error #177-D: variable "tidA" was declared but never
+    #     referenced
+    target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=177>")
+
     # Since CUDA 12.8, compiling diagnostics become stricter
     if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
       target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--static-global-template-stub=false>")
@@ -281,7 +297,6 @@
       target_compile_definitions(${target} PRIVATE COMPILE_HOPPER_TMA_GEMMS)
       if (MSVC)
         target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /bigobj>")
-        target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=177>")
         target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4172>")
       endif()
     endif()

diff --git a/cmake/patches/cutlass/cutlass_4.2.1.patch b/cmake/patches/cutlass/cutlass_4.2.1.patch
@@ -11,6 +11,21 @@ index cb161369..2fdff179 100644
                           [&](auto init, auto i){
                             if constexpr (is_constant_v<0, decltype(get<i>(flat_stride))>) { return append(init, i); }
                             else                                                           { return init;            }
+diff --git a/include/cutlass/cuda_host_adapter.hpp b/include/cutlass/cuda_host_adapter.hpp
+index a8af62be..22e7332d 100644
+--- a/include/cutlass/cuda_host_adapter.hpp
++++ b/include/cutlass/cuda_host_adapter.hpp
+@@ -394,6 +394,10 @@ protected:
+    * Fills a buffer in Global Memory with a byte sequence copied from host memory.
+    * This function can be overridden to dispatch to the appropriate cuMemsetD*Async API
+   */
++  // Patching to work around this error:
++  //   include\cutlass/cuda_host_adapter.hpp(414): error #20011-D: calling a __host__ function("memsetDeviceImpl")
++  //     from a __host__ __device__ function("memsetDevice") is not allowed
++  CUTLASS_HOST_DEVICE
+   virtual Status memsetDeviceImpl(
+     void* destination, ///< Device memory pointer to be filled
+     void const* fill_value, ///< Value to be filled in the buffer
 diff --git a/include/cutlass/exmy_base.h b/include/cutlass/exmy_base.h
 index be207a49..6028e01d 100644
 --- a/include/cutlass/exmy_base.h

diff --git a/cmake/winml.cmake b/cmake/winml.cmake
@@ -316,8 +316,7 @@ if (onnxruntime_WINML_NAMESPACE_OVERRIDE STREQUAL "Windows")
   target_compile_definitions(winml_adapter PRIVATE "BUILD_INBOX=1")
 endif()
 
-# will requires C++17
-set_target_properties(winml_adapter PROPERTIES CXX_STANDARD 17)
+set_target_properties(winml_adapter PROPERTIES CXX_STANDARD 20)
 set_target_properties(winml_adapter PROPERTIES CXX_STANDARD_REQUIRED ON)
 
 # Compiler definitions
@@ -645,7 +644,7 @@ onnxruntime_add_static_library(winml_lib_common
   ${winml_lib_common_dir}/CommonDeviceHelpers.cpp
 )
 
-set_target_properties(winml_lib_common PROPERTIES CXX_STANDARD 17)
+set_target_properties(winml_lib_common PROPERTIES CXX_STANDARD 20)
 set_target_properties(winml_lib_common PROPERTIES CXX_STANDARD_REQUIRED ON)
 target_compile_options(winml_lib_common PRIVATE /GR- /await /bigobj /wd4238)
 target_link_libraries(winml_lib_common PRIVATE ${WIL_TARGET})
@@ -829,9 +828,9 @@ if (winml_is_inbox)
     target_link_libraries(${new_target} PRIVATE ${link_libraries})
     target_link_options(${new_target} PRIVATE ${link_options})
 
-    # Attempt to copy linker flags 
+    # Attempt to copy linker flags
     get_target_property(link_flags ${target} LINK_FLAGS)
-    
+
     if (NOT link_flags MATCHES ".*NOTFOUND")
       set_property(TARGET ${new_target} PROPERTY LINK_FLAGS "${link_flags}")
     endif()

diff --git a/cmake/winml_unittests.cmake b/cmake/winml_unittests.cmake
@@ -19,7 +19,7 @@ set(WINML_TEST_INC_DIR
 function(set_winml_target_properties target)
   set_target_properties(${target} PROPERTIES
     FOLDER "ONNXRuntimeTest/winml"
-    CXX_STANDARD 17
+    CXX_STANDARD 20
     CXX_STANDARD_REQUIRED YES
     CXX_EXTENSIONS NO
   )

diff --git a/include/onnxruntime/core/common/gpu_profiler_common.h b/include/onnxruntime/core/common/gpu_profiler_common.h
@@ -379,8 +379,8 @@ class GPUProfilerBase : public EpProfiler {
   void MergeEvents(std::map<uint64_t, Events>& events_to_merge, Events& events) {
     Events merged_events;
 
-    auto event_iter = std::make_move_iterator(events.begin());
-    auto event_end = std::make_move_iterator(events.end());
+    auto event_iter = events.begin();
+    auto event_end = events.end();
     for (auto& map_iter : events_to_merge) {
       if (map_iter.second.empty()) {
         continue;
@@ -395,7 +395,7 @@ class GPUProfilerBase : public EpProfiler {
               (event_iter->ts == ts &&
                (event_iter + 1) != event_end &&
                (event_iter + 1)->ts == ts))) {
-        merged_events.emplace_back(*event_iter);
+        merged_events.emplace_back(*std::make_move_iterator(event_iter));
         ++event_iter;
       }
 
@@ -409,7 +409,7 @@ class GPUProfilerBase : public EpProfiler {
         copy_op_names = true;
         op_name = event_iter->args["op_name"];
         parent_name = event_iter->name;
-        merged_events.emplace_back(*event_iter);
+        merged_events.emplace_back(*std::make_move_iterator(event_iter));
         ++event_iter;
       }
 
@@ -428,7 +428,9 @@ class GPUProfilerBase : public EpProfiler {
     }
 
     // move any remaining events
-    merged_events.insert(merged_events.end(), event_iter, event_end);
+    merged_events.insert(merged_events.end(),
+                         std::make_move_iterator(event_iter),
+                         std::make_move_iterator(event_end));
     std::swap(events, merged_events);
   }
 

diff --git a/include/onnxruntime/core/common/logging/logging.h b/include/onnxruntime/core/common/logging/logging.h
@@ -17,7 +17,6 @@
 #include "core/common/logging/macros.h"
 #include "core/common/logging/severity.h"
 #include "core/common/logging/sink_types.h"
-#include "date/date.h"
 
 /*
 
@@ -56,43 +55,30 @@
 namespace onnxruntime {
 namespace logging {
 
-using Timestamp = std::chrono::time_point<std::chrono::system_clock>;
-
-// C++20 has operator<< in std::chrono for Timestamp type but mac builds need additional checks
-// to ensure usage is valid.
-// TODO: As we enable C++20 on other platforms we may need similar checks.
-// define a temporary value to determine whether to use the std::chrono or date implementation.
-#define ORT_USE_CXX20_STD_CHRONO __cplusplus >= 202002L
-
-// Apply constraints for mac builds
-#if __APPLE__
-#include <TargetConditionals.h>
-
-// Catalyst check must be first as it has both TARGET_OS_MACCATALYST and TARGET_OS_MAC set
-#if TARGET_OS_MACCATALYST
-// maccatalyst requires version 16.3
-#if (defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED < 160300)
-#undef ORT_USE_CXX20_STD_CHRONO
-#endif
+// This class wraps `std::chrono::system_clock::time_point` and provides `operator<<`.
+// It is a workaround for the inconsistent availability of `std::chrono::operator<<` for
+// `std::chrono::system_clock::time_point`.
+// When all builds support `std::chrono::operator<<`, we can simplify to this:
+//   `using Timestamp = std::chrono::system_clock::time_point;`
+class Timestamp {
+ public:
+  using TimePoint = std::chrono::system_clock::time_point;
+  Timestamp(const TimePoint& time_point) noexcept : time_point_{time_point} {}
 
-#elif TARGET_OS_MAC
-// Xcode added support for C++20's std::chrono::operator<< in SDK version 14.4,
-// but the target macOS version must also be >= 13.3 for it to be used.
-#if (defined(__MAC_OS_X_VERSION_MAX_ALLOWED) && __MAC_OS_X_VERSION_MAX_ALLOWED < 140400) || \
-    (defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED < 130300)
-#undef ORT_USE_CXX20_STD_CHRONO
-#endif
+  friend std::ostream& operator<<(std::ostream& os, const Timestamp& time_stamp) {
+    return time_stamp.WriteToStream(os);
+  }
 
-#endif
-#endif  // __APPLE__
+  friend std::wostream& operator<<(std::wostream& os, const Timestamp& time_stamp) {
+    return time_stamp.WriteToWStream(os);
+  }
 
-#if ORT_USE_CXX20_STD_CHRONO
-namespace timestamp_ns = std::chrono;
-#else
-namespace timestamp_ns = ::date;
-#endif
+ private:
+  std::ostream& WriteToStream(std::ostream& os) const;
+  std::wostream& WriteToWStream(std::wostream& os) const;
 
-#undef ORT_USE_CXX20_STD_CHRONO
+  TimePoint time_point_{};
+};
 
 #ifndef NDEBUG
 ORT_ATTRIBUTE_UNUSED static bool vlog_enabled = true;  // Set directly based on your needs.