From 588463774b1144cef27061f7b9f55f58ef499c7b Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Tue, 27 Jan 2026 13:09:29 -0800
Subject: [PATCH 01/39] Update to C++20.

---
 cmake/CMakeLists.txt | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 42432041a8b01..6b17389f7f4bf 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -35,12 +35,7 @@ include(CheckSymbolExists)
 include(GNUInstallDirs) # onnxruntime_providers_* require CMAKE_INSTALL_* variables
 
 if (NOT CMAKE_CXX_STANDARD)
-  # TODO: update this once all system adapt c++20
-  if (CMAKE_SYSTEM_NAME STREQUAL "Darwin")
-    set(CMAKE_CXX_STANDARD 20)
-  else()
-    set(CMAKE_CXX_STANDARD 17)
-  endif()
+  set(CMAKE_CXX_STANDARD 20)
 endif()
 
 if (MSVC)

From 98c31aef4bd50e6ab8febce2e7df5feb92a1074f Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Tue, 27 Jan 2026 13:10:31 -0800
Subject: [PATCH 02/39] Fix ostream::operator<< usage with wchar_t*.

---
 onnxruntime/core/session/utils.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/core/session/utils.cc b/onnxruntime/core/session/utils.cc
index a354cf26368d4..7ad09e1a2cd5e 100644
--- a/onnxruntime/core/session/utils.cc
+++ b/onnxruntime/core/session/utils.cc
@@ -549,7 +549,7 @@ Status LoadPluginOrProviderBridge(const std::string& registration_name,
                                                             true,
                                                             ProviderLibraryPathType::Absolute);
   bool is_provider_bridge = provider_library->Load() == Status::OK();  // library has GetProvider
-  LOGS_DEFAULT(INFO) << "Loading EP library: " << library_path
+  LOGS_DEFAULT(INFO) << "Loading EP library: " << resolved_library_path
                      << (is_provider_bridge ? " as a provider bridge" : " as a plugin");
 
   // create EpLibraryPlugin to ensure CreateEpFactories and ReleaseEpFactory are available

From 9171d14ed45ae153e2c54cf7fb445fa396e2417e Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Tue, 27 Jan 2026 17:03:56 -0800
Subject: [PATCH 03/39] try to fix time_point output operator availability
 check

---
 .../onnxruntime/core/common/logging/logging.h | 30 ++-----------------
 1 file changed, 3 insertions(+), 27 deletions(-)

diff --git a/include/onnxruntime/core/common/logging/logging.h b/include/onnxruntime/core/common/logging/logging.h
index dc930ce52eaa9..38e3aa3725b71 100644
--- a/include/onnxruntime/core/common/logging/logging.h
+++ b/include/onnxruntime/core/common/logging/logging.h
@@ -58,33 +58,9 @@ namespace logging {
 
 using Timestamp = std::chrono::time_point<std::chrono::system_clock>;
 
-// C++20 has operator<< in std::chrono for Timestamp type but mac builds need additional checks
-// to ensure usage is valid.
-// TODO: As we enable C++20 on other platforms we may need similar checks.
-// define a temporary value to determine whether to use the std::chrono or date implementation.
-#define ORT_USE_CXX20_STD_CHRONO __cplusplus >= 202002L
-
-// Apply constraints for mac builds
-#if __APPLE__
-#include <TargetConditionals.h>
-
-// Catalyst check must be first as it has both TARGET_OS_MACCATALYST and TARGET_OS_MAC set
-#if TARGET_OS_MACCATALYST
-// maccatalyst requires version 16.3
-#if (defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED < 160300)
-#undef ORT_USE_CXX20_STD_CHRONO
-#endif
-
-#elif TARGET_OS_MAC
-// Xcode added support for C++20's std::chrono::operator<< in SDK version 14.4,
-// but the target macOS version must also be >= 13.3 for it to be used.
-#if (defined(__MAC_OS_X_VERSION_MAX_ALLOWED) && __MAC_OS_X_VERSION_MAX_ALLOWED < 140400) || \
-    (defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED < 130300)
-#undef ORT_USE_CXX20_STD_CHRONO
-#endif
-
-#endif
-#endif  // __APPLE__
+// C++20 has operator<< in std::chrono for Timestamp type but we need to check if it is available.
+// define temporary macro ORT_USE_CXX20_STD_CHRONO to determine whether to use the std::chrono or date implementation.
+#define ORT_USE_CXX20_STD_CHRONO __cpp_lib_chrono >= 201803L
 
 #if ORT_USE_CXX20_STD_CHRONO
 namespace timestamp_ns = std::chrono;

From 92b3334f0cd5b685501ae5baa59787f9063e80c6 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Tue, 27 Jan 2026 19:01:16 -0800
Subject: [PATCH 04/39] add back Apple version checks and handle iOS

---
 .../onnxruntime/core/common/logging/logging.h | 24 ++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/include/onnxruntime/core/common/logging/logging.h b/include/onnxruntime/core/common/logging/logging.h
index 38e3aa3725b71..329c0ed1bf8c5 100644
--- a/include/onnxruntime/core/common/logging/logging.h
+++ b/include/onnxruntime/core/common/logging/logging.h
@@ -58,10 +58,32 @@ namespace logging {
 
 using Timestamp = std::chrono::time_point<std::chrono::system_clock>;
 
-// C++20 has operator<< in std::chrono for Timestamp type but we need to check if it is available.
+// C++20 has operator<< in std::chrono for Timestamp type but we need to check if usage is valid.
 // define temporary macro ORT_USE_CXX20_STD_CHRONO to determine whether to use the std::chrono or date implementation.
 #define ORT_USE_CXX20_STD_CHRONO __cpp_lib_chrono >= 201803L
 
+// Apply constraints for Apple builds
+#if __APPLE__
+#include <TargetConditionals.h>
+
+// iOS check must be first as it also has TARGET_OS_MAC set
+#if TARGET_OS_IOS
+// iOS requires version 16.3
+#if (defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED < 160300)
+#undef ORT_USE_CXX20_STD_CHRONO
+#endif
+
+#elif TARGET_OS_MAC
+// Xcode added support for C++20's std::chrono::operator<< in SDK version 14.4,
+// but the target macOS version must also be >= 13.3 for it to be used.
+#if (defined(__MAC_OS_X_VERSION_MAX_ALLOWED) && __MAC_OS_X_VERSION_MAX_ALLOWED < 140400) || \
+    (defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED < 130300)
+#undef ORT_USE_CXX20_STD_CHRONO
+#endif
+
+#endif
+#endif  // __APPLE__
+
 #if ORT_USE_CXX20_STD_CHRONO
 namespace timestamp_ns = std::chrono;
 #else

From 831ae91ad7007826403f605cb8ce04bda29e1cc6 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Wed, 28 Jan 2026 20:34:16 -0800
Subject: [PATCH 05/39] make Timestamp a separate type and implement stream
 insertion operator for it

---
 .../onnxruntime/core/common/logging/logging.h | 26 +++++++++++++++----
 .../core/common/logging/sinks/ostream_sink.cc |  6 +----
 .../platform/apple/logging/apple_log_sink.mm  |  4 +--
 onnxruntime/test/common/logging/helpers.h     |  2 --
 .../test/util/include/test/capturing_sink.h   |  2 --
 5 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/include/onnxruntime/core/common/logging/logging.h b/include/onnxruntime/core/common/logging/logging.h
index 329c0ed1bf8c5..386a9421fc47e 100644
--- a/include/onnxruntime/core/common/logging/logging.h
+++ b/include/onnxruntime/core/common/logging/logging.h
@@ -56,9 +56,7 @@ struct OrtLogger;  // opaque API type. is always an instance of Logger
 namespace onnxruntime {
 namespace logging {
 
-using Timestamp = std::chrono::time_point<std::chrono::system_clock>;
-
-// C++20 has operator<< in std::chrono for Timestamp type but we need to check if usage is valid.
+// C++20 has std::chrono::operator<< for std::chrono::system_clock::time_point but we need to check if usage is valid.
 // define temporary macro ORT_USE_CXX20_STD_CHRONO to determine whether to use the std::chrono or date implementation.
 #define ORT_USE_CXX20_STD_CHRONO __cpp_lib_chrono >= 201803L
 
@@ -85,13 +83,31 @@ using Timestamp = std::chrono::time_point<std::chrono::system_clock>;
 #endif  // __APPLE__
 
 #if ORT_USE_CXX20_STD_CHRONO
-namespace timestamp_ns = std::chrono;
+namespace timestamp_stream_insertion_op_ns = std::chrono;
 #else
-namespace timestamp_ns = ::date;
+namespace timestamp_stream_insertion_op_ns = ::date;
 #endif
 
 #undef ORT_USE_CXX20_STD_CHRONO
 
+// This class wraps `std::chrono::system_clock::time_point` and provides `operator<<`.
+// It is a workaround for the inconsistent availability of `std::chrono::operator<<` for
+// `std::chrono::system_clock::time_point`.
+// When all builds support `std::chrono::operator<<`, we can simplify to this:
+//   `using Timestamp = std::chrono::system_clock::time_point;`
+class Timestamp {
+ public:
+  using TimePoint = std::chrono::system_clock::time_point;
+  Timestamp(const TimePoint& time_point) noexcept : time_point_{time_point} {}
+
+  friend std::ostream& operator<<(std::ostream& os, const Timestamp& time_stamp) {
+    return timestamp_stream_insertion_op_ns::operator<<(os, time_stamp.time_point_);
+  }
+
+ private:
+  TimePoint time_point_{};
+};
+
 #ifndef NDEBUG
 ORT_ATTRIBUTE_UNUSED static bool vlog_enabled = true;  // Set directly based on your needs.
 #else
diff --git a/onnxruntime/core/common/logging/sinks/ostream_sink.cc b/onnxruntime/core/common/logging/sinks/ostream_sink.cc
index 64441a2b20de2..f8fdb5e1906ed 100644
--- a/onnxruntime/core/common/logging/sinks/ostream_sink.cc
+++ b/onnxruntime/core/common/logging/sinks/ostream_sink.cc
@@ -23,9 +23,6 @@ struct Color {
 
 #ifndef _WIN32
 void OStreamSink::SendImpl(const Timestamp& timestamp, const std::string& logger_id, const Capture& message) {
-  // operator for formatting of timestamp in ISO8601 format including microseconds
-  using timestamp_ns::operator<<;
-
   // Two options as there may be multiple calls attempting to write to the same sink at once:
   // 1) Use mutex to synchronize access to the stream.
   // 2) Create the message in an ostringstream and output in one call.
@@ -45,8 +42,7 @@ void OStreamSink::SendImpl(const Timestamp& timestamp, const std::string& logger
   }
 #endif
 
-  timestamp_ns::operator<<(msg, timestamp);  // handle ambiguity with C++20 where date and std::chrono have operator<<
-  msg << " [" << message.SeverityPrefix() << ":" << message.Category() << ":" << logger_id << ", "
+  msg << timestamp << " [" << message.SeverityPrefix() << ":" << message.Category() << ":" << logger_id << ", "
       << message.Location().ToString() << "] " << message.Message();
 
 #ifndef ORT_MINIMAL_BUILD
diff --git a/onnxruntime/core/platform/apple/logging/apple_log_sink.mm b/onnxruntime/core/platform/apple/logging/apple_log_sink.mm
index 6abbe76a7f151..862ea0bf3c825 100644
--- a/onnxruntime/core/platform/apple/logging/apple_log_sink.mm
+++ b/onnxruntime/core/platform/apple/logging/apple_log_sink.mm
@@ -11,11 +11,9 @@
 namespace logging {
 
 void AppleLogSink::SendImpl(const Timestamp& timestamp, const std::string& logger_id, const Capture& message) {
-  using timestamp_ns::operator<<;
   std::ostringstream msg;
 
-  timestamp_ns::operator<<(msg, timestamp);  // handle ambiguity with C++20 where date and std::chrono have operator<<
-  msg << " [" << message.SeverityPrefix() << ":" << message.Category() << ":" << logger_id << ", "
+  msg << timestamp << " [" << message.SeverityPrefix() << ":" << message.Category() << ":" << logger_id << ", "
       << message.Location().ToString() << "] " << message.Message();
   NSLog(@"%s", msg.str().c_str());
 }
diff --git a/onnxruntime/test/common/logging/helpers.h b/onnxruntime/test/common/logging/helpers.h
index 0b623fe9ee09a..bf4d30184b7f6 100644
--- a/onnxruntime/test/common/logging/helpers.h
+++ b/onnxruntime/test/common/logging/helpers.h
@@ -39,8 +39,6 @@ class MockEtwSink : public ::onnxruntime::logging::ISink {
 #endif
 
 ACTION(PrintArgs) {
-  using onnxruntime::logging::timestamp_ns::operator<<;
-
   // const Timestamp &timestamp, const std::string &logger_id, const Message &message
   //                  arg0                          arg1                        arg2
   std::cout << arg1 << "@" << arg0 << " "
diff --git a/onnxruntime/test/util/include/test/capturing_sink.h b/onnxruntime/test/util/include/test/capturing_sink.h
index 7d978d1bd1e56..37e1aecabdf25 100644
--- a/onnxruntime/test/util/include/test/capturing_sink.h
+++ b/onnxruntime/test/util/include/test/capturing_sink.h
@@ -14,8 +14,6 @@ using namespace ::onnxruntime::logging;
 class CapturingSink : public logging::ISink {
  public:
   void SendImpl(const Timestamp& timestamp, const std::string& logger_id, const Capture& message) override {
-    // operator for formatting of timestamp in ISO8601 format including microseconds
-    using timestamp_ns::operator<<;
     std::ostringstream msg;
 
     msg << timestamp << " [" << message.SeverityPrefix() << ":" << message.Category() << ":" << logger_id << ", "

From 1e2ad83b6b147bae1bc9a11d241c7479b81d7c5f Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Thu, 29 Jan 2026 10:28:08 -0800
Subject: [PATCH 06/39] move date.h dependency into logging.cc

---
 .../onnxruntime/core/common/logging/logging.h | 39 ++----------------
 onnxruntime/core/common/logging/logging.cc    | 41 +++++++++++++++++++
 .../platform/posix/logging/syslog_sink.cc     |  2 -
 .../core/platform/windows/logging/etw_sink.h  |  1 -
 .../test/common/logging/logging_test.cc       |  2 -
 .../test/util/include/capturing_sink.h        |  4 --
 6 files changed, 44 insertions(+), 45 deletions(-)

diff --git a/include/onnxruntime/core/common/logging/logging.h b/include/onnxruntime/core/common/logging/logging.h
index 386a9421fc47e..ec6f5df38778d 100644
--- a/include/onnxruntime/core/common/logging/logging.h
+++ b/include/onnxruntime/core/common/logging/logging.h
@@ -17,7 +17,6 @@
 #include "core/common/logging/macros.h"
 #include "core/common/logging/severity.h"
 #include "core/common/logging/sink_types.h"
-#include "date/date.h"
 
 /*
 
@@ -56,40 +55,6 @@ struct OrtLogger;  // opaque API type. is always an instance of Logger
 namespace onnxruntime {
 namespace logging {
 
-// C++20 has std::chrono::operator<< for std::chrono::system_clock::time_point but we need to check if usage is valid.
-// define temporary macro ORT_USE_CXX20_STD_CHRONO to determine whether to use the std::chrono or date implementation.
-#define ORT_USE_CXX20_STD_CHRONO __cpp_lib_chrono >= 201803L
-
-// Apply constraints for Apple builds
-#if __APPLE__
-#include <TargetConditionals.h>
-
-// iOS check must be first as it also has TARGET_OS_MAC set
-#if TARGET_OS_IOS
-// iOS requires version 16.3
-#if (defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED < 160300)
-#undef ORT_USE_CXX20_STD_CHRONO
-#endif
-
-#elif TARGET_OS_MAC
-// Xcode added support for C++20's std::chrono::operator<< in SDK version 14.4,
-// but the target macOS version must also be >= 13.3 for it to be used.
-#if (defined(__MAC_OS_X_VERSION_MAX_ALLOWED) && __MAC_OS_X_VERSION_MAX_ALLOWED < 140400) || \
-    (defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED < 130300)
-#undef ORT_USE_CXX20_STD_CHRONO
-#endif
-
-#endif
-#endif  // __APPLE__
-
-#if ORT_USE_CXX20_STD_CHRONO
-namespace timestamp_stream_insertion_op_ns = std::chrono;
-#else
-namespace timestamp_stream_insertion_op_ns = ::date;
-#endif
-
-#undef ORT_USE_CXX20_STD_CHRONO
-
 // This class wraps `std::chrono::system_clock::time_point` and provides `operator<<`.
 // It is a workaround for the inconsistent availability of `std::chrono::operator<<` for
 // `std::chrono::system_clock::time_point`.
@@ -101,10 +66,12 @@ class Timestamp {
   Timestamp(const TimePoint& time_point) noexcept : time_point_{time_point} {}
 
   friend std::ostream& operator<<(std::ostream& os, const Timestamp& time_stamp) {
-    return timestamp_stream_insertion_op_ns::operator<<(os, time_stamp.time_point_);
+    return time_stamp.WriteToStream(os);
   }
 
  private:
+  std::ostream& WriteToStream(std::ostream& os) const;
+
   TimePoint time_point_{};
 };
 
diff --git a/onnxruntime/core/common/logging/logging.cc b/onnxruntime/core/common/logging/logging.cc
index a79e7300cffce..3f80ee4bcec50 100644
--- a/onnxruntime/core/common/logging/logging.cc
+++ b/onnxruntime/core/common/logging/logging.cc
@@ -28,8 +28,49 @@
 #include "logging.h"
 #endif
 
+// C++20 has std::chrono::operator<< for std::chrono::system_clock::time_point but we need to check if usage is valid.
+// define temporary macro ORT_USE_CXX20_STD_CHRONO to determine whether to use the std::chrono or date implementation.
+#define ORT_USE_CXX20_STD_CHRONO __cpp_lib_chrono >= 201803L
+
+// Apply constraints for Apple builds
+#if __APPLE__
+#include <TargetConditionals.h>
+
+// iOS check must be first as it also has TARGET_OS_MAC set
+#if TARGET_OS_IOS
+// iOS requires version 16.3
+#if (defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED < 160300)
+#undef ORT_USE_CXX20_STD_CHRONO
+#endif
+
+#elif TARGET_OS_MAC
+// Xcode added support for C++20's std::chrono::operator<< in SDK version 14.4,
+// but the target macOS version must also be >= 13.3 for it to be used.
+#if (defined(__MAC_OS_X_VERSION_MAX_ALLOWED) && __MAC_OS_X_VERSION_MAX_ALLOWED < 140400) || \
+    (defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED < 130300)
+#undef ORT_USE_CXX20_STD_CHRONO
+#endif
+
+#endif
+#endif  // __APPLE__
+
+#if ORT_USE_CXX20_STD_CHRONO
+namespace timestamp_stream_insertion_op_ns = std::chrono;
+#else
+#include "date/date.h"
+
+namespace timestamp_stream_insertion_op_ns = ::date;
+#endif
+
+#undef ORT_USE_CXX20_STD_CHRONO
+
 namespace onnxruntime {
 namespace logging {
+
+std::ostream& Timestamp::WriteToStream(std::ostream& os) const {
+  return timestamp_stream_insertion_op_ns::operator<<(os, time_point_);
+}
+
 const char* Category::onnxruntime = "onnxruntime";
 const char* Category::System = "System";
 
diff --git a/onnxruntime/core/platform/posix/logging/syslog_sink.cc b/onnxruntime/core/platform/posix/logging/syslog_sink.cc
index 9fbd26f093498..e5b60cf4742ef 100644
--- a/onnxruntime/core/platform/posix/logging/syslog_sink.cc
+++ b/onnxruntime/core/platform/posix/logging/syslog_sink.cc
@@ -4,7 +4,6 @@
 #include "core/common/logging/logging.h"
 #include "core/common/logging/capture.h"
 #include "syslog_sink.h"
-#include "date/date.h"
 
 namespace onnxruntime {
 namespace logging {
@@ -12,7 +11,6 @@ namespace logging {
 constexpr const char* SYSLOG_LEVEL = "76432";
 
 void SysLogSink::SendImpl(const Timestamp& timestamp, const std::string& logger_id, const Capture& message) {
-  using date::operator<<;
   std::stringstream msg;
 
   // syslog has it own timestamp but not as accurate as our timestamp. So we are going to keep both,
diff --git a/onnxruntime/core/platform/windows/logging/etw_sink.h b/onnxruntime/core/platform/windows/logging/etw_sink.h
index 62b762886ca82..d0c08a2144c20 100644
--- a/onnxruntime/core/platform/windows/logging/etw_sink.h
+++ b/onnxruntime/core/platform/windows/logging/etw_sink.h
@@ -16,7 +16,6 @@
 
 #ifdef ETW_TRACE_LOGGING_SUPPORTED
 
-#include <date/date.h>
 #include <atomic>
 #include <iostream>
 #include <string>
diff --git a/onnxruntime/test/common/logging/logging_test.cc b/onnxruntime/test/common/logging/logging_test.cc
index d3af022f83e86..8e1817e777d9e 100644
--- a/onnxruntime/test/common/logging/logging_test.cc
+++ b/onnxruntime/test/common/logging/logging_test.cc
@@ -14,8 +14,6 @@
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma warning(disable : 26400)
 #endif
-// if we pull in the whole 'testing' namespace we get warnings from date.h as both use '_' in places.
-// to avoid that we explicitly pull in the pieces we are using
 using testing::Eq;
 using testing::Field;
 using testing::Ge;
diff --git a/onnxruntime/test/util/include/capturing_sink.h b/onnxruntime/test/util/include/capturing_sink.h
index 39788947602df..37e1aecabdf25 100644
--- a/onnxruntime/test/util/include/capturing_sink.h
+++ b/onnxruntime/test/util/include/capturing_sink.h
@@ -6,8 +6,6 @@
 #include "core/common/logging/logging.h"
 #include "core/common/logging/isink.h"
 
-#include "date/date.h"
-
 namespace onnxruntime {
 namespace test {
 
@@ -16,8 +14,6 @@ using namespace ::onnxruntime::logging;
 class CapturingSink : public logging::ISink {
  public:
   void SendImpl(const Timestamp& timestamp, const std::string& logger_id, const Capture& message) override {
-    // operator for formatting of timestamp in ISO8601 format including microseconds
-    using date::operator<<;
     std::ostringstream msg;
 
     msg << timestamp << " [" << message.SeverityPrefix() << ":" << message.Category() << ":" << logger_id << ", "

From dad1617c8abaf4c3f847ba564298efcb1cc58ad0 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Thu, 29 Jan 2026 14:19:12 -0800
Subject: [PATCH 07/39] add exception for MSVC CUDA EP build for now...

---
 cmake/CMakeLists.txt | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 6b17389f7f4bf..8eca34f4f80e7 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -21,11 +21,6 @@ if("${CMAKE_CXX_COMPILER_ID}" MATCHES "IntelLLVM")
   endif()
 endif()
 
-# Needed for Java
-if (NOT CMAKE_CXX_STANDARD)
-  set(CMAKE_C_STANDARD 99)
-endif()
-
 include(CheckCXXCompilerFlag)
 include(CheckLanguage)
 include(CMakeDependentOption)
@@ -34,10 +29,6 @@ include(CheckFunctionExists)
 include(CheckSymbolExists)
 include(GNUInstallDirs) # onnxruntime_providers_* require CMAKE_INSTALL_* variables
 
-if (NOT CMAKE_CXX_STANDARD)
-  set(CMAKE_CXX_STANDARD 20)
-endif()
-
 if (MSVC)
   #  Make sure Visual Studio sets __cplusplus macro correctly: https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus")
@@ -249,6 +240,24 @@ option(onnxruntime_USE_OPENVINO_INTERFACE "Build ONNXRuntime shared lib which is
 option(onnxruntime_USE_VITISAI_INTERFACE "Build ONNXRuntime shared lib which is compatible with Vitis-AI EP interface" OFF)
 option(onnxruntime_USE_QNN_INTERFACE "Build ONNXRuntime shared lib which is compatible with QNN EP interface" OFF)
 
+# Set C/C++ standard versions
+if (NOT CMAKE_C_STANDARD)
+  # Needed for Java
+  set(CMAKE_C_STANDARD 99)
+endif()
+
+if (NOT CMAKE_CXX_STANDARD)
+  # TODO move all builds to C++20
+  if (MSVC AND onnxruntime_USE_CUDA)
+    # There's a compilation error from CUTLASS header "cute/tensor.hpp" when attempting to use C++20:
+    #   cutlass-src\include\cute\stride.hpp(299,46): error C3545: 'Ints': parameter pack expects a non-type template
+    #   argument
+    set(CMAKE_CXX_STANDARD 17)
+  else()
+    set(CMAKE_CXX_STANDARD 20)
+  endif()
+endif()
+
 if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 11.1)
   message(FATAL_ERROR  "GCC version must be greater than or equal to 11.1")
 endif()

From 7f34d3bf95466234b16a1a1adc5a674abb3c6a59 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Thu, 29 Jan 2026 18:48:11 -0800
Subject: [PATCH 08/39] add wostream overloads for Timestamp operator<<

---
 include/onnxruntime/core/common/logging/logging.h     | 5 +++++
 onnxruntime/core/common/logging/logging.cc            | 4 ++++
 onnxruntime/core/common/logging/sinks/ostream_sink.cc | 3 ---
 onnxruntime/core/session/inference_session.cc         | 2 +-
 4 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/include/onnxruntime/core/common/logging/logging.h b/include/onnxruntime/core/common/logging/logging.h
index ec6f5df38778d..4e70f0414cb70 100644
--- a/include/onnxruntime/core/common/logging/logging.h
+++ b/include/onnxruntime/core/common/logging/logging.h
@@ -69,8 +69,13 @@ class Timestamp {
     return time_stamp.WriteToStream(os);
   }
 
+  friend std::wostream& operator<<(std::wostream& os, const Timestamp& time_stamp) {
+    return time_stamp.WriteToWStream(os);
+  }
+
  private:
   std::ostream& WriteToStream(std::ostream& os) const;
+  std::wostream& WriteToWStream(std::wostream& os) const;
 
   TimePoint time_point_{};
 };
diff --git a/onnxruntime/core/common/logging/logging.cc b/onnxruntime/core/common/logging/logging.cc
index 3f80ee4bcec50..2f02199bde379 100644
--- a/onnxruntime/core/common/logging/logging.cc
+++ b/onnxruntime/core/common/logging/logging.cc
@@ -71,6 +71,10 @@ std::ostream& Timestamp::WriteToStream(std::ostream& os) const {
   return timestamp_stream_insertion_op_ns::operator<<(os, time_point_);
 }
 
+std::wostream& Timestamp::WriteToWStream(std::wostream& os) const {
+  return timestamp_stream_insertion_op_ns::operator<<(os, time_point_);
+}
+
 const char* Category::onnxruntime = "onnxruntime";
 const char* Category::System = "System";
 
diff --git a/onnxruntime/core/common/logging/sinks/ostream_sink.cc b/onnxruntime/core/common/logging/sinks/ostream_sink.cc
index f8fdb5e1906ed..1c4968502eabb 100644
--- a/onnxruntime/core/common/logging/sinks/ostream_sink.cc
+++ b/onnxruntime/core/common/logging/sinks/ostream_sink.cc
@@ -62,9 +62,6 @@ void OStreamSink::SendImpl(const Timestamp& timestamp, const std::string& logger
 }
 #else
 void WOStreamSink::SendImpl(const Timestamp& timestamp, const std::string& logger_id, const Capture& message) {
-  // operator for formatting of timestamp in ISO8601 format including microseconds
-  using date::operator<<;
-
   // Two options as there may be multiple calls attempting to write to the same sink at once:
   // 1) Use mutex to synchronize access to the stream.
   // 2) Create the message in an ostringstream and output in one call.
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 0944be87591e2..863ea76e7aa3d 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -113,7 +113,7 @@ inline const wchar_t* GetDateFormatString<wchar_t>() {
   return L"%Y-%m-%d_%H-%M-%S";
 }
 #endif
-// TODO: use LoggingManager::GetTimestamp and date::operator<<
+// TODO: use LoggingManager::GetTimestamp and operator<<
 // (see ostream_sink.cc for an example)
 // to simplify this and match the log file timestamp format.
 template <typename T>

From ad1aab8198f4be00bdeb9d2d63a62e888ee47567 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Fri, 30 Jan 2026 11:28:17 -0800
Subject: [PATCH 09/39] use std::filesystem::path::string instead of u8string in
 onnxruntime/core/providers/vitisai/imp/global_api.cc

---
 onnxruntime/core/providers/vitisai/imp/global_api.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc
index ec529c2ad1fc2..b74eb1cae4a16 100644
--- a/onnxruntime/core/providers/vitisai/imp/global_api.cc
+++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc
@@ -238,7 +238,7 @@ vaip_core::DllSafe<std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>> c
   if (s_library_vitisaiep.compile_onnx_model_vitisai_ep_v4) {
     Status status = Status::OK();
     auto status_ptr = reinterpret_cast<void*>(&status);
-    auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_v4(model_path.u8string(), graph_viewer.GetGraph(), options, status_ptr, change_status_with_error, logger), vaip_execution_provider_deletor);
+    auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_v4(model_path.string(), graph_viewer.GetGraph(), options, status_ptr, change_status_with_error, logger), vaip_execution_provider_deletor);
     if (!status.IsOK()) {
       ORT_THROW(status);
     }
@@ -246,7 +246,7 @@ vaip_core::DllSafe<std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>> c
   } else if (s_library_vitisaiep.compile_onnx_model_vitisai_ep_v3) {
     Status status = Status::OK();
     auto status_ptr = reinterpret_cast<void*>(&status);
-    auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_v3(model_path.u8string(), graph_viewer.GetGraph(), options, status_ptr, change_status_with_error), vaip_execution_provider_deletor);
+    auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_v3(model_path.string(), graph_viewer.GetGraph(), options, status_ptr, change_status_with_error), vaip_execution_provider_deletor);
     if (!status.IsOK()) {
       ORT_THROW(status);
     }
@@ -254,13 +254,13 @@ vaip_core::DllSafe<std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>> c
   } else if (s_library_vitisaiep.compile_onnx_model_vitisai_ep_with_error_handling) {
     Status status = Status::OK();
     auto status_ptr = reinterpret_cast<void*>(&status);
-    auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_with_error_handling(model_path.u8string(), graph_viewer.GetGraph(), options, status_ptr, change_status_with_error), vaip_execution_provider_deletor);
+    auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_with_error_handling(model_path.string(), graph_viewer.GetGraph(), options, status_ptr, change_status_with_error), vaip_execution_provider_deletor);
     if (!status.IsOK()) {
       ORT_THROW(status);
     }
     return ret;
   } else {
-    return vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_with_options(model_path.u8string(), graph_viewer.GetGraph(), options), vaip_execution_provider_deletor);
+    return vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_with_options(model_path.string(), graph_viewer.GetGraph(), options), vaip_execution_provider_deletor);
   }
 }
 
@@ -707,4 +707,4 @@ CreateExecutionProviderFromAnotherEp(const std::string& lib, const OrtSessionOpt
   std::ignore = provider->CreateIExecutionProvider(nullptr, nullptr, 0, const_cast<onnxruntime::ProviderOptions&>(provider_options), session_options, *((OrtLogger*)nullptr), ret);
 
   return ret;
-}
\ No newline at end of file
+}

From 848c8b1757d3ac9dde84542cc40f67ab515f2257 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Mon, 2 Feb 2026 11:05:12 -0800
Subject: [PATCH 10/39] don't build android from Docker in minimal build
 workflow

---
 .github/workflows/linux_minimal_build.yml | 62 +++--------------------
 1 file changed, 6 insertions(+), 56 deletions(-)

diff --git a/.github/workflows/linux_minimal_build.yml b/.github/workflows/linux_minimal_build.yml
index 7d481475e7ded..1f78f56c412a7 100644
--- a/.github/workflows/linux_minimal_build.yml
+++ b/.github/workflows/linux_minimal_build.yml
@@ -530,43 +530,18 @@ jobs:
             --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF
 
   # Job 7: Extended minimal build with NNAPI EP for Android(arm64-v8a) and skip tests.
-  # NOTE: Keeping this as direct docker run due to custom volume mounts needed for Android SDK/NDK
   build_extended_minimal_android:
     name: 7. Build Extended Minimal (Android NNAPI)
-    needs: build_full_ort # Depends on Job 1 for test data
     runs-on: [
         "self-hosted",
         "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU",
         "JobId=build_extended_minimal_android-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}"
         ]
-    permissions: # Permissions needed for build-docker-image
-      contents: read
-      packages: write
-      id-token: write # If using OIDC for ACR login
     steps:
       - name: Checkout repository
         uses: actions/checkout@v6
         with:
           submodules: false
-      - uses: actions/setup-node@v6
-        with:
-          node-version: 20
-      - name: Download Test Data Artifact
-        uses: actions/download-artifact@v7
-        with:
-          name: test_data
-          path: ${{ runner.temp }}/.test_data/
-
-      - name: Get Docker Image using Action
-        uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9
-        id: build_docker_image_step
-        with:
-          dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
-          image-name: ghcr.io/microsoft/onnxruntime/onnxruntimecpubuildcix64
-          push: true
-          azure-container-registry-name: onnxruntimebuildcache
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Setup Android NDK
         uses: ./.github/actions/setup-android-ndk
@@ -574,43 +549,18 @@ jobs:
           ndk-version: 28.0.13004108
           # Use default android-sdk-root if not specified
 
-      - name: Run Build 7 (Using docker run)
+      - name: Run Build 7
         shell: bash
         run: |
-          # Create the target dir for build output inside the runner's temp dir first
-          mkdir -p ${{ runner.temp }}/7
-
-          # Ensure ANDROID_NDK_HOME is available and get its real path
-          if [ -z "$ANDROID_NDK_HOME" ]; then
-            echo "ANDROID_NDK_HOME is not set."
-            exit 1
-          fi
-          NDK_HOME_REALPATH=$(realpath $ANDROID_NDK_HOME)
-
-          # Ensure ANDROID_HOME is available
-          if [ -z "$ANDROID_HOME" ]; then
-            echo "ANDROID_HOME is not set. Using default /usr/local/lib/android/sdk"
-            export ANDROID_HOME=/usr/local/lib/android/sdk
-          fi
-
-          docker run --rm \
-            --volume ${{ env.BUILD_SOURCES_DIRECTORY }}:/onnxruntime_src \
-            --volume ${{ runner.temp }}:/build \
-            --volume $ANDROID_HOME:/android_home \
-            --volume $NDK_HOME_REALPATH:/ndk_home \
-            -e ALLOW_RELEASED_ONNX_OPSET_ONLY=1 \
-            -e NIGHTLY_BUILD=1 -e RUNNER_TEMP=/build \
-            ${{ steps.build_docker_image_step.outputs.full-image-name }} \
-            bash -c "python3 -m pip install -r /onnxruntime_src/tools/ci_build/requirements/pybind/requirements.txt \
-            && python3 /onnxruntime_src/tools/ci_build/build.py \
-              --build_dir /build/7 \
+          python3 ./tools/ci_build/build.py \
+              --build_dir ./build.extended_minimal.nnapi \
               --cmake_generator Ninja \
               --config MinSizeRel \
               --skip_submodule_sync \
               --parallel --use_binskim_compliant_compile_flags \
               --android \
-              --android_sdk_path /android_home \
-              --android_ndk_path /ndk_home \
+              --android_sdk_path "$ANDROID_HOME" \
+              --android_ndk_path "$ANDROID_NDK_HOME" \
               --android_abi=arm64-v8a \
               --android_api=29 \
               --use_nnapi \
@@ -618,5 +568,5 @@ jobs:
               --build_shared_lib \
               --disable_ml_ops \
               --disable_exceptions \
-              --skip_tests"
+              --skip_tests
         working-directory: ${{ env.BUILD_SOURCES_DIRECTORY }}

From 2c78745f754793203ea34df6fde23e0a0a8bef8d Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Mon, 2 Feb 2026 13:17:51 -0800
Subject: [PATCH 11/39] disable C++20 when onnxruntime_USE_CUDA is enabled for
 all builds, update comment

---
 cmake/CMakeLists.txt | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 8eca34f4f80e7..03a37764169df 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -247,11 +247,24 @@ if (NOT CMAKE_C_STANDARD)
 endif()
 
 if (NOT CMAKE_CXX_STANDARD)
-  # TODO move all builds to C++20
-  if (MSVC AND onnxruntime_USE_CUDA)
-    # There's a compilation error from CUTLASS header "cute/tensor.hpp" when attempting to use C++20:
-    #   cutlass-src\include\cute\stride.hpp(299,46): error C3545: 'Ints': parameter pack expects a non-type template
-    #   argument
+  # TODO: enable C++20 for all builds
+  # set(CMAKE_CXX_STANDARD 20)
+
+  if (onnxruntime_USE_CUDA)
+    # Known issues when updating from C++17 to C++20:
+    # - MSVC + onnxruntime_USE_CUDA:
+    #   - Compilation error from CUTLASS header "cute/tensor.hpp" when attempting to use C++20:
+    #       cutlass-src\include\cute\stride.hpp(299,46): error C3545: 'Ints': parameter pack expects a non-type
+    #       template argument
+    # - GCC + onnxruntime_USE_CUDA:
+    #   - Compilation error from onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc when adding an element to
+    #     `std::vector<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig>` in
+    #     onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc:
+    #       /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/stl_construct.h:97:14: error: writing 1 byte into a
+    #       region of size 0 [-Werror=stringop-overflow=]
+    #   - Possibly a spurious warning
+    #
+    # When the CUDA EP becomes an independent plugin EP, we can keep building it with C++17 if needed.
     set(CMAKE_CXX_STANDARD 17)
   else()
     set(CMAKE_CXX_STANDARD 20)

From a035d7f0f194d4326198f343b40bd61f8c4bfa48 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Mon, 2 Feb 2026 13:21:02 -0800
Subject: [PATCH 12/39] update comment again

---
 cmake/CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 03a37764169df..3d3f19c829d85 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -264,7 +264,8 @@ if (NOT CMAKE_CXX_STANDARD)
     #       region of size 0 [-Werror=stringop-overflow=]
     #   - Possibly a spurious warning
     #
-    # When the CUDA EP becomes an independent plugin EP, we can keep building it with C++17 if needed.
+    # When the CUDA EP becomes an independent plugin EP, hopefully we can update all of onnxruntime to C++20.
+    # We can keep building the CUDA plugin EP with C++17 if needed.
     set(CMAKE_CXX_STANDARD 17)
   else()
     set(CMAKE_CXX_STANDARD 20)

From 06fec4fa4eb28a5e79fd1f21c3e89d83e2fb4515 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Tue, 3 Feb 2026 18:15:05 -0800
Subject: [PATCH 13/39] tweak comment

---
 cmake/CMakeLists.txt | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 3d3f19c829d85..6d8c09bea6f8e 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -253,13 +253,12 @@ if (NOT CMAKE_CXX_STANDARD)
   if (onnxruntime_USE_CUDA)
     # Known issues when updating from C++17 to C++20:
     # - MSVC + onnxruntime_USE_CUDA:
-    #   - Compilation error from CUTLASS header "cute/tensor.hpp" when attempting to use C++20:
+    #   - Compilation error from CUTLASS header cute/tensor.hpp:
     #       cutlass-src\include\cute\stride.hpp(299,46): error C3545: 'Ints': parameter pack expects a non-type
     #       template argument
     # - GCC + onnxruntime_USE_CUDA:
     #   - Compilation error from onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc when adding an element to
-    #     `std::vector<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig>` in
-    #     onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc:
+    #     `std::vector<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig>`:
     #       /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/stl_construct.h:97:14: error: writing 1 byte into a
     #       region of size 0 [-Werror=stringop-overflow=]
     #   - Possibly a spurious warning

From 7ec3538feff19a8f72d7ac18bc8276f02bbc1af6 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Mon, 9 Feb 2026 09:39:01 -0800
Subject: [PATCH 14/39] DEBUG: CMake generate tracing

---
 tools/ci_build/build.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index a0712af35e455..a2b361028ef36 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -364,7 +364,7 @@ def generate_build_tree(
 ):
     log.info("Generating CMake build tree")
     cmake_dir = os.path.join(source_dir, "cmake")
-    cmake_args = [cmake_path, cmake_dir]
+    cmake_args = [cmake_path, cmake_dir, "--trace-expand"]
     if not use_dev_mode(args):
         cmake_args += ["--compile-no-warning-as-error"]
 

From 79de47aabebec2562ff23fe9351c793f81db181b Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Mon, 9 Feb 2026 11:29:56 -0800
Subject: [PATCH 15/39] try disabling module scan for Android QNN

---
 .../ci_build/github/android/default_qnn_aar_build_settings.json  | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/ci_build/github/android/default_qnn_aar_build_settings.json b/tools/ci_build/github/android/default_qnn_aar_build_settings.json
index 5ac49f582d23e..a260a2be4a2c5 100644
--- a/tools/ci_build/github/android/default_qnn_aar_build_settings.json
+++ b/tools/ci_build/github/android/default_qnn_aar_build_settings.json
@@ -12,6 +12,7 @@
         "--build_java",
         "--build_shared_lib",
         "--use_qnn=static_lib",
+        "--cmake_extra_defines=CMAKE_CXX_SCAN_FOR_MODULES=OFF",
         "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF",
         "--skip_tests"
 

From 2ef0bdbf6472058228d844cffc9d9ff887143152 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Mon, 9 Feb 2026 11:30:39 -0800
Subject: [PATCH 16/39] Revert "DEBUG: CMake generate tracing"

This reverts commit 7ec3538feff19a8f72d7ac18bc8276f02bbc1af6.
---
 tools/ci_build/build.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index a2b361028ef36..a0712af35e455 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -364,7 +364,7 @@ def generate_build_tree(
 ):
     log.info("Generating CMake build tree")
     cmake_dir = os.path.join(source_dir, "cmake")
-    cmake_args = [cmake_path, cmake_dir, "--trace-expand"]
+    cmake_args = [cmake_path, cmake_dir]
     if not use_dev_mode(args):
         cmake_args += ["--compile-no-warning-as-error"]
 

From 4d22c087a90f123b43033f8b9ec3aa8c724092df Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Tue, 24 Feb 2026 12:50:58 -0800
Subject: [PATCH 17/39] add cutlass issue link

---
 cmake/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 50aa4ee84518d..ea2d33e42dd8b 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -258,7 +258,7 @@ if (NOT CMAKE_CXX_STANDARD)
   if (onnxruntime_USE_CUDA)
     # Known issues when updating from C++17 to C++20:
     # - MSVC + onnxruntime_USE_CUDA:
-    #   - Compilation error from CUTLASS header cute/tensor.hpp:
+    #   - Compilation error from CUTLASS header cute/tensor.hpp (https://github.com/NVIDIA/cutlass/issues/3065):
     #       cutlass-src\include\cute\stride.hpp(299,46): error C3545: 'Ints': parameter pack expects a non-type
     #       template argument
     # - GCC + onnxruntime_USE_CUDA:

From 282545bde1d2c4fbfc270d2c50c3d5cdc3559d6b Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Wed, 25 Feb 2026 09:20:24 -0800
Subject: [PATCH 18/39] move cmake code around so that options and setting of
 CMAKE_CXX_STANDARD are earlier

---
 cmake/CMakeLists.txt | 65 ++++++++++++++++++++++----------------------
 1 file changed, 33 insertions(+), 32 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index ea2d33e42dd8b..33bbbcfeeb088 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -10,39 +10,7 @@ cmake_policy(SET CMP0104 OLD)
 # Project
 project(onnxruntime C CXX ASM)
 
-# Disable fast-math for Intel oneAPI compiler
-if("${CMAKE_CXX_COMPILER_ID}" MATCHES "IntelLLVM")
-  if("${CMAKE_CXX_COMPILER_ID}" MATCHES "MSVC-like")
-    # Using icx-cl compiler driver with MSVC-like arguments
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:precise")
-  else()
-    # Using icpx compiler driver
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-fast-math")
-  endif()
-endif()
-
-include(CheckCXXCompilerFlag)
-include(CheckLanguage)
 include(CMakeDependentOption)
-include(FetchContent)
-include(CheckFunctionExists)
-include(CheckSymbolExists)
-include(GNUInstallDirs) # onnxruntime_providers_* require CMAKE_INSTALL_* variables
-
-if (MSVC)
-  #  Make sure Visual Studio sets __cplusplus macro correctly: https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus")
-endif()
-
-set_property(GLOBAL PROPERTY USE_FOLDERS ON)
-# NOTE: POSITION INDEPENDENT CODE hurts performance, and it only make sense on POSIX systems
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-
-enable_testing()
-if (NOT CMAKE_BUILD_TYPE)
-  message(STATUS "Build type not set - using RelWithDebInfo")
-  set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose build type: Debug Release RelWithDebInfo MinSizeRel." FORCE)
-endif()
 
 # Options
 option(onnxruntime_USE_VCPKG "Build with the vcpkg package manager" OFF)
@@ -276,6 +244,39 @@ if (NOT CMAKE_CXX_STANDARD)
   endif()
 endif()
 
+# Disable fast-math for Intel oneAPI compiler
+if("${CMAKE_CXX_COMPILER_ID}" MATCHES "IntelLLVM")
+  if("${CMAKE_CXX_COMPILER_ID}" MATCHES "MSVC-like")
+    # Using icx-cl compiler driver with MSVC-like arguments
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:precise")
+  else()
+    # Using icpx compiler driver
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-fast-math")
+  endif()
+endif()
+
+include(CheckCXXCompilerFlag)
+include(CheckLanguage)
+include(FetchContent)
+include(CheckFunctionExists)
+include(CheckSymbolExists)
+include(GNUInstallDirs) # onnxruntime_providers_* require CMAKE_INSTALL_* variables
+
+if (MSVC)
+  #  Make sure Visual Studio sets __cplusplus macro correctly: https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus")
+endif()
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+# NOTE: POSITION INDEPENDENT CODE hurts performance, and it only make sense on POSIX systems
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+enable_testing()
+if (NOT CMAKE_BUILD_TYPE)
+  message(STATUS "Build type not set - using RelWithDebInfo")
+  set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose build type: Debug Release RelWithDebInfo MinSizeRel." FORCE)
+endif()
+
 if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 11.1)
   message(FATAL_ERROR  "GCC version must be greater than or equal to 11.1")
 endif()

From 9c21b3fbb1e1bc996afc12a09726da4c6dd04cd5 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Mon, 2 Mar 2026 13:34:08 -0800
Subject: [PATCH 19/39] vitisai global_api.cc - use onnxruntime::ToUTF8String
 on model path

---
 onnxruntime/core/providers/vitisai/imp/global_api.cc | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc
index 0db57196c3c68..ad22187a75cd9 100644
--- a/onnxruntime/core/providers/vitisai/imp/global_api.cc
+++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc
@@ -12,6 +12,7 @@
 #endif
 #include "./vai_assert.h"
 
+#include "core/common/common.h"
 #include "core/common/exceptions.h"
 #include "core/framework/error_code_helper.h"
 #include "core/providers/shared/common.h"
@@ -233,12 +234,13 @@ void change_status_with_error(void* status_ptr, int error_code, const char* erro
 
 vaip_core::DllSafe<std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>> compile_onnx_model(
     const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::logging::Logger& logger, const onnxruntime::ProviderOptions& options) {
-  auto model_path = graph_viewer.ModelPath();
+  const auto model_path_string = onnxruntime::ToUTF8String(graph_viewer.ModelPath().native());
+
   auto vaip_execution_provider_deletor = s_library_vitisaiep.vaip_execution_provider_deletor;
   if (s_library_vitisaiep.compile_onnx_model_vitisai_ep_v4) {
     Status status = Status::OK();
     auto status_ptr = reinterpret_cast<void*>(&status);
-    auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_v4(model_path.string(), graph_viewer.GetGraph(), options, status_ptr, change_status_with_error, logger), vaip_execution_provider_deletor);
+    auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_v4(model_path_string, graph_viewer.GetGraph(), options, status_ptr, change_status_with_error, logger), vaip_execution_provider_deletor);
     if (!status.IsOK()) {
       ORT_THROW(status);
     }
@@ -246,7 +248,7 @@ vaip_core::DllSafe<std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>> c
   } else if (s_library_vitisaiep.compile_onnx_model_vitisai_ep_v3) {
     Status status = Status::OK();
     auto status_ptr = reinterpret_cast<void*>(&status);
-    auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_v3(model_path.string(), graph_viewer.GetGraph(), options, status_ptr, change_status_with_error), vaip_execution_provider_deletor);
+    auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_v3(model_path_string, graph_viewer.GetGraph(), options, status_ptr, change_status_with_error), vaip_execution_provider_deletor);
     if (!status.IsOK()) {
       ORT_THROW(status);
     }
@@ -254,13 +256,13 @@ vaip_core::DllSafe<std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>> c
   } else if (s_library_vitisaiep.compile_onnx_model_vitisai_ep_with_error_handling) {
     Status status = Status::OK();
     auto status_ptr = reinterpret_cast<void*>(&status);
-    auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_with_error_handling(model_path.string(), graph_viewer.GetGraph(), options, status_ptr, change_status_with_error), vaip_execution_provider_deletor);
+    auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_with_error_handling(model_path_string, graph_viewer.GetGraph(), options, status_ptr, change_status_with_error), vaip_execution_provider_deletor);
     if (!status.IsOK()) {
       ORT_THROW(status);
     }
     return ret;
   } else {
-    return vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_with_options(model_path.string(), graph_viewer.GetGraph(), options), vaip_execution_provider_deletor);
+    return vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_with_options(model_path_string, graph_viewer.GetGraph(), options), vaip_execution_provider_deletor);
   }
 }
 

From 554363655eb1c252dc733a9a4eae149ff9ba1ed4 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Wed, 4 Mar 2026 11:25:55 -0800
Subject: [PATCH 20/39] try C++20 for everything, pass /permissive to CUDA MSVC
 build

---
 cmake/CMakeLists.txt                   | 41 +++++++-------------------
 cmake/onnxruntime_providers_cuda.cmake |  8 +++++
 2 files changed, 18 insertions(+), 31 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index ec662d75c53bf..de16ca1ac5fbe 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -10,6 +10,16 @@ cmake_policy(SET CMP0104 OLD)
 # Project
 project(onnxruntime C CXX ASM)
 
+# Set C/C++ standard versions
+if (NOT CMAKE_C_STANDARD)
+  # Needed for Java
+  set(CMAKE_C_STANDARD 99)
+endif()
+
+if (NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 20)
+endif()
+
 include(CMakeDependentOption)
 
 # Options
@@ -213,37 +223,6 @@ option(onnxruntime_USE_OPENVINO_INTERFACE "Build ONNXRuntime shared lib which is
 option(onnxruntime_USE_VITISAI_INTERFACE "Build ONNXRuntime shared lib which is compatible with Vitis-AI EP interface" OFF)
 option(onnxruntime_USE_QNN_INTERFACE "Build ONNXRuntime shared lib which is compatible with QNN EP interface" OFF)
 
-# Set C/C++ standard versions
-if (NOT CMAKE_C_STANDARD)
-  # Needed for Java
-  set(CMAKE_C_STANDARD 99)
-endif()
-
-if (NOT CMAKE_CXX_STANDARD)
-  # TODO: enable C++20 for all builds
-  # set(CMAKE_CXX_STANDARD 20)
-
-  if (onnxruntime_USE_CUDA)
-    # Known issues when updating from C++17 to C++20:
-    # - MSVC + onnxruntime_USE_CUDA:
-    #   - Compilation error from CUTLASS header cute/tensor.hpp (https://github.com/NVIDIA/cutlass/issues/3065):
-    #       cutlass-src\include\cute\stride.hpp(299,46): error C3545: 'Ints': parameter pack expects a non-type
-    #       template argument
-    # - GCC + onnxruntime_USE_CUDA:
-    #   - Compilation error from onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc when adding an element to
-    #     `std::vector<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig>`:
-    #       /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/stl_construct.h:97:14: error: writing 1 byte into a
-    #       region of size 0 [-Werror=stringop-overflow=]
-    #   - Possibly a spurious warning
-    #
-    # When the CUDA EP becomes an independent plugin EP, hopefully we can update all of onnxruntime to C++20.
-    # We can keep building the CUDA plugin EP with C++17 if needed.
-    set(CMAKE_CXX_STANDARD 17)
-  else()
-    set(CMAKE_CXX_STANDARD 20)
-  endif()
-endif()
-
 # Disable fast-math for Intel oneAPI compiler
 if("${CMAKE_CXX_COMPILER_ID}" MATCHES "IntelLLVM")
   if("${CMAKE_CXX_COMPILER_ID}" MATCHES "MSVC-like")
diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index 94dba5bcec93c..41ba18769e688 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -149,6 +149,14 @@
     onnxruntime_add_shared_library_module(onnxruntime_providers_cuda ${onnxruntime_providers_cuda_all_srcs})
   endif()
 
+  if (MSVC)
+    # Use /permissive to work around compilation error from CUTLASS header cute/tensor.hpp:
+    #   cutlass-src\include\cute\stride.hpp(299,46): error C3545: 'Ints': parameter pack expects a non-type
+    #     template argument
+    # See https://github.com/NVIDIA/cutlass/issues/3065
+    target_compile_options(onnxruntime_providers_cuda PRIVATE "/permissive")
+  endif()
+
   if(WIN32)
     # FILE_NAME preprocessor definition is used in onnxruntime_providers_cuda.rc
     target_compile_definitions(onnxruntime_providers_cuda PRIVATE FILE_NAME=\"onnxruntime_providers_cuda.dll\")

From 1a8f8973772fa01b253b42bfd206204929186601 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Wed, 4 Mar 2026 13:47:49 -0800
Subject: [PATCH 21/39] try adding /permissive another way, update
 CUDA_STANDARD to 20 too

---
 cmake/CMakeLists.txt                   | 2 +-
 cmake/onnxruntime_providers_cuda.cmake | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index de16ca1ac5fbe..fd41fcbb1309c 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -1473,7 +1473,7 @@ configure_file(onnxruntime_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime_c
 get_property(onnxruntime_GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
 
 if (onnxruntime_USE_CUDA)
-  set(CMAKE_CUDA_STANDARD 17)
+  set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
   if(onnxruntime_CUDA_HOME)
     file(TO_CMAKE_PATH ${onnxruntime_CUDA_HOME} CUDAToolkit_ROOT)
   endif()
diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index 41ba18769e688..2cde0dd0dc269 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -154,7 +154,10 @@
     #   cutlass-src\include\cute\stride.hpp(299,46): error C3545: 'Ints': parameter pack expects a non-type
     #     template argument
     # See https://github.com/NVIDIA/cutlass/issues/3065
-    target_compile_options(onnxruntime_providers_cuda PRIVATE "/permissive")
+    target_compile_options(onnxruntime_providers_cuda PRIVATE
+      "$<$<COMPILE_LANGUAGE:CXX>:/permissive>"
+      #"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /permissive>"
+    )
   endif()
 
   if(WIN32)

From f2fcee0a6c7b82c4460a592f138647ca12026e4e Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Wed, 4 Mar 2026 14:18:26 -0800
Subject: [PATCH 22/39] add CUDA language /permissive too

---
 cmake/onnxruntime_providers_cuda.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index 2cde0dd0dc269..e12757f86daef 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -156,7 +156,7 @@
     # See https://github.com/NVIDIA/cutlass/issues/3065
     target_compile_options(onnxruntime_providers_cuda PRIVATE
       "$<$<COMPILE_LANGUAGE:CXX>:/permissive>"
-      #"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /permissive>"
+      "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /permissive>"
     )
   endif()
 

From 5873aee8d65de58b0aeb4acf1a399ec07e31d5d3 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Wed, 4 Mar 2026 15:02:59 -0800
Subject: [PATCH 23/39] change GridDim::maxThreadsPerBlock and
 GridDim::maxElementsPerThread to be static constexpr data members instead of
 enum values

---
 onnxruntime/core/providers/cuda/cu_inc/common.cuh | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/cu_inc/common.cuh b/onnxruntime/core/providers/cuda/cu_inc/common.cuh
index ec794b46d3f0e..cf85ffbc92fa1 100644
--- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh
+++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh
@@ -683,10 +683,8 @@ inline __host__ __device__ INT CeilDiv(INT a, INT2 b)  // ceil(a/b)
 }
 
 struct GridDim {
-  enum : CUDA_LONG {
-    maxThreadsPerBlock = 256,  // max threads per block
-    maxElementsPerThread = 4,  // max element processed per thread
-  };
+  static constexpr CUDA_LONG maxThreadsPerBlock = 256;  // max threads per block
+  static constexpr CUDA_LONG maxElementsPerThread = 4;  // max element processed per thread
 };
 
 // aligned vector generates vectorized load/store on CUDA

From 99f245fba90b4d704ed18f211705dd2b7a76e19b Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Wed, 4 Mar 2026 17:06:14 -0800
Subject: [PATCH 24/39] cutlass patch to work around error

---
 cmake/external/cutlass.cmake              |  2 +-
 cmake/patches/cutlass/cutlass_4.2.1.patch | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/cmake/external/cutlass.cmake b/cmake/external/cutlass.cmake
index df554269dfc7f..83d8a156b630f 100644
--- a/cmake/external/cutlass.cmake
+++ b/cmake/external/cutlass.cmake
@@ -4,7 +4,7 @@ onnxruntime_fetchcontent_declare(
   URL ${DEP_URL_cutlass}
   URL_HASH SHA1=${DEP_SHA1_cutlass}
   EXCLUDE_FROM_ALL
-PATCH_COMMAND ${Patch_EXECUTABLE} --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/cutlass/cutlass_4.2.1.patch
+  PATCH_COMMAND ${Patch_EXECUTABLE} --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/cutlass/cutlass_4.2.1.patch
 )
 
 FetchContent_GetProperties(cutlass)
diff --git a/cmake/patches/cutlass/cutlass_4.2.1.patch b/cmake/patches/cutlass/cutlass_4.2.1.patch
index 3a3ec5ba103ef..e955327a7a494 100644
--- a/cmake/patches/cutlass/cutlass_4.2.1.patch
+++ b/cmake/patches/cutlass/cutlass_4.2.1.patch
@@ -11,6 +11,20 @@ index cb161369..2fdff179 100644
                           [&](auto init, auto i){
                             if constexpr (is_constant_v<0, decltype(get<i>(flat_stride))>) { return append(init, i); }
                             else                                                           { return init;            }
+diff --git a/include/cutlass/cuda_host_adapter.hpp b/include/cutlass/cuda_host_adapter.hpp
+index a8af62be..74409054 100644
+--- a/include/cutlass/cuda_host_adapter.hpp
++++ b/include/cutlass/cuda_host_adapter.hpp
+@@ -394,6 +394,9 @@ protected:
+    * Fills a buffer in Global Memory with a byte sequence copied from host memory.
+    * This function can be overridden to dispatch to the appropriate cuMemsetD*Async API
+   */
++  // Patching to work around this error:
++  //   include\cutlass/cuda_host_adapter.hpp(414): error #20011-D: calling a __host__ function("memsetDeviceImpl") from a __host__ __device__ function("memsetDevice") is not allowed
++  CUTLASS_HOST_DEVICE
+   virtual Status memsetDeviceImpl(
+     void* destination, ///< Device memory pointer to be filled
+     void const* fill_value, ///< Value to be filled in the buffer
 diff --git a/include/cutlass/exmy_base.h b/include/cutlass/exmy_base.h
 index be207a49..6028e01d 100644
 --- a/include/cutlass/exmy_base.h

From 042b504c71c14716ed620bde28446c631985ec93 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Wed, 4 Mar 2026 18:19:08 -0800
Subject: [PATCH 25/39] move #177 warning suppression to CUDA 12.8+ section

---
 cmake/onnxruntime_providers_cuda.cmake | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index e12757f86daef..78be50473b56d 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -200,6 +200,10 @@
       endif()
       # skip diagnosis error caused by cuda header files
       target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=221>")
+
+      # suppress warnings like this:
+      #   cutlass-src\include\cute/arch/mma_sm120.hpp(3128): error #177-D: variable "tidA" was declared but never referenced
+      target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=177>")
     endif()
 
     if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
@@ -292,7 +296,6 @@
       target_compile_definitions(${target} PRIVATE COMPILE_HOPPER_TMA_GEMMS)
       if (MSVC)
         target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /bigobj>")
-        target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=177>")
         target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4172>")
       endif()
     endif()

From bacffa2f11186913d72287ffca4df1d480eacd03 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Wed, 4 Mar 2026 18:29:32 -0800
Subject: [PATCH 26/39] undo CMake code moving around

---
 cmake/CMakeLists.txt | 73 ++++++++++++++++++++++----------------------
 1 file changed, 36 insertions(+), 37 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index fd41fcbb1309c..234947470c9be 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -20,7 +20,43 @@ if (NOT CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 20)
 endif()
 
+# Disable fast-math for Intel oneAPI compiler
+if("${CMAKE_CXX_COMPILER_ID}" MATCHES "IntelLLVM")
+  if("${CMAKE_CXX_COMPILER_ID}" MATCHES "MSVC-like")
+    # Using icx-cl compiler driver with MSVC-like arguments
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:precise")
+  else()
+    # Using icpx compiler driver
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-fast-math")
+  endif()
+endif()
+
+include(CheckCXXCompilerFlag)
+include(CheckLanguage)
 include(CMakeDependentOption)
+include(FetchContent)
+include(CheckFunctionExists)
+include(CheckSymbolExists)
+include(GNUInstallDirs) # onnxruntime_providers_* require CMAKE_INSTALL_* variables
+
+if (MSVC)
+  #  Make sure Visual Studio sets __cplusplus macro correctly: https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus")
+
+  # Prevents CMake from injecting '#pragma system_header', which results in warnings being disabled in projects that
+  # use precompiled headers.
+  set(CMAKE_PCH_PROLOGUE "")
+endif()
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+# NOTE: POSITION INDEPENDENT CODE hurts performance, and it only make sense on POSIX systems
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+enable_testing()
+if (NOT CMAKE_BUILD_TYPE)
+  message(STATUS "Build type not set - using RelWithDebInfo")
+  set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose build type: Debug Release RelWithDebInfo MinSizeRel." FORCE)
+endif()
 
 # Options
 option(onnxruntime_USE_VCPKG "Build with the vcpkg package manager" OFF)
@@ -223,43 +259,6 @@ option(onnxruntime_USE_OPENVINO_INTERFACE "Build ONNXRuntime shared lib which is
 option(onnxruntime_USE_VITISAI_INTERFACE "Build ONNXRuntime shared lib which is compatible with Vitis-AI EP interface" OFF)
 option(onnxruntime_USE_QNN_INTERFACE "Build ONNXRuntime shared lib which is compatible with QNN EP interface" OFF)
 
-# Disable fast-math for Intel oneAPI compiler
-if("${CMAKE_CXX_COMPILER_ID}" MATCHES "IntelLLVM")
-  if("${CMAKE_CXX_COMPILER_ID}" MATCHES "MSVC-like")
-    # Using icx-cl compiler driver with MSVC-like arguments
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:precise")
-  else()
-    # Using icpx compiler driver
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-fast-math")
-  endif()
-endif()
-
-include(CheckCXXCompilerFlag)
-include(CheckLanguage)
-include(FetchContent)
-include(CheckFunctionExists)
-include(CheckSymbolExists)
-include(GNUInstallDirs) # onnxruntime_providers_* require CMAKE_INSTALL_* variables
-
-if (MSVC)
-  #  Make sure Visual Studio sets __cplusplus macro correctly: https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus")
-
-  # Prevents CMake from injecting '#pragma system_header', which results in warnings being disabled in projects that
-  # use precompiled headers.
-  set(CMAKE_PCH_PROLOGUE "")
-endif()
-
-set_property(GLOBAL PROPERTY USE_FOLDERS ON)
-# NOTE: POSITION INDEPENDENT CODE hurts performance, and it only make sense on POSIX systems
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-
-enable_testing()
-if (NOT CMAKE_BUILD_TYPE)
-  message(STATUS "Build type not set - using RelWithDebInfo")
-  set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose build type: Debug Release RelWithDebInfo MinSizeRel." FORCE)
-endif()
-
 if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 11.1)
   message(FATAL_ERROR  "GCC version must be greater than or equal to 11.1")
 endif()

From b4aca518206de3f027eca0737506427b3cf4db43 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Wed, 4 Mar 2026 18:39:37 -0800
Subject: [PATCH 27/39] break long line

---
 cmake/onnxruntime_providers_cuda.cmake | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index 78be50473b56d..3e7d43e66b690 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -202,7 +202,8 @@
       target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=221>")
 
       # suppress warnings like this:
-      #   cutlass-src\include\cute/arch/mma_sm120.hpp(3128): error #177-D: variable "tidA" was declared but never referenced
+      #   cutlass-src\include\cute/arch/mma_sm120.hpp(3128): error #177-D: variable "tidA" was declared but never
+      #     referenced
       target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=177>")
     endif()
 

From e6297ffe63fbc32a5e10f3f0a35e122e59e5fb1b Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Wed, 4 Mar 2026 18:42:39 -0800
Subject: [PATCH 28/39] update patch file for line length

---
 cmake/patches/cutlass/cutlass_4.2.1.patch | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmake/patches/cutlass/cutlass_4.2.1.patch b/cmake/patches/cutlass/cutlass_4.2.1.patch
index e955327a7a494..e5af67eb25fcf 100644
--- a/cmake/patches/cutlass/cutlass_4.2.1.patch
+++ b/cmake/patches/cutlass/cutlass_4.2.1.patch
@@ -20,7 +20,8 @@ index a8af62be..74409054 100644
     * This function can be overridden to dispatch to the appropriate cuMemsetD*Async API
    */
 +  // Patching to work around this error:
-+  //   include\cutlass/cuda_host_adapter.hpp(414): error #20011-D: calling a __host__ function("memsetDeviceImpl") from a __host__ __device__ function("memsetDevice") is not allowed
++  //   include\cutlass/cuda_host_adapter.hpp(414): error #20011-D: calling a __host__ function("memsetDeviceImpl")
++  //     from a __host__ __device__ function("memsetDevice") is not allowed
 +  CUTLASS_HOST_DEVICE
    virtual Status memsetDeviceImpl(
      void* destination, ///< Device memory pointer to be filled

From 6d7ff32dbc3bef5487d17029222a4c51b20be060 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Wed, 4 Mar 2026 19:00:22 -0800
Subject: [PATCH 29/39] fix patch file

---
 cmake/patches/cutlass/cutlass_4.2.1.patch | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/patches/cutlass/cutlass_4.2.1.patch b/cmake/patches/cutlass/cutlass_4.2.1.patch
index e5af67eb25fcf..6776eded10640 100644
--- a/cmake/patches/cutlass/cutlass_4.2.1.patch
+++ b/cmake/patches/cutlass/cutlass_4.2.1.patch
@@ -12,10 +12,10 @@ index cb161369..2fdff179 100644
                             if constexpr (is_constant_v<0, decltype(get<i>(flat_stride))>) { return append(init, i); }
                             else                                                           { return init;            }
 diff --git a/include/cutlass/cuda_host_adapter.hpp b/include/cutlass/cuda_host_adapter.hpp
-index a8af62be..74409054 100644
+index a8af62be..22e7332d 100644
 --- a/include/cutlass/cuda_host_adapter.hpp
 +++ b/include/cutlass/cuda_host_adapter.hpp
-@@ -394,6 +394,9 @@ protected:
+@@ -394,6 +394,10 @@ protected:
     * Fills a buffer in Global Memory with a byte sequence copied from host memory.
     * This function can be overridden to dispatch to the appropriate cuMemsetD*Async API
    */

From 8dbb80f9f5c766c041480ae3915c307340d999f3 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Wed, 4 Mar 2026 20:14:15 -0800
Subject: [PATCH 30/39] don't use deprecated move_iterator operator->

---
 .../onnxruntime/core/common/gpu_profiler_common.h    | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/onnxruntime/core/common/gpu_profiler_common.h b/include/onnxruntime/core/common/gpu_profiler_common.h
index 00d5033ef2df4..999d74f196505 100644
--- a/include/onnxruntime/core/common/gpu_profiler_common.h
+++ b/include/onnxruntime/core/common/gpu_profiler_common.h
@@ -379,8 +379,8 @@ class GPUProfilerBase : public EpProfiler {
   void MergeEvents(std::map<uint64_t, Events>& events_to_merge, Events& events) {
     Events merged_events;
 
-    auto event_iter = std::make_move_iterator(events.begin());
-    auto event_end = std::make_move_iterator(events.end());
+    auto event_iter = events.begin();
+    auto event_end = events.end();
     for (auto& map_iter : events_to_merge) {
       if (map_iter.second.empty()) {
         continue;
@@ -395,7 +395,7 @@ class GPUProfilerBase : public EpProfiler {
               (event_iter->ts == ts &&
                (event_iter + 1) != event_end &&
                (event_iter + 1)->ts == ts))) {
-        merged_events.emplace_back(*event_iter);
+        merged_events.emplace_back(*std::make_move_iterator(event_iter));
         ++event_iter;
       }
 
@@ -409,7 +409,7 @@ class GPUProfilerBase : public EpProfiler {
         copy_op_names = true;
         op_name = event_iter->args["op_name"];
         parent_name = event_iter->name;
-        merged_events.emplace_back(*event_iter);
+        merged_events.emplace_back(*std::make_move_iterator(event_iter));
         ++event_iter;
       }
 
@@ -428,7 +428,9 @@ class GPUProfilerBase : public EpProfiler {
     }
 
     // move any remaining events
-    merged_events.insert(merged_events.end(), event_iter, event_end);
+    merged_events.insert(merged_events.end(),
+                         std::make_move_iterator(event_iter),
+                         std::make_move_iterator(event_end));
     std::swap(events, merged_events);
   }
 

From 6f801b2fafb776cea66f318720dda1fc92792e48 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Thu, 5 Mar 2026 11:03:32 -0800
Subject: [PATCH 31/39] try to suppress spurious stringop-overflow warning

---
 cmake/onnxruntime_config.h.in                      |  1 +
 .../contrib_ops/cuda/llm/cutlass_heuristic.cc      | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/cmake/onnxruntime_config.h.in b/cmake/onnxruntime_config.h.in
index a36f735c507ba..e5f759b9d705f 100644
--- a/cmake/onnxruntime_config.h.in
+++ b/cmake/onnxruntime_config.h.in
@@ -20,6 +20,7 @@
 #cmakedefine HAS_PARENTHESES
 #cmakedefine HAS_REALLOCARRAY
 #cmakedefine HAS_SHORTEN_64_TO_32
+#cmakedefine HAS_STRINGOP_OVERFLOW
 #cmakedefine HAS_TAUTOLOGICAL_POINTER_COMPARE
 #cmakedefine HAS_UNUSED_BUT_SET_PARAMETER
 #cmakedefine HAS_UNUSED_BUT_SET_VARIABLE
diff --git a/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc b/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc
index d53fb558ba1a1..cb643997e543e 100644
--- a/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc
+++ b/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc
@@ -28,6 +28,7 @@
 #include "cutlass/gemm/gemm.h"
 #include "cutlass/numeric_types.h"
 #include "core/common/common.h"
+#include "onnxruntime_config.h"
 
 #include <cuda_runtime_api.h>
 #include <set>
@@ -280,6 +281,13 @@ std::vector<CutlassGemmConfig> get_candidate_configs_sm90(CutlassGemmConfig::Can
   return candidate_configs;
 }
 
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#if defined(HAS_STRINGOP_OVERFLOW)
+#pragma GCC diagnostic ignored "-Wstringop-overflow"
+#endif  // defined(HAS_STRINGOP_OVERFLOW)
+#endif  // __GNUC__
+
 std::vector<CutlassGemmConfig> get_candidate_configs_sm100(CutlassGemmConfig::CandidateConfigTypeParam const config) {
 #ifdef FAST_BUILD
   // Fast build disables all configs except this one for SM100
@@ -354,7 +362,11 @@ std::vector<CutlassGemmConfig> get_candidate_configs_sm100(CutlassGemmConfig::Ca
   }
 #endif
 
-}  // namespace kernels
+}
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif  // __GNUC__
 
 std::vector<CutlassGemmConfig> get_candidate_configs(
     int sm, int const max_split_k, CutlassGemmConfig::CandidateConfigTypeParam const config_type_param) {

From f5c4222d22b58f4423bc209c4ea0283a1884282d Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Thu, 5 Mar 2026 11:14:05 -0800
Subject: [PATCH 32/39] add comment

---
 onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc b/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc
index cb643997e543e..2f758b55d1cbe 100644
--- a/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc
+++ b/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc
@@ -281,6 +281,16 @@ std::vector<CutlassGemmConfig> get_candidate_configs_sm90(CutlassGemmConfig::Can
   return candidate_configs;
 }
 
+// Suppressing this warning from a Release build with GCC:
+//
+//  In function ‘constexpr decltype (::new(void*(0)) _Tp) std::construct_at(_Tp*, _Args&& ...) [with _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Args = {onnxruntime::llm::cutlass_extensions::CutlassGemmConfig}]’,
+//     inlined from ‘static constexpr void std::allocator_traits<std::allocator<_CharT> >::construct(allocator_type&, _Up*, _Args&& ...) [with _Up = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Args = {onnxruntime::llm::cutlass_extensions::CutlassGemmConfig}; _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig]’ at /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/alloc_traits.h:577:21,
+//     inlined from ‘constexpr std::vector<_Tp, _Alloc>::reference std::vector<_Tp, _Alloc>::emplace_back(_Args&& ...) [with _Args = {onnxruntime::llm::cutlass_extensions::CutlassGemmConfig}; _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Alloc = std::allocator<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig>]’ at /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/vector.tcc:117:30,
+//     inlined from ‘constexpr void std::vector<_Tp, _Alloc>::push_back(value_type&&) [with _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Alloc = std::allocator<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig>]’ at /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/stl_vector.h:1301:21,
+//     inlined from ‘std::vector<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig> onnxruntime::llm::kernels::cutlass_kernels::get_candidate_configs_sm100(onnxruntime::llm::cutlass_extensions::CutlassGemmConfig::CandidateConfigTypeParam)’ at /onnxruntime_src/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc:298:34:
+// /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/stl_construct.h:97:14: error: writing 1 byte into a region of size 0 [-Werror=stringop-overflow=]
+//    97 |     { return ::new((void*)__location) _Tp(std::forward<_Args>(__args)...); }
+//       |              ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #ifdef __GNUC__
 #pragma GCC diagnostic push
 #if defined(HAS_STRINGOP_OVERFLOW)

From 294dbd3ba5bfcc04c0bc284ac81f25cc2c222fee Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Thu, 5 Mar 2026 11:15:15 -0800
Subject: [PATCH 33/39] try to make suppression very local

---
 .../contrib_ops/cuda/llm/cutlass_heuristic.cc | 40 +++++++++----------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc b/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc
index 2f758b55d1cbe..79d2cca792602 100644
--- a/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc
+++ b/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc
@@ -281,23 +281,6 @@ std::vector<CutlassGemmConfig> get_candidate_configs_sm90(CutlassGemmConfig::Can
   return candidate_configs;
 }
 
-// Suppressing this warning from a Release build with GCC:
-//
-//  In function ‘constexpr decltype (::new(void*(0)) _Tp) std::construct_at(_Tp*, _Args&& ...) [with _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Args = {onnxruntime::llm::cutlass_extensions::CutlassGemmConfig}]’,
-//     inlined from ‘static constexpr void std::allocator_traits<std::allocator<_CharT> >::construct(allocator_type&, _Up*, _Args&& ...) [with _Up = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Args = {onnxruntime::llm::cutlass_extensions::CutlassGemmConfig}; _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig]’ at /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/alloc_traits.h:577:21,
-//     inlined from ‘constexpr std::vector<_Tp, _Alloc>::reference std::vector<_Tp, _Alloc>::emplace_back(_Args&& ...) [with _Args = {onnxruntime::llm::cutlass_extensions::CutlassGemmConfig}; _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Alloc = std::allocator<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig>]’ at /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/vector.tcc:117:30,
-//     inlined from ‘constexpr void std::vector<_Tp, _Alloc>::push_back(value_type&&) [with _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Alloc = std::allocator<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig>]’ at /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/stl_vector.h:1301:21,
-//     inlined from ‘std::vector<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig> onnxruntime::llm::kernels::cutlass_kernels::get_candidate_configs_sm100(onnxruntime::llm::cutlass_extensions::CutlassGemmConfig::CandidateConfigTypeParam)’ at /onnxruntime_src/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc:298:34:
-// /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/stl_construct.h:97:14: error: writing 1 byte into a region of size 0 [-Werror=stringop-overflow=]
-//    97 |     { return ::new((void*)__location) _Tp(std::forward<_Args>(__args)...); }
-//       |              ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#if defined(HAS_STRINGOP_OVERFLOW)
-#pragma GCC diagnostic ignored "-Wstringop-overflow"
-#endif  // defined(HAS_STRINGOP_OVERFLOW)
-#endif  // __GNUC__
-
 std::vector<CutlassGemmConfig> get_candidate_configs_sm100(CutlassGemmConfig::CandidateConfigTypeParam const config) {
 #ifdef FAST_BUILD
   // Fast build disables all configs except this one for SM100
@@ -313,8 +296,27 @@ std::vector<CutlassGemmConfig> get_candidate_configs_sm100(CutlassGemmConfig::Ca
                                                     MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_2x1x1});
       // candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x256x128B,
       //                                               MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_1x1x1});
+      // Suppressing this warning from a Release build with GCC:
+//
+//  In function ‘constexpr decltype (::new(void*(0)) _Tp) std::construct_at(_Tp*, _Args&& ...) [with _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Args = {onnxruntime::llm::cutlass_extensions::CutlassGemmConfig}]’,
+//     inlined from ‘static constexpr void std::allocator_traits<std::allocator<_CharT> >::construct(allocator_type&, _Up*, _Args&& ...) [with _Up = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Args = {onnxruntime::llm::cutlass_extensions::CutlassGemmConfig}; _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig]’ at /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/alloc_traits.h:577:21,
+//     inlined from ‘constexpr std::vector<_Tp, _Alloc>::reference std::vector<_Tp, _Alloc>::emplace_back(_Args&& ...) [with _Args = {onnxruntime::llm::cutlass_extensions::CutlassGemmConfig}; _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Alloc = std::allocator<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig>]’ at /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/vector.tcc:117:30,
+//     inlined from ‘constexpr void std::vector<_Tp, _Alloc>::push_back(value_type&&) [with _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Alloc = std::allocator<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig>]’ at /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/stl_vector.h:1301:21,
+//     inlined from ‘std::vector<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig> onnxruntime::llm::kernels::cutlass_kernels::get_candidate_configs_sm100(onnxruntime::llm::cutlass_extensions::CutlassGemmConfig::CandidateConfigTypeParam)’ at /onnxruntime_src/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc:298:34:
+// /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/stl_construct.h:97:14: error: writing 1 byte into a region of size 0 [-Werror=stringop-overflow=]
+//    97 |     { return ::new((void*)__location) _Tp(std::forward<_Args>(__args)...); }
+//       |              ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#if defined(HAS_STRINGOP_OVERFLOW)
+#pragma GCC diagnostic ignored "-Wstringop-overflow"
+#endif  // defined(HAS_STRINGOP_OVERFLOW)
+#endif  // __GNUC__
       candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x256x128B,
                                                     MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_1x2x1});
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif  // __GNUC__
       candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape256x64x128B,
                                                     MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_2x1x1});
       candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x64x128B,
@@ -374,10 +376,6 @@ std::vector<CutlassGemmConfig> get_candidate_configs_sm100(CutlassGemmConfig::Ca
 
 }
 
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif  // __GNUC__
-
 std::vector<CutlassGemmConfig> get_candidate_configs(
     int sm, int const max_split_k, CutlassGemmConfig::CandidateConfigTypeParam const config_type_param) {
   if ((config_type_param & CutlassGemmConfig::FP4_ONLY) && !(config_type_param & CutlassGemmConfig::BLACKWELL)) {

From bddc3aca9557a0e743bc2357c885b94956d8d035 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Thu, 5 Mar 2026 11:20:25 -0800
Subject: [PATCH 34/39] indent comment and formatting

---
 .../contrib_ops/cuda/llm/cutlass_heuristic.cc | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc b/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc
index 79d2cca792602..570ef4d9bbcdf 100644
--- a/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc
+++ b/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc
@@ -296,16 +296,17 @@ std::vector<CutlassGemmConfig> get_candidate_configs_sm100(CutlassGemmConfig::Ca
                                                     MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_2x1x1});
       // candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x256x128B,
       //                                               MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_1x1x1});
+
       // Suppressing this warning from a Release build with GCC:
-//
-//  In function ‘constexpr decltype (::new(void*(0)) _Tp) std::construct_at(_Tp*, _Args&& ...) [with _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Args = {onnxruntime::llm::cutlass_extensions::CutlassGemmConfig}]’,
-//     inlined from ‘static constexpr void std::allocator_traits<std::allocator<_CharT> >::construct(allocator_type&, _Up*, _Args&& ...) [with _Up = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Args = {onnxruntime::llm::cutlass_extensions::CutlassGemmConfig}; _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig]’ at /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/alloc_traits.h:577:21,
-//     inlined from ‘constexpr std::vector<_Tp, _Alloc>::reference std::vector<_Tp, _Alloc>::emplace_back(_Args&& ...) [with _Args = {onnxruntime::llm::cutlass_extensions::CutlassGemmConfig}; _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Alloc = std::allocator<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig>]’ at /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/vector.tcc:117:30,
-//     inlined from ‘constexpr void std::vector<_Tp, _Alloc>::push_back(value_type&&) [with _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Alloc = std::allocator<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig>]’ at /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/stl_vector.h:1301:21,
-//     inlined from ‘std::vector<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig> onnxruntime::llm::kernels::cutlass_kernels::get_candidate_configs_sm100(onnxruntime::llm::cutlass_extensions::CutlassGemmConfig::CandidateConfigTypeParam)’ at /onnxruntime_src/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc:298:34:
-// /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/stl_construct.h:97:14: error: writing 1 byte into a region of size 0 [-Werror=stringop-overflow=]
-//    97 |     { return ::new((void*)__location) _Tp(std::forward<_Args>(__args)...); }
-//       |              ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+      //
+      //  In function ‘constexpr decltype (::new(void*(0)) _Tp) std::construct_at(_Tp*, _Args&& ...) [with _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Args = {onnxruntime::llm::cutlass_extensions::CutlassGemmConfig}]’,
+      //     inlined from ‘static constexpr void std::allocator_traits<std::allocator<_CharT> >::construct(allocator_type&, _Up*, _Args&& ...) [with _Up = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Args = {onnxruntime::llm::cutlass_extensions::CutlassGemmConfig}; _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig]’ at /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/alloc_traits.h:577:21,
+      //     inlined from ‘constexpr std::vector<_Tp, _Alloc>::reference std::vector<_Tp, _Alloc>::emplace_back(_Args&& ...) [with _Args = {onnxruntime::llm::cutlass_extensions::CutlassGemmConfig}; _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Alloc = std::allocator<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig>]’ at /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/vector.tcc:117:30,
+      //     inlined from ‘constexpr void std::vector<_Tp, _Alloc>::push_back(value_type&&) [with _Tp = onnxruntime::llm::cutlass_extensions::CutlassGemmConfig; _Alloc = std::allocator<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig>]’ at /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/stl_vector.h:1301:21,
+      //     inlined from ‘std::vector<onnxruntime::llm::cutlass_extensions::CutlassGemmConfig> onnxruntime::llm::kernels::cutlass_kernels::get_candidate_configs_sm100(onnxruntime::llm::cutlass_extensions::CutlassGemmConfig::CandidateConfigTypeParam)’ at /onnxruntime_src/onnxruntime/contrib_ops/cuda/llm/cutlass_heuristic.cc:298:34:
+      // /opt/rh/gcc-toolset-14/root/usr/include/c++/14/bits/stl_construct.h:97:14: error: writing 1 byte into a region of size 0 [-Werror=stringop-overflow=]
+      //    97 |     { return ::new((void*)__location) _Tp(std::forward<_Args>(__args)...); }
+      //       |              ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #ifdef __GNUC__
 #pragma GCC diagnostic push
 #if defined(HAS_STRINGOP_OVERFLOW)
@@ -317,6 +318,7 @@ std::vector<CutlassGemmConfig> get_candidate_configs_sm100(CutlassGemmConfig::Ca
 #ifdef __GNUC__
 #pragma GCC diagnostic pop
 #endif  // __GNUC__
+
       candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape256x64x128B,
                                                     MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_2x1x1});
       candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x64x128B,
@@ -373,7 +375,6 @@ std::vector<CutlassGemmConfig> get_candidate_configs_sm100(CutlassGemmConfig::Ca
     ORT_THROW("Not Implemented: SM100 GEMM candidates have not been defined.");
   }
 #endif
-
 }
 
 std::vector<CutlassGemmConfig> get_candidate_configs(

From 73019d4bf00014e34fea33f86a37e3f0f7f77ee7 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Thu, 5 Mar 2026 12:17:02 -0800
Subject: [PATCH 35/39] set CMAKE_CXX_SCAN_FOR_MODULES in CMakeLists.txt

---
 cmake/CMakeLists.txt                                       | 7 +++++++
 .../github/android/default_full_aar_build_settings.json    | 1 -
 .../github/android/default_qnn_aar_build_settings.json     | 1 -
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 234947470c9be..7db6a070ab9ea 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -20,6 +20,13 @@ if (NOT CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 20)
 endif()
 
+# We don't use C++20 modules yet.
+# There are some known issues to address first:
+# - Android builds from Linux Docker containers have trouble finding clang-scan-deps.
+# - The MSVC /permissive option is needed for compiling some of the CUDA EP code which uses CUTLASS.
+#   This option is not compatible with C++20 modules.
+set(CMAKE_CXX_SCAN_FOR_MODULES OFF)
+
 # Disable fast-math for Intel oneAPI compiler
 if("${CMAKE_CXX_COMPILER_ID}" MATCHES "IntelLLVM")
   if("${CMAKE_CXX_COMPILER_ID}" MATCHES "MSVC-like")
diff --git a/tools/ci_build/github/android/default_full_aar_build_settings.json b/tools/ci_build/github/android/default_full_aar_build_settings.json
index 94c25d65a0937..bc3d02d65b167 100644
--- a/tools/ci_build/github/android/default_full_aar_build_settings.json
+++ b/tools/ci_build/github/android/default_full_aar_build_settings.json
@@ -17,7 +17,6 @@
         "--use_nnapi",
         "--use_xnnpack",
         "--use_webgpu",
-        "--cmake_extra_defines=CMAKE_CXX_SCAN_FOR_MODULES=OFF",
         "--skip_tests"
     ]
 }
diff --git a/tools/ci_build/github/android/default_qnn_aar_build_settings.json b/tools/ci_build/github/android/default_qnn_aar_build_settings.json
index a260a2be4a2c5..5ac49f582d23e 100644
--- a/tools/ci_build/github/android/default_qnn_aar_build_settings.json
+++ b/tools/ci_build/github/android/default_qnn_aar_build_settings.json
@@ -12,7 +12,6 @@
         "--build_java",
         "--build_shared_lib",
         "--use_qnn=static_lib",
-        "--cmake_extra_defines=CMAKE_CXX_SCAN_FOR_MODULES=OFF",
         "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF",
         "--skip_tests"
 

From d3b2ff95ae3bc9a17c99ad2c0dc44d4c939b730a Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Thu, 5 Mar 2026 14:03:50 -0800
Subject: [PATCH 36/39] clarify comment about cxx_std_17 feature

---
 cmake/onnxruntime_fuzz_test.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/onnxruntime_fuzz_test.cmake b/cmake/onnxruntime_fuzz_test.cmake
index eea411d938176..2935b58ffa61a 100644
--- a/cmake/onnxruntime_fuzz_test.cmake
+++ b/cmake/onnxruntime_fuzz_test.cmake
@@ -60,7 +60,7 @@ if (onnxruntime_FUZZ_ENABLED)
     # compile the executables
     onnxruntime_add_executable(onnxruntime_security_fuzz ${SEC_FUZ_SRC})
 
-    # compile with c++17
+    # compile with at least c++17
     target_compile_features(onnxruntime_security_fuzz PUBLIC cxx_std_17)
 
     # Security fuzzing engine header file reference

From 90b2c73a93135e103f86084e354c1e593fb52bc1 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Thu, 5 Mar 2026 14:21:16 -0800
Subject: [PATCH 37/39] update some lingering old CXX_STANDARD values
 (trial change to see whether anything breaks)

---
 cmake/winml.cmake                                     |  9 ++++-----
 cmake/winml_unittests.cmake                           |  2 +-
 js/node/CMakeLists.txt                                |  2 +-
 js/react_native/android/CMakeLists.txt                |  6 +++---
 .../tools/tensorrt/perf/mem_test/CMakeLists.txt       |  6 +++---
 .../aarch64/python/cpu/scripts/install_protobuf.sh    |  2 +-
 .../github/linux/docker/scripts/install_protobuf.sh   |  2 +-
 tools/python/util/vcpkg_helpers.py                    | 11 +++++------
 8 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/cmake/winml.cmake b/cmake/winml.cmake
index f2651d0cbc2b2..8f80299cc491c 100644
--- a/cmake/winml.cmake
+++ b/cmake/winml.cmake
@@ -316,8 +316,7 @@ if (onnxruntime_WINML_NAMESPACE_OVERRIDE STREQUAL "Windows")
   target_compile_definitions(winml_adapter PRIVATE "BUILD_INBOX=1")
 endif()
 
-# will requires C++17
-set_target_properties(winml_adapter PROPERTIES CXX_STANDARD 17)
+set_target_properties(winml_adapter PROPERTIES CXX_STANDARD 20)
 set_target_properties(winml_adapter PROPERTIES CXX_STANDARD_REQUIRED ON)
 
 # Compiler definitions
@@ -645,7 +644,7 @@ onnxruntime_add_static_library(winml_lib_common
   ${winml_lib_common_dir}/CommonDeviceHelpers.cpp
 )
 
-set_target_properties(winml_lib_common PROPERTIES CXX_STANDARD 17)
+set_target_properties(winml_lib_common PROPERTIES CXX_STANDARD 20)
 set_target_properties(winml_lib_common PROPERTIES CXX_STANDARD_REQUIRED ON)
 target_compile_options(winml_lib_common PRIVATE /GR- /await /bigobj /wd4238)
 target_link_libraries(winml_lib_common PRIVATE ${WIL_TARGET})
@@ -829,9 +828,9 @@ if (winml_is_inbox)
     target_link_libraries(${new_target} PRIVATE ${link_libraries})
     target_link_options(${new_target} PRIVATE ${link_options})
 
-    # Attempt to copy linker flags 
+    # Attempt to copy linker flags
     get_target_property(link_flags ${target} LINK_FLAGS)
-    
+
     if (NOT link_flags MATCHES ".*NOTFOUND")
       set_property(TARGET ${new_target} PROPERTY LINK_FLAGS "${link_flags}")
     endif()
diff --git a/cmake/winml_unittests.cmake b/cmake/winml_unittests.cmake
index d857a83f504a5..eb2d69e16223e 100644
--- a/cmake/winml_unittests.cmake
+++ b/cmake/winml_unittests.cmake
@@ -19,7 +19,7 @@ set(WINML_TEST_INC_DIR
 function(set_winml_target_properties target)
   set_target_properties(${target} PROPERTIES
     FOLDER "ONNXRuntimeTest/winml"
-    CXX_STANDARD 17
+    CXX_STANDARD 20
     CXX_STANDARD_REQUIRED YES
     CXX_EXTENSIONS NO
   )
diff --git a/js/node/CMakeLists.txt b/js/node/CMakeLists.txt
index aedb1e35158ef..845d1a80b7b8f 100644
--- a/js/node/CMakeLists.txt
+++ b/js/node/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.11)
 
 project (onnxruntime-node)
 
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
 
 add_compile_definitions(NAPI_VERSION=${napi_build_version})
 add_compile_definitions(ORT_API_MANUAL_INIT)
diff --git a/js/react_native/android/CMakeLists.txt b/js/react_native/android/CMakeLists.txt
index 2f814e871ad77..0bcf552ff9e41 100644
--- a/js/react_native/android/CMakeLists.txt
+++ b/js/react_native/android/CMakeLists.txt
@@ -4,7 +4,7 @@ cmake_minimum_required(VERSION 3.9.0)
 set(PACKAGE_NAME "onnxruntime-react-native")
 set(BUILD_DIR ${CMAKE_SOURCE_DIR}/build)
 set(CMAKE_VERBOSE_MAKEFILE ON)
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
 
 option(ORT_EXTENSIONS_ENABLED "Enable Ort Extensions" NO)
 option(USE_NNAPI "Use NNAPI" YES)
@@ -80,10 +80,10 @@ add_library(
   ../cpp/SessionUtils.cpp
   ../cpp/TensorUtils.cpp)
 
-# Configure C++ 17
+# Configure C++20
 set_target_properties(
   onnxruntimejsi
-  PROPERTIES CXX_STANDARD 17
+  PROPERTIES CXX_STANDARD 20
              CXX_EXTENSIONS OFF
              POSITION_INDEPENDENT_CODE ON)
 
diff --git a/onnxruntime/python/tools/tensorrt/perf/mem_test/CMakeLists.txt b/onnxruntime/python/tools/tensorrt/perf/mem_test/CMakeLists.txt
index d77a763396f77..0f797255e918c 100644
--- a/onnxruntime/python/tools/tensorrt/perf/mem_test/CMakeLists.txt
+++ b/onnxruntime/python/tools/tensorrt/perf/mem_test/CMakeLists.txt
@@ -4,10 +4,10 @@ set(CMAKE_BUILD_TYPE Debug)
 
 cmake_minimum_required(VERSION 3.13)
 
-set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
-include_directories( 
+include_directories(
 	/code/onnxruntime/include/onnxruntime/core/session/
 	/code/onnxruntime/include/onnxruntime/core/providers/tensorrt/
 )
@@ -17,6 +17,6 @@ set(CMAKE_CXX_FLAGS "-fsanitize=address  -fsanitize=leak -g ${CMAKE_CXX_FLAGS}")
 set(CMAKE_C_FLAGS "-fsanitize=address  -fsanitize=leak -g ${CMAKE_C_FLAGS}")
 set(CMAKE_EXE_LINKER_FLAGS "-fsanitize=address  -fsanitize=leak ${CMAKE_EXE_LINKER_FLAGS}")
 set(CMAKE_MODULE_LINKER_FLAGS "-fsanitize=address  -fsanitize=leak ${CMAKE_MODULE_LINKER_FLAGS}")
-        
+
 ADD_EXECUTABLE(onnx_memtest  main.cpp)
 target_link_libraries(onnx_memtest onnxruntime)
diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/install_protobuf.sh b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/install_protobuf.sh
index 31b5ca6f9e69b..4b967c1f3ae3b 100755
--- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/install_protobuf.sh
+++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/install_protobuf.sh
@@ -13,7 +13,7 @@ done
 
 
 
-EXTRA_CMAKE_ARGS="-DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_CXX_STANDARD=17"
+EXTRA_CMAKE_ARGS="-DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_CXX_STANDARD=20"
 
 case "$(uname -s)" in
    Darwin*)
diff --git a/tools/ci_build/github/linux/docker/scripts/install_protobuf.sh b/tools/ci_build/github/linux/docker/scripts/install_protobuf.sh
index 31b5ca6f9e69b..4b967c1f3ae3b 100755
--- a/tools/ci_build/github/linux/docker/scripts/install_protobuf.sh
+++ b/tools/ci_build/github/linux/docker/scripts/install_protobuf.sh
@@ -13,7 +13,7 @@ done
 
 
 
-EXTRA_CMAKE_ARGS="-DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_CXX_STANDARD=17"
+EXTRA_CMAKE_ARGS="-DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_CXX_STANDARD=20"
 
 case "$(uname -s)" in
    Darwin*)
diff --git a/tools/python/util/vcpkg_helpers.py b/tools/python/util/vcpkg_helpers.py
index f976e525aff93..966bfc41ce4c2 100644
--- a/tools/python/util/vcpkg_helpers.py
+++ b/tools/python/util/vcpkg_helpers.py
@@ -280,7 +280,7 @@ def generate_triplet_for_android(
 
             if ldflags:
                 f.write(f'set(VCPKG_LINKER_FLAGS "{" ".join(ldflags)}")\n')
-            f.write("list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)\n")
+            f.write("list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)\n")
             add_build_type(f, config)
             add_port_configs(
                 f, enable_exception, False, enable_minimal_build, use_full_protobuf=use_full_protobuf
@@ -471,10 +471,9 @@ def generate_triplet_for_posix_platform(
 
             if ldflags:
                 f.write(f'set(VCPKG_LINKER_FLAGS "{" ".join(ldflags)}")\n')
-            if os_name == "osx":
-                f.write("list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)\n")
-            else:
-                f.write("list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)\n")
+
+            f.write("list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)\n")
+
             add_build_type(f, config)
             add_port_configs(
                 f, enable_exception, False, enable_minimal_build, use_full_protobuf=use_full_protobuf
@@ -734,7 +733,7 @@ def generate_windows_triplets(build_dir: str, configs: set[str], toolset_version
                                         if cxxflags:
                                             f.write(f'set(VCPKG_CXX_FLAGS "{" ".join(cxxflags)}")\n')
                                         f.write(
-                                            "list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)\n"
+                                            "list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=20)\n"
                                         )
                                         if ldflags:
                                             f.write(f'set(VCPKG_LINKER_FLAGS "{" ".join(ldflags)}")\n')

From 0d613a3193e65f01e40ded6f7b86e16c7efc2e85 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Thu, 5 Mar 2026 14:54:57 -0800
Subject: [PATCH 38/39] Move --diag-suppress=177 out of CUDA 12.8+ block

---
 cmake/onnxruntime_providers_cuda.cmake | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index 3e7d43e66b690..7e62f72afaf97 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -191,6 +191,11 @@
       target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">")
     endif()
 
+    # suppress warnings like this:
+    #   cutlass-src\include\cute/arch/mma_sm120.hpp(3128): error #177-D: variable "tidA" was declared but never
+    #     referenced
+    target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=177>")
+
     # Since CUDA 12.8, compiling diagnostics become stricter
     if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
       target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--static-global-template-stub=false>")
@@ -200,11 +205,6 @@
       endif()
       # skip diagnosis error caused by cuda header files
       target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=221>")
-
-      # suppress warnings like this:
-      #   cutlass-src\include\cute/arch/mma_sm120.hpp(3128): error #177-D: variable "tidA" was declared but never
-      #     referenced
-      target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=177>")
     endif()
 
     if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)

From 291f0a00c860f1f9826e0f74ca6746d5fadf86e8 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Thu, 5 Mar 2026 16:57:06 -0800
Subject: [PATCH 39/39] Update vcpkg_helpers.py to also pass the module
 scanning option

---
 cmake/CMakeLists.txt               |  1 +
 tools/python/util/vcpkg_helpers.py | 23 ++++++++++++++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 7db6a070ab9ea..d2967739cbde4 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -25,6 +25,7 @@ endif()
 # - Android builds from Linux Docker containers have trouble finding clang-scan-deps.
 # - The MSVC /permissive option is needed for compiling some of the CUDA EP code which uses CUTLASS.
 #   This option is not compatible with C++20 modules.
+# So we will skip module scanning for now.
 set(CMAKE_CXX_SCAN_FOR_MODULES OFF)
 
 # Disable fast-math for Intel oneAPI compiler
diff --git a/tools/python/util/vcpkg_helpers.py b/tools/python/util/vcpkg_helpers.py
index 966bfc41ce4c2..8d1c665f631d9 100644
--- a/tools/python/util/vcpkg_helpers.py
+++ b/tools/python/util/vcpkg_helpers.py
@@ -150,6 +150,21 @@ def add_build_type(f, build_type: str) -> None:
         )
 
 
+def _get_cxx_standard_cmake_configure_options_str() -> str:
+    # Keep these options in sync with the C++ standard and module scanning settings in cmake/CMakeLists.txt.
+    options = [
+        "-DCMAKE_CXX_STANDARD=20",
+        # We don't use C++20 modules yet.
+        # There are some known issues to address first:
+        # - Android builds from Linux Docker containers have trouble finding clang-scan-deps.
+        # - The MSVC /permissive option is needed for compiling some of the CUDA EP code which uses CUTLASS.
+        #   This option is not compatible with C++20 modules.
+        # So we will skip module scanning for now.
+        "-DCMAKE_CXX_SCAN_FOR_MODULES=OFF",
+    ]
+    return " ".join(options)
+
+
 def generate_triplet_for_android(
     build_dir: str,
     configs: set[str],
@@ -280,7 +295,9 @@ def generate_triplet_for_android(
 
             if ldflags:
                 f.write(f'set(VCPKG_LINKER_FLAGS "{" ".join(ldflags)}")\n')
-            f.write("list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)\n")
+
+            f.write(f"list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS {_get_cxx_standard_cmake_configure_options_str()})\n")
+
             add_build_type(f, config)
             add_port_configs(
                 f, enable_exception, False, enable_minimal_build, use_full_protobuf=use_full_protobuf
@@ -472,7 +489,7 @@ def generate_triplet_for_posix_platform(
             if ldflags:
                 f.write(f'set(VCPKG_LINKER_FLAGS "{" ".join(ldflags)}")\n')
 
-            f.write("list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)\n")
+            f.write(f"list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS {_get_cxx_standard_cmake_configure_options_str()})\n")
 
             add_build_type(f, config)
             add_port_configs(
@@ -733,7 +750,7 @@ def generate_windows_triplets(build_dir: str, configs: set[str], toolset_version
                                         if cxxflags:
                                             f.write(f'set(VCPKG_CXX_FLAGS "{" ".join(cxxflags)}")\n')
                                         f.write(
-                                            "list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=20)\n"
+                                            f"list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error {_get_cxx_standard_cmake_configure_options_str()})\n"
                                         )
                                         if ldflags:
                                             f.write(f'set(VCPKG_LINKER_FLAGS "{" ".join(ldflags)}")\n')