From 4f620c38a402f37e98b0f115ab7ae139a205b5c7 Mon Sep 17 00:00:00 2001 From: Luis Rios Date: Mon, 18 May 2026 03:51:00 -0700 Subject: [PATCH 1/4] fix: avoid vector copies in CheckIfSubtreesAreEqual (#27854) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `indices` is built once and then only read during recursive calls to `CheckIfSubtreesAreEqual`. However it was passed by value, causing a full copy on every recursive call. Changed to `const&`. ## Data from the profiler: To collect the following data, a model with a single TreeEnsembleClassifier node (5000 trees and 3.3 million nodes) has been used. The loading time dropped from 18 minutes to about 4 seconds. ### After Screenshot 2026-03-25 at 6 40 25 PM ### Before Screenshot 2026-03-25 at 6 40 40 PM --- onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h b/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h index 8ed9a40097d4b..2530a1f73f81a 100644 --- a/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h +++ b/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h @@ -115,7 +115,7 @@ class TreeEnsembleCommon : public TreeEnsembleCommonAttributes { const InlinedVector& truenode_ids, const InlinedVector& falsenode_ids, gsl::span nodes_featureids, gsl::span nodes_values_as_tensor, gsl::span node_values, gsl::span target_class_weights, gsl::span target_class_weights_as_tensor, - const InlinedVector& node_tree_ids, InlinedVector> indices); + const InlinedVector& node_tree_ids, const InlinedVector>& indices); size_t AddNodes(const size_t i, const InlinedVector& cmodes, const InlinedVector& truenode_ids, const InlinedVector& falsenode_ids, gsl::span nodes_featureids, gsl::span nodes_values_as_tensor, gsl::span node_values, @@ -383,7 +383,7 @@ bool TreeEnsembleCommon::CheckIfSubtreesAr const InlinedVector& truenode_ids, const InlinedVector& falsenode_ids, gsl::span nodes_featureids, gsl::span nodes_values_as_tensor, gsl::span node_values, gsl::span target_class_weights, gsl::span target_class_weights_as_tensor, - const InlinedVector& node_tree_ids, InlinedVector> indices) { + const InlinedVector& node_tree_ids, const InlinedVector>& indices) { if (left_id == right_id) { return true; } From aa416f5cec6ab3ed2fc269fc3e07812ab3fa0307 Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Mon, 18 May 2026 23:26:19 +0800 Subject: [PATCH 2/4] webgpu: Generalize FlashAttention prefill shared-memory path (#28520) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Remove the `Subgroups` feature requirement from `CanApplyFlashAttention`, enabling flash attention on devices without subgroup support - Generalize the Apple-specific shared-memory prefill path into a `use_shm_path` flag that activates for Apple, NVIDIA, or any device lacking subgroups - Replace `is_apple` shader parameter with `use_shm_path` throughout the WGSL template ## Motivation Two issues exist on the current main branch: 1. **NVIDIA prefill produces incorrect results (regression from #28511):** PR #28511 increased `max_k_step` to 32 for NVIDIA in C++, but the shader's subgroup-based path only has `qk_1..qk_4` (16 hardcoded key indices). When `sg_size=32` (e.g. RTX 5080), the loop steps by 32 but only computes QK for keys 0-15, silently skipping keys 16-31. This produces incorrect attention output for models like phi4. 2. **Flash attention prefill unavailable without Subgroups:** `CanApplyFlashAttention` gates on `context.HasFeature(wgpu::FeatureName::Subgroups)`, forcing devices without subgroup support to fall back to the slower split-reduce 2-kernel path for prefill, even though the Apple shared-memory path in the shader is fully subgroup-free. This PR fixes both issues by routing Apple, NVIDIA, and no-subgroup devices through the loop-based shared-memory path (`use_shm_path`), which naturally handles any `max_k_step` value via `array` and loop iteration — no hardcoded key count. ## Test plan - [x] Built ORT with WebGPU EP on Windows (Release, VS 2022) - [x] Deployed and ran phi4-graph-prune model: output verified correct ("1+1 equals 2.") - [x] Lint check passed (`lintrunner -a`) --- onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc | 9 +++++---- onnxruntime/contrib_ops/webgpu/bert/flash_attention.h | 9 +++++---- .../webgpu/bert/flash_attention.wgsl.template | 6 +++--- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc index f1391ba1e3528..8217a07448266 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc @@ -212,7 +212,6 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const { return WGSL_TEMPLATE_APPLY(shader, "bert/flash_attention.wgsl.template", WGSL_TEMPLATE_PARAMETER(has_attention_bias, has_attention_bias_), WGSL_TEMPLATE_PARAMETER(has_head_sink, has_head_sink_), - WGSL_TEMPLATE_PARAMETER(is_apple, is_apple_), WGSL_TEMPLATE_PARAMETER(is_fp16, is_fp16_), WGSL_TEMPLATE_PARAMETER(is_qualcomm, is_qualcomm_), WGSL_TEMPLATE_PARAMETER(is_unidirectional, is_unidirectional_), @@ -221,7 +220,8 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const { WGSL_TEMPLATE_PARAMETER(q_BNSH, q_BNSH_), WGSL_TEMPLATE_PARAMETER(qkv_head_size, qkv_head_size_), WGSL_TEMPLATE_PARAMETER(qkv_num_heads, qkv_num_heads_), - WGSL_TEMPLATE_PARAMETER(use_seqlen_k, use_seqlen_k_)); + WGSL_TEMPLATE_PARAMETER(use_seqlen_k, use_seqlen_k_), + WGSL_TEMPLATE_PARAMETER(use_shm_path, use_shm_path_)); } Status FlashAttentionDecodeQKTProgram::GenerateShaderCode(ShaderHelper& shader) const { @@ -486,6 +486,7 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co bool is_qualcomm = context.AdapterInfo().vendor == std::string_view{"qualcomm"}; bool is_nvidia = context.AdapterInfo().vendor == std::string_view{"nvidia"}; bool is_apple = context.AdapterInfo().vendor == std::string_view{"apple"}; + bool has_subgroups = context.HasFeature(wgpu::FeatureName::Subgroups); bool is_fp16 = (Q->GetElementType() == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); bool q_BNSH = parameters.qkv_format_ == Q_K_V_BNSH; bool has_head_sink = head_sink != nullptr; @@ -498,6 +499,7 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co parameters.is_unidirectional_, is_nvidia, is_apple, + has_subgroups, q_BNSH, use_seqlen_k, has_head_sink}; @@ -532,7 +534,7 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co program.SetDispatchGroupSize(parameters.batch_size_ * parameters.num_heads_ * num_seq_tile) .SetWorkgroupSize(prefill_tile_size) - .CacheHint(has_attention_bias, parameters.head_size_, parameters.num_heads_, parameters.is_unidirectional_, is_qualcomm, is_nvidia, is_apple, q_BNSH, use_seqlen_k, has_head_sink, program.max_k_step()) + .CacheHint(has_attention_bias, parameters.head_size_, parameters.num_heads_, parameters.is_unidirectional_, is_qualcomm, is_nvidia, is_apple, has_subgroups, q_BNSH, use_seqlen_k, has_head_sink, program.max_k_step()) .AddUniformVariables({{static_cast(parameters.sequence_length_)}, {static_cast(parameters.total_sequence_length_)}, {static_cast(present_sequence_length)}, @@ -584,7 +586,6 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co bool CanApplyFlashAttention(const WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context) { return !parameters.is_packed_qkv_ && parameters.head_size_ == parameters.v_head_size_ && - context.HasFeature(wgpu::FeatureName::Subgroups) && ((context.AdapterInfo().vendor == std::string_view{"qualcomm"} && parameters.head_size_ % 8 == 0) || parameters.head_size_ % 4 == 0); } diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h index 27fa56e333874..e75b6378f67c6 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h @@ -77,6 +77,7 @@ class FlashAttentionProgram final : public Program { bool is_unidirectional, bool is_nvidia, bool is_apple, + bool has_subgroups, bool q_BNSH, bool use_seqlen_k = false, bool has_head_sink = false) @@ -88,12 +89,12 @@ class FlashAttentionProgram final : public Program { qkv_num_heads_(qkv_num_heads), is_unidirectional_(is_unidirectional), is_nvidia_(is_nvidia), - is_apple_(is_apple), + use_shm_path_(is_apple || is_nvidia || !has_subgroups), q_BNSH_(q_BNSH), use_seqlen_k_(use_seqlen_k), has_head_sink_(has_head_sink) { - if (is_apple || is_nvidia) { - // On Apple and NVIDIA, use an optimized loop-based path with dynamic max_k_step. + if (use_shm_path_) { + // Use shared-memory loop-based path with dynamic max_k_step. // Compute max_k_step from workgroup shared memory budget: k_tile + v_tile = 2 * element_size * head_size * max_k_step const int element_size = is_fp16 ? 2 : 4; constexpr int kMinWorkgroupStorageBudgetBytes = 16384; @@ -130,7 +131,7 @@ class FlashAttentionProgram final : public Program { int qkv_num_heads_; bool is_unidirectional_; bool is_nvidia_; - bool is_apple_; + bool use_shm_path_; bool q_BNSH_; bool use_seqlen_k_; bool has_head_sink_; diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.wgsl.template b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.wgsl.template index db41ac12ce268..6b620043413e3 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.wgsl.template @@ -1,7 +1,6 @@ #param has_attention_bias #param has_head_sink -#param is_apple #param is_fp16 #param is_qualcomm #param is_unidirectional @@ -10,6 +9,7 @@ #param qkv_head_size #param qkv_num_heads #param use_seqlen_k +#param use_shm_path #param max_k_step_param const head_size : u32 = qkv_head_size; @@ -61,7 +61,7 @@ fn loadq(batch_idx : u32, q_idx_global : u32, head_idx : u32, alpha : q_element_ } } -#if is_apple +#if use_shm_path var qk_scores : array; @@ -240,7 +240,7 @@ $MAIN { let seq_causal_length = total_sequence_length; #endif -#if is_apple +#if use_shm_path for (var k_start = 0u; k_start < loop_bound; k_start += max_k_step) { workgroupBarrier(); From 770da7dd224b7d8ca2720863b609c5fe22f06222 Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Mon, 18 May 2026 09:14:54 -0700 Subject: [PATCH 3/4] [WebGPU plugin EP] Package improvements (#28525) ### Description - Add copyright headers to source files - Enrich Python and NuGet package metadata - Add ORT license files to packages - Clean up readme files ### Motivation and Context WebGPU plugin EP packaging improvements. Note: Similar updates can be considered for the CUDA plugin EP, but this PR is scoped to just the WebGPU EP for ease of cherry-picking into the WebGPU plugin EP release branch. --- plugin-ep-webgpu/_packaging_utils.py | 1 + .../Microsoft.ML.OnnxRuntime.EP.WebGpu.csproj | 6 +++- .../WebGpuEp.cs | 3 ++ plugin-ep-webgpu/csharp/README.md | 23 -------------- plugin-ep-webgpu/csharp/pack_nuget.py | 21 +++++++++++++ .../csharp/test/WebGpuEpNuGetTest/Program.cs | 3 ++ plugin-ep-webgpu/python/README.md | 7 ++--- plugin-ep-webgpu/python/build_wheel.py | 13 ++++++++ .../python/onnxruntime_ep_webgpu/__init__.py | 3 ++ plugin-ep-webgpu/python/pyproject.toml.in | 31 +++++++++++++++++-- .../python/requirements-build-wheel.txt | 2 +- plugin-ep-webgpu/python/setup.py | 3 ++ .../python/test/test_webgpu_plugin_ep.py | 4 +++ 13 files changed, 88 insertions(+), 32 deletions(-) diff --git a/plugin-ep-webgpu/_packaging_utils.py b/plugin-ep-webgpu/_packaging_utils.py index 201b3342ff39c..84850e4dee5fe 100644 --- a/plugin-ep-webgpu/_packaging_utils.py +++ b/plugin-ep-webgpu/_packaging_utils.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. + """Shared utilities for the WebGPU plugin EP packaging scripts. Not a public API.""" from __future__ import annotations diff --git a/plugin-ep-webgpu/csharp/Microsoft.ML.OnnxRuntime.EP.WebGpu/Microsoft.ML.OnnxRuntime.EP.WebGpu.csproj b/plugin-ep-webgpu/csharp/Microsoft.ML.OnnxRuntime.EP.WebGpu/Microsoft.ML.OnnxRuntime.EP.WebGpu.csproj index 58860c46b9c16..5bfbac0308e01 100644 --- a/plugin-ep-webgpu/csharp/Microsoft.ML.OnnxRuntime.EP.WebGpu/Microsoft.ML.OnnxRuntime.EP.WebGpu.csproj +++ b/plugin-ep-webgpu/csharp/Microsoft.ML.OnnxRuntime.EP.WebGpu/Microsoft.ML.OnnxRuntime.EP.WebGpu.csproj @@ -16,7 +16,8 @@ ONNX;ONNX Runtime;Machine Learning;AI;Deep Learning;WebGPU - MIT + LICENSE + https://onnxruntime.ai https://github.com/microsoft/onnxruntime git © Microsoft Corporation. All rights reserved. @@ -29,6 +30,9 @@ + + +