From 4f620c38a402f37e98b0f115ab7ae139a205b5c7 Mon Sep 17 00:00:00 2001 From: Luis Rios Date: Mon, 18 May 2026 03:51:00 -0700 Subject: [PATCH 01/13] fix: avoid vector copies in CheckIfSubtreesAreEqual (#27854) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `indices` is built once and then only read during recursive calls to `CheckIfSubtreesAreEqual`. However it was passed by value, causing a full copy on every recursive call. Changed to `const&`. ## Data from the profiler: To collect the following data, a model with a single TreeEnsembleClassifier node (5000 trees and 3.3 million nodes) has been used. The loading time dropped from 18 minutes to about 4 seconds. ### After Screenshot 2026-03-25 at 6 40 25 PM ### Before Screenshot 2026-03-25 at 6 40 40 PM --- onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h b/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h index 8ed9a40097d4b..2530a1f73f81a 100644 --- a/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h +++ b/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h @@ -115,7 +115,7 @@ class TreeEnsembleCommon : public TreeEnsembleCommonAttributes { const InlinedVector& truenode_ids, const InlinedVector& falsenode_ids, gsl::span nodes_featureids, gsl::span nodes_values_as_tensor, gsl::span node_values, gsl::span target_class_weights, gsl::span target_class_weights_as_tensor, - const InlinedVector& node_tree_ids, InlinedVector> indices); + const InlinedVector& node_tree_ids, const InlinedVector>& indices); size_t AddNodes(const size_t i, const InlinedVector& cmodes, const InlinedVector& truenode_ids, const InlinedVector& falsenode_ids, gsl::span nodes_featureids, gsl::span nodes_values_as_tensor, gsl::span node_values, @@ -383,7 +383,7 @@ bool TreeEnsembleCommon::CheckIfSubtreesAr const InlinedVector& truenode_ids, const InlinedVector& falsenode_ids, gsl::span nodes_featureids, gsl::span nodes_values_as_tensor, gsl::span node_values, gsl::span target_class_weights, gsl::span target_class_weights_as_tensor, - const InlinedVector& node_tree_ids, InlinedVector> indices) { + const InlinedVector& node_tree_ids, const InlinedVector>& indices) { if (left_id == right_id) { return true; } From aa416f5cec6ab3ed2fc269fc3e07812ab3fa0307 Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Mon, 18 May 2026 23:26:19 +0800 Subject: [PATCH 02/13] webgpu: Generalize FlashAttention prefill shared-memory path (#28520) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Remove the `Subgroups` feature requirement from `CanApplyFlashAttention`, enabling flash attention on devices without subgroup support - Generalize the Apple-specific shared-memory prefill path into a `use_shm_path` flag that activates for Apple, NVIDIA, or any device lacking subgroups - Replace `is_apple` shader parameter with `use_shm_path` throughout the WGSL template ## Motivation Two issues exist on the current main branch: 1. **NVIDIA prefill produces incorrect results (regression from #28511):** PR #28511 increased `max_k_step` to 32 for NVIDIA in C++, but the shader's subgroup-based path only has `qk_1..qk_4` (16 hardcoded key indices). When `sg_size=32` (e.g. RTX 5080), the loop steps by 32 but only computes QK for keys 0-15, silently skipping keys 16-31. This produces incorrect attention output for models like phi4. 2. **Flash attention prefill unavailable without Subgroups:** `CanApplyFlashAttention` gates on `context.HasFeature(wgpu::FeatureName::Subgroups)`, forcing devices without subgroup support to fall back to the slower split-reduce 2-kernel path for prefill, even though the Apple shared-memory path in the shader is fully subgroup-free. This PR fixes both issues by routing Apple, NVIDIA, and no-subgroup devices through the loop-based shared-memory path (`use_shm_path`), which naturally handles any `max_k_step` value via `array` and loop iteration — no hardcoded key count. ## Test plan - [x] Built ORT with WebGPU EP on Windows (Release, VS 2022) - [x] Deployed and ran phi4-graph-prune model: output verified correct ("1+1 equals 2.") - [x] Lint check passed (`lintrunner -a`) --- onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc | 9 +++++---- onnxruntime/contrib_ops/webgpu/bert/flash_attention.h | 9 +++++---- .../webgpu/bert/flash_attention.wgsl.template | 6 +++--- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc index f1391ba1e3528..8217a07448266 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc @@ -212,7 +212,6 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const { return WGSL_TEMPLATE_APPLY(shader, "bert/flash_attention.wgsl.template", WGSL_TEMPLATE_PARAMETER(has_attention_bias, has_attention_bias_), WGSL_TEMPLATE_PARAMETER(has_head_sink, has_head_sink_), - WGSL_TEMPLATE_PARAMETER(is_apple, is_apple_), WGSL_TEMPLATE_PARAMETER(is_fp16, is_fp16_), WGSL_TEMPLATE_PARAMETER(is_qualcomm, is_qualcomm_), WGSL_TEMPLATE_PARAMETER(is_unidirectional, is_unidirectional_), @@ -221,7 +220,8 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const { WGSL_TEMPLATE_PARAMETER(q_BNSH, q_BNSH_), WGSL_TEMPLATE_PARAMETER(qkv_head_size, qkv_head_size_), WGSL_TEMPLATE_PARAMETER(qkv_num_heads, qkv_num_heads_), - WGSL_TEMPLATE_PARAMETER(use_seqlen_k, use_seqlen_k_)); + WGSL_TEMPLATE_PARAMETER(use_seqlen_k, use_seqlen_k_), + WGSL_TEMPLATE_PARAMETER(use_shm_path, use_shm_path_)); } Status FlashAttentionDecodeQKTProgram::GenerateShaderCode(ShaderHelper& shader) const { @@ -486,6 +486,7 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co bool is_qualcomm = context.AdapterInfo().vendor == std::string_view{"qualcomm"}; bool is_nvidia = context.AdapterInfo().vendor == std::string_view{"nvidia"}; bool is_apple = context.AdapterInfo().vendor == std::string_view{"apple"}; + bool has_subgroups = context.HasFeature(wgpu::FeatureName::Subgroups); bool is_fp16 = (Q->GetElementType() == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); bool q_BNSH = parameters.qkv_format_ == Q_K_V_BNSH; bool has_head_sink = head_sink != nullptr; @@ -498,6 +499,7 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co parameters.is_unidirectional_, is_nvidia, is_apple, + has_subgroups, q_BNSH, use_seqlen_k, has_head_sink}; @@ -532,7 +534,7 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co program.SetDispatchGroupSize(parameters.batch_size_ * parameters.num_heads_ * num_seq_tile) .SetWorkgroupSize(prefill_tile_size) - .CacheHint(has_attention_bias, parameters.head_size_, parameters.num_heads_, parameters.is_unidirectional_, is_qualcomm, is_nvidia, is_apple, q_BNSH, use_seqlen_k, has_head_sink, program.max_k_step()) + .CacheHint(has_attention_bias, parameters.head_size_, parameters.num_heads_, parameters.is_unidirectional_, is_qualcomm, is_nvidia, is_apple, has_subgroups, q_BNSH, use_seqlen_k, has_head_sink, program.max_k_step()) .AddUniformVariables({{static_cast(parameters.sequence_length_)}, {static_cast(parameters.total_sequence_length_)}, {static_cast(present_sequence_length)}, @@ -584,7 +586,6 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co bool CanApplyFlashAttention(const WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context) { return !parameters.is_packed_qkv_ && parameters.head_size_ == parameters.v_head_size_ && - context.HasFeature(wgpu::FeatureName::Subgroups) && ((context.AdapterInfo().vendor == std::string_view{"qualcomm"} && parameters.head_size_ % 8 == 0) || parameters.head_size_ % 4 == 0); } diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h index 27fa56e333874..e75b6378f67c6 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h @@ -77,6 +77,7 @@ class FlashAttentionProgram final : public Program { bool is_unidirectional, bool is_nvidia, bool is_apple, + bool has_subgroups, bool q_BNSH, bool use_seqlen_k = false, bool has_head_sink = false) @@ -88,12 +89,12 @@ class FlashAttentionProgram final : public Program { qkv_num_heads_(qkv_num_heads), is_unidirectional_(is_unidirectional), is_nvidia_(is_nvidia), - is_apple_(is_apple), + use_shm_path_(is_apple || is_nvidia || !has_subgroups), q_BNSH_(q_BNSH), use_seqlen_k_(use_seqlen_k), has_head_sink_(has_head_sink) { - if (is_apple || is_nvidia) { - // On Apple and NVIDIA, use an optimized loop-based path with dynamic max_k_step. + if (use_shm_path_) { + // Use shared-memory loop-based path with dynamic max_k_step. // Compute max_k_step from workgroup shared memory budget: k_tile + v_tile = 2 * element_size * head_size * max_k_step const int element_size = is_fp16 ? 2 : 4; constexpr int kMinWorkgroupStorageBudgetBytes = 16384; @@ -130,7 +131,7 @@ class FlashAttentionProgram final : public Program { int qkv_num_heads_; bool is_unidirectional_; bool is_nvidia_; - bool is_apple_; + bool use_shm_path_; bool q_BNSH_; bool use_seqlen_k_; bool has_head_sink_; diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.wgsl.template b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.wgsl.template index db41ac12ce268..6b620043413e3 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.wgsl.template +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.wgsl.template @@ -1,7 +1,6 @@ #param has_attention_bias #param has_head_sink -#param is_apple #param is_fp16 #param is_qualcomm #param is_unidirectional @@ -10,6 +9,7 @@ #param qkv_head_size #param qkv_num_heads #param use_seqlen_k +#param use_shm_path #param max_k_step_param const head_size : u32 = qkv_head_size; @@ -61,7 +61,7 @@ fn loadq(batch_idx : u32, q_idx_global : u32, head_idx : u32, alpha : q_element_ } } -#if is_apple +#if use_shm_path var qk_scores : array; @@ -240,7 +240,7 @@ $MAIN { let seq_causal_length = total_sequence_length; #endif -#if is_apple +#if use_shm_path for (var k_start = 0u; k_start < loop_bound; k_start += max_k_step) { workgroupBarrier(); From 770da7dd224b7d8ca2720863b609c5fe22f06222 Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Mon, 18 May 2026 09:14:54 -0700 Subject: [PATCH 03/13] [WebGPU plugin EP] Package improvements (#28525) ### Description - Add copyright headers to source files - Enrich Python and NuGet package metadata - Add ORT license files to packages - Clean up readme files ### Motivation and Context WebGPU plugin EP packaging improvements. Note: Similar updates can be considered for the CUDA plugin EP, but this PR is scoped to just the WebGPU EP for ease of cherry-picking into the WebGPU plugin EP release branch. --- plugin-ep-webgpu/_packaging_utils.py | 1 + .../Microsoft.ML.OnnxRuntime.EP.WebGpu.csproj | 6 +++- .../WebGpuEp.cs | 3 ++ plugin-ep-webgpu/csharp/README.md | 23 -------------- plugin-ep-webgpu/csharp/pack_nuget.py | 21 +++++++++++++ .../csharp/test/WebGpuEpNuGetTest/Program.cs | 3 ++ plugin-ep-webgpu/python/README.md | 7 ++--- plugin-ep-webgpu/python/build_wheel.py | 13 ++++++++ .../python/onnxruntime_ep_webgpu/__init__.py | 3 ++ plugin-ep-webgpu/python/pyproject.toml.in | 31 +++++++++++++++++-- .../python/requirements-build-wheel.txt | 2 +- plugin-ep-webgpu/python/setup.py | 3 ++ .../python/test/test_webgpu_plugin_ep.py | 4 +++ 13 files changed, 88 insertions(+), 32 deletions(-) diff --git a/plugin-ep-webgpu/_packaging_utils.py b/plugin-ep-webgpu/_packaging_utils.py index 201b3342ff39c..84850e4dee5fe 100644 --- a/plugin-ep-webgpu/_packaging_utils.py +++ b/plugin-ep-webgpu/_packaging_utils.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. + """Shared utilities for the WebGPU plugin EP packaging scripts. Not a public API.""" from __future__ import annotations diff --git a/plugin-ep-webgpu/csharp/Microsoft.ML.OnnxRuntime.EP.WebGpu/Microsoft.ML.OnnxRuntime.EP.WebGpu.csproj b/plugin-ep-webgpu/csharp/Microsoft.ML.OnnxRuntime.EP.WebGpu/Microsoft.ML.OnnxRuntime.EP.WebGpu.csproj index 58860c46b9c16..5bfbac0308e01 100644 --- a/plugin-ep-webgpu/csharp/Microsoft.ML.OnnxRuntime.EP.WebGpu/Microsoft.ML.OnnxRuntime.EP.WebGpu.csproj +++ b/plugin-ep-webgpu/csharp/Microsoft.ML.OnnxRuntime.EP.WebGpu/Microsoft.ML.OnnxRuntime.EP.WebGpu.csproj @@ -16,7 +16,8 @@ ONNX;ONNX Runtime;Machine Learning;AI;Deep Learning;WebGPU - MIT + LICENSE + https://onnxruntime.ai https://github.com/microsoft/onnxruntime git © Microsoft Corporation. All rights reserved. @@ -29,6 +30,9 @@ + + + When compiling a model via the Compile API in a sandboxed environment, CreateEpContextModel() would attempt to validate/generate a file output path, even when the user explicitly set the output to a buffer via SetOutputModelBuffer(). This caused std::filesystem::exists() to throw an "Access is denied" exception on the dummy model path _MODEL_EDITOR_API_MODEL_, because the sandbox restricts filesystem access. --- onnxruntime/core/framework/graph_partitioner.cc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index e06b99eea9fd7..0d3a84f30e1fb 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -1007,8 +1007,17 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers const epctx::BufferWriteFuncHolder* output_write_func_holder = ep_context_gen_options.TryGetOutputModelWriteFunc(); const std::filesystem::path* output_model_path_ptr = ep_context_gen_options.TryGetOutputModelPath(); + // Determine whether we need to resolve/validate a file system path for the output model. + // A path is needed when: + // - Writing the output model to a file (not to a buffer or write function) + // - Writing initializers to an external file (needs the model path to compute the external file location) + const bool output_is_to_file = (output_buffer_holder == nullptr && output_write_func_holder == nullptr); + const bool needs_path_for_external_initializers = + (ep_context_gen_options.TryGetExternalInitializerFileInfo() != nullptr); + std::filesystem::path valid_output_model_path; - if (output_model_path_ptr != nullptr || !graph.ModelPath().empty()) { + if ((output_is_to_file || needs_path_for_external_initializers) && + (output_model_path_ptr != nullptr || !graph.ModelPath().empty())) { std::filesystem::path output_model_path = (output_model_path_ptr != nullptr) ? *output_model_path_ptr : std::filesystem::path(""); ORT_RETURN_IF_ERROR(GetValidatedEpContextPath(output_model_path, From 2d9f35fcfe67d444a5ec0c5f133e2190418670b7 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 19 May 2026 10:05:15 -0700 Subject: [PATCH 09/13] [Security] Validate CPU cache_indirection beam indices (#28486) ## Summary - reject out-of-range `cache_indirection` beam indices in the CPU beam-attention path before they are converted into past KV offsets - keep `DecoderMaskedMultiHeadAttention` beam-width handling consistent with the `cache_indirection` shape - add CPU regression tests for `MultiHeadAttention` and `DecoderMaskedMultiHeadAttention` ## Motivation `MultiHeadAttention` and `DecoderMaskedMultiHeadAttention` on the CPU provider could consume attacker-controlled `cache_indirection` values as beam indices without validating that each element stayed within `[0, beam_width)`. That let malformed models compute offsets past the past key/value buffers. This change rejects invalid indices up front and adds focused tests for the failure path. ## Key Changes - add shared CPU validation in `AttentionCPUBase::ApplyAttentionWithBeams` so the beam path fails before any past-key or past-value reads occur - report an `INVALID_ARGUMENT` error that identifies the offending beam index and its position - validate that an explicit decoder `beam_width` input matches `cache_indirection` dimension 1 when both are present - add contrib-op tests that exercise invalid cache indirection values on the CPU execution provider ## Testing - `lintrunner -a` - `cd build/Linux/Debug && make -j4 CMakeFiles/onnxruntime_providers.dir/home/tlwu/onnxruntime/onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc.o CMakeFiles/onnxruntime_providers.dir/home/tlwu/onnxruntime/onnxruntime/contrib_ops/cpu/bert/decoder_masked_multihead_attention.cc.o CMakeFiles/onnxruntime_provider_test.dir/home/tlwu/onnxruntime/onnxruntime/test/contrib_ops/multihead_attention_op_test.cc.o CMakeFiles/onnxruntime_provider_test.dir/home/tlwu/onnxruntime/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc.o` - full `onnxruntime_provider_test` relink/run was not completed locally --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> --- .../contrib_ops/cpu/bert/attention_cpu_base.h | 30 +++++++++++ .../decoder_masked_multihead_attention.cc | 14 +++++ ...oder_masked_multihead_attention_op_test.cc | 28 ++++++++++ .../multihead_attention_op_test.cc | 52 +++++++++++++++++++ 4 files changed, 124 insertions(+) diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h index 077090bb10911..79149f9a84dbe 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h @@ -149,6 +149,9 @@ class AttentionCPUBase : public AttentionBase { OpKernelContext* context, int beam_width, Tensor* output_qk) const { + ORT_RETURN_IF_ERROR(ValidateCacheIndirectionValues(cache_indir->Data(), batch_size, beam_width, + past_sequence_length, max_sequence_length)); + AllocatorPtr allocator; ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&allocator)); @@ -186,6 +189,33 @@ class AttentionCPUBase : public AttentionBase { } private: + static Status ValidateCacheIndirectionValues(const int32_t* cache_indirection_data, + int batch_beam_size, + int beam_width, + int past_sequence_length, + int max_sequence_length) { + if (cache_indirection_data == nullptr || beam_width <= 0 || past_sequence_length <= 0) { + return Status::OK(); + } + + for (int batch_beam_index = 0; batch_beam_index < batch_beam_size; ++batch_beam_index) { + const int32_t* beam_indices = cache_indirection_data + + static_cast(batch_beam_index) * max_sequence_length; + for (int position = 0; position < past_sequence_length; ++position) { + const int32_t beam_index = beam_indices[position]; + if (beam_index < 0 || beam_index >= beam_width) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "cache_indirection beam index out of range. Expected [0, ", beam_width, + "), got ", beam_index, + " at flattened batch_beam index ", batch_beam_index, + ", sequence position ", position); + } + } + } + + return Status::OK(); + } + // Helper function to compute the attention probs. It does 2 things: // attention_probs(B, N, S, T) = 1/sqrt(H) x Q(B, N, S, H) x K'(B, N, T, H -> B, N, H, T) + // 1 x mask_data(B, N, S, T) diff --git a/onnxruntime/contrib_ops/cpu/bert/decoder_masked_multihead_attention.cc b/onnxruntime/contrib_ops/cpu/bert/decoder_masked_multihead_attention.cc index 0d2de59c05394..de6f47ca2626c 100644 --- a/onnxruntime/contrib_ops/cpu/bert/decoder_masked_multihead_attention.cc +++ b/onnxruntime/contrib_ops/cpu/bert/decoder_masked_multihead_attention.cc @@ -178,6 +178,20 @@ Status DecoderMaskedMultiHeadAttention::Compute(OpKernelContext* context) con "If beam width is greater than 1, then cache indirection buffer MUST be present"); } + if (cache_indir != nullptr) { + // Read beam width from cache_indirection shape directly. + // DecoderMaskedMultiHeadAttentionParameters shadows AttentionParameters::beam_width, + // so the value set by CheckInputs on the base class is not visible here. + int cache_beam_width = static_cast(cache_indir->Shape().GetDims()[1]); + if (beam_width != nullptr && beam_width_value != cache_beam_width) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'beam_width' should match cache_indirection dimension 1, got ", + beam_width_value, " and ", cache_beam_width); + } + + beam_width_value = cache_beam_width; + } + AllocatorPtr allocator; ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&allocator)); diff --git a/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc b/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc index 7cdbad3ef80a7..2451f7e03a281 100644 --- a/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc @@ -933,5 +933,33 @@ TEST(DecoderMaskedMultiHeadAttentionTest, cpu_self_attn_fp32) { TestDecoderMaskedMultiHeadAttention(/* is_cross_attn = */ false, /* use_cuda = */ false); } +TEST(DecoderMaskedMultiHeadAttentionTest, cpu_cache_indirection_beam_index_out_of_range) { + OpTester tester("DecoderMaskedMultiHeadAttention", 1, onnxruntime::kMSDomain); + tester.AddAttribute("num_heads", 1); + tester.AddAttribute("past_present_share_buffer", 1); + + tester.AddInput("query", {2, 1, 4}, std::vector(8, 0.1f)); + tester.AddInput("key", {2, 1, 4}, std::vector(8, 0.2f)); + tester.AddInput("value", {2, 1, 4}, std::vector(8, 0.3f)); + tester.AddOptionalInputEdge(); + tester.AddOptionalInputEdge(); + tester.AddInput("past_key", {2, 1, 4, 4}, std::vector(32, 0.4f)); + tester.AddInput("past_value", {2, 1, 4, 4}, std::vector(32, 0.5f)); + tester.AddInput("past_sequence_length", {1}, {2}); + tester.AddInput("beam_width", {1}, {2}); + tester.AddInput("cache_indirection", {1, 2, 4}, {0, 2, 0, 0, 0, 0, 0, 0}); + tester.AddOptionalInputEdge(); + + tester.AddOutput("output", {2, 1, 4}, std::vector(8, 0.0f)); + tester.AddOutput("present_key", {2, 1, 4, 4}, std::vector(32, 0.0f)); + tester.AddOutput("present_value", {2, 1, 4, 4}, std::vector(32, 0.0f)); + tester.AddOptionalOutputEdge(); + + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + tester.Run(OpTester::ExpectResult::kExpectFailure, "cache_indirection beam index out of range", + {}, nullptr, &execution_providers); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/multihead_attention_op_test.cc b/onnxruntime/test/contrib_ops/multihead_attention_op_test.cc index c740959105977..9c974e03119f9 100644 --- a/onnxruntime/test/contrib_ops/multihead_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/multihead_attention_op_test.cc @@ -562,6 +562,58 @@ static void RunMultiHeadAttentionTests(AttentionTestData& data, } } +TEST(MultiHeadAttentionTest, CacheIndirectionBeamIndexOutOfRange) { + OpTester tester("MultiHeadAttention", 1, onnxruntime::kMSDomain); + tester.AddAttribute("num_heads", 1); + + tester.AddInput("query", {2, 1, 4}, std::vector(8, 0.1f)); + tester.AddInput("key", {2, 1, 4}, std::vector(8, 0.2f)); + tester.AddInput("value", {2, 1, 4}, std::vector(8, 0.3f)); + tester.AddOptionalInputEdge(); + tester.AddOptionalInputEdge(); + tester.AddOptionalInputEdge(); + tester.AddInput("past_key", {2, 1, 4, 4}, std::vector(32, 0.4f)); + tester.AddInput("past_value", {2, 1, 4, 4}, std::vector(32, 0.5f)); + tester.AddInput("past_sequence_length", {1}, {2}); + tester.AddInput("cache_indirection", {1, 2, 4}, {0, 2, 0, 0, 0, 0, 0, 0}); + + tester.AddOutput("output", {2, 1, 4}, std::vector(8, 0.0f)); + tester.AddOutput("present_key", {2, 1, 4, 4}, std::vector(32, 0.0f)); + tester.AddOutput("present_value", {2, 1, 4, 4}, std::vector(32, 0.0f)); + tester.AddOptionalOutputEdge(); + + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + tester.Run(OpTester::ExpectResult::kExpectFailure, "cache_indirection beam index out of range", + {}, nullptr, &execution_providers); +} + +TEST(MultiHeadAttentionTest, CacheIndirectionBeamWidthOneInvalidIndex) { + OpTester tester("MultiHeadAttention", 1, onnxruntime::kMSDomain); + tester.AddAttribute("num_heads", 1); + + tester.AddInput("query", {2, 1, 4}, std::vector(8, 0.1f)); + tester.AddInput("key", {2, 1, 4}, std::vector(8, 0.2f)); + tester.AddInput("value", {2, 1, 4}, std::vector(8, 0.3f)); + tester.AddOptionalInputEdge(); + tester.AddOptionalInputEdge(); + tester.AddOptionalInputEdge(); + tester.AddInput("past_key", {2, 1, 4, 4}, std::vector(32, 0.4f)); + tester.AddInput("past_value", {2, 1, 4, 4}, std::vector(32, 0.5f)); + tester.AddInput("past_sequence_length", {1}, {2}); + tester.AddInput("cache_indirection", {2, 1, 4}, {0, 1, 0, 0, 0, 0, 0, 0}); + + tester.AddOutput("output", {2, 1, 4}, std::vector(8, 0.0f)); + tester.AddOutput("present_key", {2, 1, 4, 4}, std::vector(32, 0.0f)); + tester.AddOutput("present_value", {2, 1, 4, 4}, std::vector(32, 0.0f)); + tester.AddOptionalOutputEdge(); + + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + tester.Run(OpTester::ExpectResult::kExpectFailure, "cache_indirection beam index out of range", + {}, nullptr, &execution_providers); +} + // Test fused cross attention kernel // It requires head_size > 32 and head_size <= 64 for T4 GPU; hidden_size == v_hidden_size. TEST(MultiHeadAttentionTest, CrossAttention_Batch2_HeadSize40) { From cd05ef401bfdb34d65759d933860be067bad362e Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Tue, 19 May 2026 10:56:15 -0700 Subject: [PATCH 10/13] Fix int32 overflow in CUDA Cast and UnaryElementWise kernels for tensors with >2^31 elements (#28386) - [x] Fix `unary_elementwise_impl.cuh`: Change `CUDA_LONG` to `int64_t` for `N` parameter and loop index in `_UnaryElementWise` kernel, and fix `blocksPerGrid` calculation - [x] Fix `cast_op.cu`: Change `CUDA_LONG` to `int64_t` for `N` parameter and loop index in `CastKernelStd`, `CastKernelSat`, and `CudaCastPairwiseKernel` kernels, and remove `static_cast` truncation - [x] Use `size_t` for `pair_count` in CudaCastPairwise to avoid double conversion (review feedback) - [x] Rename test to `CastKernelCorrectness_ModerateSize` and add `CastKernel_Int64IndexArithmetic_NoOverflow` host-side test (review feedback) - [x] Merge from main to resolve conflicts with Float8E8M0 tests --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: tianleiwu <30328909+tianleiwu@users.noreply.github.com> Co-authored-by: justinchuby <11205048+justinchuby@users.noreply.github.com> Co-authored-by: Tianlei Wu --- .../cuda/cu_inc/unary_elementwise_impl.cuh | 12 ++-- .../core/providers/cuda/tensor/cast_op.cu | 44 ++++++++------ .../test/providers/cpu/tensor/cast_op_test.cc | 57 +++++++++++++++++++ 3 files changed, 90 insertions(+), 23 deletions(-) diff --git a/onnxruntime/core/providers/cuda/cu_inc/unary_elementwise_impl.cuh b/onnxruntime/core/providers/cuda/cu_inc/unary_elementwise_impl.cuh index c8ddbadb12fb2..5959482e5664e 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/unary_elementwise_impl.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/unary_elementwise_impl.cuh @@ -14,11 +14,11 @@ __global__ void _UnaryElementWise( const InT* input_data, OutT* output_data, const FuncT functor, - CUDA_LONG N) { - CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + int64_t N) { + int64_t start = static_cast(NumElementsPerThread) * NumThreadsPerBlock * blockIdx.x + threadIdx.x; InT value[NumElementsPerThread]; - CUDA_LONG id = start; + int64_t id = start; #pragma unroll for (int i = 0; i < NumElementsPerThread; i++) { if (id < N) { @@ -47,8 +47,10 @@ void UnaryElementWiseImpl( if (count == 0) // special case where there's a dim value of 0 in the shape return; - int blocksPerGrid = static_cast(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); - CUDA_LONG N = static_cast(count); + size_t blocksPerGridSize = CeilDiv(count, static_cast(GridDim::maxThreadsPerBlock) * GridDim::maxElementsPerThread); + ORT_ENFORCE(blocksPerGridSize <= static_cast(INT32_MAX), "Grid size exceeds CUDA limits"); + int blocksPerGrid = static_cast(blocksPerGridSize); + int64_t N = static_cast(count); _UnaryElementWise <<>>( input_data, diff --git a/onnxruntime/core/providers/cuda/tensor/cast_op.cu b/onnxruntime/core/providers/cuda/tensor/cast_op.cu index a8cd6caaa5d5f..c56d613e25241 100644 --- a/onnxruntime/core/providers/cuda/tensor/cast_op.cu +++ b/onnxruntime/core/providers/cuda/tensor/cast_op.cu @@ -220,8 +220,8 @@ struct CastStd { #endif // DISABLE_FLOAT4_TYPES template -__global__ void CastKernelStd(const InT* input, OutT* output, CUDA_LONG N, CastStd cast) { - CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; +__global__ void CastKernelStd(const InT* input, OutT* output, int64_t N, CastStd cast) { + int64_t id = static_cast(NumElementsPerThread) * NumThreadsPerBlock * blockIdx.x + threadIdx.x; #pragma unroll for (int i = 0; i < NumElementsPerThread; i++) { @@ -237,11 +237,13 @@ Status CudaCastStd(cudaStream_t stream, const InT* input, OutT* output, size_t n if (num_of_elements <= 0) return Status::OK(); - int blocksPerGrid = static_cast(CeilDiv(num_of_elements, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); + size_t blocksPerGridSize = CeilDiv(num_of_elements, static_cast(GridDim::maxThreadsPerBlock) * GridDim::maxElementsPerThread); + ORT_RETURN_IF_NOT(blocksPerGridSize <= static_cast(INT32_MAX), "Grid size exceeds CUDA limits"); + int blocksPerGrid = static_cast(blocksPerGridSize); CastKernelStd<<>>( input, output, - static_cast(num_of_elements), + static_cast(num_of_elements), CastStd()); return Status::OK(); } @@ -251,10 +253,10 @@ Status CudaCastStd(cudaStream_t stream, const InT* input, OutT* output, size_t n template __global__ void CudaCastPairwiseKernel(const InPairType* input, OutPairType* output, - CUDA_LONG pair_count, + int64_t pair_count, CastStd pair_caster, CastStd singleton_caster) { - CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + int64_t id = static_cast(NumElementsPerThread) * NumThreadsPerBlock * blockIdx.x + threadIdx.x; #pragma unroll for (int i = 0; i < NumElementsPerThread; i++) { @@ -284,9 +286,11 @@ Status CudaCastPairwise(cudaStream_t stream, const Float4E2M1x2* input, float* o bool is_odd = (num_of_elements & 0x01) != 0; - int pair_count = static_cast(num_of_elements / 2); + size_t pair_count = num_of_elements / 2; - int blocksPerGrid = static_cast(CeilDiv(pair_count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); + size_t blocksPerGridSize = CeilDiv(pair_count, static_cast(GridDim::maxThreadsPerBlock) * GridDim::maxElementsPerThread); + ORT_RETURN_IF_NOT(blocksPerGridSize <= static_cast(INT32_MAX), "Grid size exceeds CUDA limits"); + int blocksPerGrid = static_cast(blocksPerGridSize); if (pair_count == 0) { blocksPerGrid = 1; @@ -296,14 +300,14 @@ Status CudaCastPairwise(cudaStream_t stream, const Float4E2M1x2* input, float* o CudaCastPairwiseKernel <<>>( - input, reinterpret_cast(output), pair_count, + input, reinterpret_cast(output), static_cast(pair_count), CastStd(), CastStd()); } else { CudaCastPairwiseKernel <<>>( - input, reinterpret_cast(output), pair_count, + input, reinterpret_cast(output), static_cast(pair_count), CastStd(), CastStd()); } @@ -318,9 +322,11 @@ Status CudaCastPairwise(cudaStream_t stream, const float* input, Float4E2M1x2* o bool is_odd = (num_of_elements & 0x01) != 0; - int pair_count = static_cast(num_of_elements / 2); + size_t pair_count = num_of_elements / 2; - int blocksPerGrid = static_cast(CeilDiv(pair_count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); + size_t blocksPerGridSize = CeilDiv(pair_count, static_cast(GridDim::maxThreadsPerBlock) * GridDim::maxElementsPerThread); + ORT_RETURN_IF_NOT(blocksPerGridSize <= static_cast(INT32_MAX), "Grid size exceeds CUDA limits"); + int blocksPerGrid = static_cast(blocksPerGridSize); if (pair_count == 0) { blocksPerGrid = 1; @@ -330,14 +336,14 @@ Status CudaCastPairwise(cudaStream_t stream, const float* input, Float4E2M1x2* o CudaCastPairwiseKernel <<>>( - reinterpret_cast(input), output, pair_count, + reinterpret_cast(input), output, static_cast(pair_count), CastStd(), CastStd()); } else { CudaCastPairwiseKernel <<>>( - reinterpret_cast(input), output, pair_count, + reinterpret_cast(input), output, static_cast(pair_count), CastStd(), CastStd()); } @@ -353,8 +359,8 @@ template Status CudaCastPairwise(cudaStream_t stream, const #if !defined(DISABLE_FLOAT8_TYPES) template -__global__ void CastKernelSat(const InT* input, OutT* output, CUDA_LONG N, CastSat cast, bool saturate) { - CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; +__global__ void CastKernelSat(const InT* input, OutT* output, int64_t N, CastSat cast, bool saturate) { + int64_t id = static_cast(NumElementsPerThread) * NumThreadsPerBlock * blockIdx.x + threadIdx.x; #pragma unroll for (int i = 0; i < NumElementsPerThread; i++) { @@ -370,11 +376,13 @@ Status CudaCastSat(cudaStream_t stream, const InT* input, OutT* output, size_t n if (num_of_element <= 0) return Status::OK(); - int blocksPerGrid = static_cast(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); + size_t blocksPerGridSize = CeilDiv(num_of_element, static_cast(GridDim::maxThreadsPerBlock) * GridDim::maxElementsPerThread); + ORT_RETURN_IF_NOT(blocksPerGridSize <= static_cast(INT32_MAX), "Grid size exceeds CUDA limits"); + int blocksPerGrid = static_cast(blocksPerGridSize); CastKernelSat<<>>( input, output, - static_cast(num_of_element), + static_cast(num_of_element), CastSat(), saturate); return Status::OK(); diff --git a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc index 0e14bc59a09c9..038a8eaade116 100644 --- a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc @@ -3127,6 +3127,63 @@ TEST(CastOpTest, CopyCpuTensor_SubByteTypes_DistinctBuffers) { } } +// Correctness test for Cast kernel with a moderately large tensor. +// Exercises the same kernel code path as tensors > 2^31 elements but stays within +// CI GPU memory limits. For the actual overflow scenario, see the host-side test below. +TEST(CastOpTest, CastKernelCorrectness_ModerateSize) { + constexpr int64_t num_elements = 1 << 24; // 16M elements + const std::vector shape = {num_elements}; + + std::vector input(num_elements); + std::vector expected(num_elements); + for (int64_t i = 0; i < num_elements; ++i) { + input[i] = static_cast(i % 1000); + expected[i] = static_cast(i % 1000); + } + + TestCastOp(gsl::make_span(input), gsl::make_span(expected), shape); +} + +// Host-side regression test that verifies the grid launch arithmetic uses 64-bit +// types for element counts exceeding INT32_MAX. This validates the fix without +// needing to allocate > 8 GB of GPU memory. +// The fix changed: +// CUDA_LONG N = static_cast(count) // was int32 truncation +// to: +// int64_t N = static_cast(count) // correct 64-bit +TEST(CastOpTest, CastKernel_Int64IndexArithmetic_NoOverflow) { + // Simulate the grid launch calculation from UnaryElementWiseImpl / CudaCastStd + // with a count that exceeds INT32_MAX. + constexpr size_t count = static_cast(INT32_MAX) + 65536; // 2^31 + 65536 + constexpr int maxThreadsPerBlock = 256; + constexpr int maxElementsPerThread = 4; + + // Verify N is correctly represented (not truncated to int32) + int64_t N = static_cast(count); + ASSERT_GT(N, static_cast(INT32_MAX)); + ASSERT_EQ(N, static_cast(count)); + + // Verify blocksPerGrid calculation doesn't overflow + // (uses size_t arithmetic for the divisor) + size_t elements_per_block = static_cast(maxThreadsPerBlock) * maxElementsPerThread; + int blocksPerGrid = static_cast((count + elements_per_block - 1) / elements_per_block); + ASSERT_GT(blocksPerGrid, 0); + // For count = 2^31 + 65536, elements_per_block = 1024, we expect ~2M blocks + ASSERT_EQ(blocksPerGrid, static_cast((count + 1023) / 1024)); + + // Verify that the per-thread index calculation doesn't overflow in int64_t + // Simulate the last block's thread 0: id = NumElementsPerThread * NumThreadsPerBlock * (blocksPerGrid-1) + 0 + int64_t last_block_start = static_cast(maxElementsPerThread) * maxThreadsPerBlock * + (blocksPerGrid - 1); + ASSERT_GT(last_block_start, 0); // Positive (no overflow) + ASSERT_LE(last_block_start, N); // Within bounds + + // Verify the old int32 code would have failed: + // static_cast(count) would silently wrap + int32_t truncated_N = static_cast(count); + ASSERT_LT(truncated_N, 0); // Proves the old code was broken (wraps negative) +} + #if !defined(DISABLE_FLOAT8_TYPES) float FloatFromBits(uint32_t bits) { From 8c650d8146c32717096685681b071275a69bbabc Mon Sep 17 00:00:00 2001 From: Dhruvil Parikh <41384593+dparikh79@users.noreply.github.com> Date: Tue, 19 May 2026 14:02:35 -0400 Subject: [PATCH 11/13] Fix XNNPACK Gemm SIGSEGV on missing and scalar 'C' bias (#28546) ### Description Hardens the XNNPACK Gemm capability check against two SIGSEGV crashes during graph partitioning: one when the optional `C` input is omitted, one when `C` is a rank-0 (scalar) tensor. The check now guards the null `C` arg before calling `Shape()`, and rejects rank-0 `C` so the node falls back to the CPU EP cleanly. Thanks @kadu-v, the minimal Python repros made the root cause easy to confirm. Both reproduced as a hard crash on the first `InferenceSession` construction. ### Motivation and Context Fixes #28541 Fixes #28542 `Gemm::IsOnnxNodeSupported` dereferenced `C_arg->Shape()` without checking whether `C_arg` was non-null, so any Gemm without the optional bias segfaulted before the EP could decline the node. A rank-0 `C` then survived the existing checks and reached XNNPACK's fully-connected path, which doesn't implement scalar broadcast (there's already a TODO in that file noting it). That's the second SIGSEGV. ### Changes `onnxruntime/core/providers/xnnpack/math/gemm.cc`: - Null-check `C_arg` before reading its shape. Absent `C` is valid per the Gemm spec; treat it as "no bias". - Reject `C` with rank 0 from `IsOnnxNodeSupported` so the node falls through to CPU. Adding scalar broadcast support belongs with the TODO in the fully-connected path, not in the capability check. ### Testing Three regression tests in `onnxruntime/test/providers/xnnpack/xnnpack_basic_test.cc`: - `TestGemm_NoC_NoSegfault` builds a Gemm with the `C` input omitted. - `TestGemm_ScalarC_NoSegfault` builds a Gemm with a rank-0 `C`. - `TestGemm_EmptyC_NoSegfault` covers an empty-shape `C` edge case. Each test loads an `InferenceSession` with the XNNPACK EP registered and asserts no crash. I also suspect `Gemm`'s constructor has pre-existing crashes when `A` or `B` is 1-D, before the capability check even runs. Haven't reproduced it. Can file a follow-up if useful. Signed-off-by: Dhruvil --- .../core/providers/xnnpack/math/gemm.cc | 34 ++++++-- .../providers/xnnpack/xnnpack_basic_test.cc | 83 +++++++++++++++++++ 2 files changed, 109 insertions(+), 8 deletions(-) diff --git a/onnxruntime/core/providers/xnnpack/math/gemm.cc b/onnxruntime/core/providers/xnnpack/math/gemm.cc index 9b78e943122de..f8992b13c8f5d 100644 --- a/onnxruntime/core/providers/xnnpack/math/gemm.cc +++ b/onnxruntime/core/providers/xnnpack/math/gemm.cc @@ -36,6 +36,10 @@ bool Gemm::IsOnnxNodeSupported(const NodeUnit& node_unit, const GraphViewer& gra const NodeArg* A_arg = input_defs[0]; const NodeArg* B_arg = input_defs[1]; const NodeArg* C_arg = input_defs.size() == 2 ? nullptr : input_defs[2]; + // Single source of truth for "is C actually present?". Matches the kernel + // constructor's C_matrix_exists_ = C_arg && C_arg->Exists() contract and the + // has_bias convention used in xnnpack/nn/conv_base.cc. + const bool has_c = (C_arg != nullptr && C_arg->Exists()); // we only support float currently const auto* A_type = A_arg->TypeAsProto(); @@ -51,14 +55,13 @@ bool Gemm::IsOnnxNodeSupported(const NodeUnit& node_unit, const GraphViewer& gra break; } - if (input_defs.size() == 3 && !graph.IsConstantInitializer(C_arg->Name(), true)) { + if (has_c && !graph.IsConstantInitializer(C_arg->Name(), true)) { break; } // making sure we are dealing with MatMul const ONNX_NAMESPACE::TensorShapeProto* A_shape = A_arg->Shape(); const ONNX_NAMESPACE::TensorShapeProto* B_shape = B_arg->Shape(); - const ONNX_NAMESPACE::TensorShapeProto* C_shape = C_arg->Shape(); if (!A_shape || A_shape->dim_size() >= 3) { break; @@ -68,12 +71,27 @@ bool Gemm::IsOnnxNodeSupported(const NodeUnit& node_unit, const GraphViewer& gra break; } - if (!C_shape || C_shape->dim_size() >= 3) { - break; - } - - if (C_arg && C_arg->Exists() && (C_shape->dim(0).dim_value() != B_shape->dim(1).dim_value() && C_shape->dim(0).dim_value() != B_shape->dim(0).dim_value())) { - break; + // Optional C: if the input slot is absent (2-input Gemm) C_arg is null and we must not + // call Shape() on it. If C_arg exists but Exists() is false (empty optional input slot) + // we treat it identically: per ONNX, an empty optional input is equivalent to omitting + // the input. The kernel constructor's C_matrix_exists_ contract agrees. + if (has_c) { + const ONNX_NAMESPACE::TensorShapeProto* C_shape = C_arg->Shape(); + if (!C_shape || C_shape->dim_size() >= 3) { + break; + } + + // Rank-0 C would be out of bounds on the C_shape->dim(0) check below and the + // xnn_create_fully_connected_nc_* bias path requires a length-N vector, so reject + // and fall back to the CPU EP. + if (C_shape->dim_size() == 0) { + break; + } + + if (C_shape->dim(0).dim_value() != B_shape->dim(1).dim_value() && + C_shape->dim(0).dim_value() != B_shape->dim(0).dim_value()) { + break; + } } supported = true; diff --git a/onnxruntime/test/providers/xnnpack/xnnpack_basic_test.cc b/onnxruntime/test/providers/xnnpack/xnnpack_basic_test.cc index 9ca081a74c850..47c9978b3a9c8 100644 --- a/onnxruntime/test/providers/xnnpack/xnnpack_basic_test.cc +++ b/onnxruntime/test/providers/xnnpack/xnnpack_basic_test.cc @@ -574,6 +574,89 @@ TEST(XnnpackEP, DISABLED_TestResize_u8_and_s8_NHWC_pytorch_half_pixel) { // [ON {ExpectedEPNodeAssignment::Some, 1e-2f /* fp32_abs_err */}); } +// Regression test for https://github.com/microsoft/onnxruntime/issues/28541. +// A two-input Gemm (no optional C bias) used to dereference a null NodeArg pointer in +// Gemm::IsOnnxNodeSupported, segfaulting InferenceSession::Initialize before any kernel +// ran. The capability check must accept the missing-C case and let the node be assigned +// to XNNPACK without crashing. +TEST(XnnpackEP, TestGemm_NoC_NoSegfault) { + const std::vector a_shape = {2, 3}; + const std::vector b_shape = {3, 4}; + auto modelBuilder = [&](ModelTestBuilder& builder) { + auto* input_a = builder.MakeInput(a_shape, -1.f, 1.f); + auto* input_b = builder.MakeInitializer(b_shape, -1.f, 1.f); + auto* output_arg = builder.MakeOutput(); + auto& gemm_node = builder.AddNode("Gemm", {input_a, input_b}, {output_arg}); + gemm_node.AddAttribute("alpha", 1.0f); + gemm_node.AddAttribute("beta", 1.0f); + gemm_node.AddAttribute("transA", static_cast(0)); + gemm_node.AddAttribute("transB", static_cast(0)); + }; + // ExpectedEPNodeAssignment::All asserts both that the session initialized without + // segfaulting AND that XNNPACK accepted the 2-input Gemm node. + RunModelTest(modelBuilder, "xnnpack_test_graph_gemm_no_c", + { + ExpectedEPNodeAssignment::All, + 1e-4f /* fp32_abs_err */, + }); +} + +// Regression test for https://github.com/microsoft/onnxruntime/issues/28542. +// A Gemm with a scalar (rank 0) C bias used to skip past the dim_size() >= 3 guard and +// then crash on C_shape->dim(0) inside Gemm::IsOnnxNodeSupported. The capability check +// must reject rank-0 C cleanly so the node falls back to the CPU EP and the session +// initializes without segfaulting. RunModelTest compares the XNNPACK + CPU run against +// the pure CPU baseline, so this also verifies numerical correctness end to end. +TEST(XnnpackEP, TestGemm_ScalarC_NoSegfault) { + const std::vector a_shape = {2, 3}; + const std::vector b_shape = {3, 4}; + auto modelBuilder = [&](ModelTestBuilder& builder) { + auto* input_a = builder.MakeInput(a_shape, -1.f, 1.f); + auto* input_b = builder.MakeInitializer(b_shape, -1.f, 1.f); + auto* input_c = builder.MakeScalarInitializer(0.5f); + auto* output_arg = builder.MakeOutput(); + auto& gemm_node = builder.AddNode("Gemm", {input_a, input_b, input_c}, {output_arg}); + gemm_node.AddAttribute("alpha", 1.0f); + gemm_node.AddAttribute("beta", 1.0f); + gemm_node.AddAttribute("transA", static_cast(0)); + gemm_node.AddAttribute("transB", static_cast(0)); + }; + RunModelTest(modelBuilder, "xnnpack_test_graph_gemm_scalar_c", + { + ExpectedEPNodeAssignment::None, + 1e-4f /* fp32_abs_err */, + }); +} + +// Defense-in-depth regression test for the C_arg->Exists() == false branch. A 3-input +// Gemm whose C slot is an empty optional input (Exists() == false) is semantically +// equivalent to a 2-input Gemm per ONNX (empty optional input == omitted input). The +// XNNPACK support check now treats them identically: both are accepted and routed to +// the bias=nullptr path in xnn_create_fully_connected_nc_*. This locks in the +// consistency between the 2-input case (TestGemm_NoC_NoSegfault) and the empty-optional +// case, matching the kernel constructor's C_matrix_exists_ = C_arg && C_arg->Exists() +// contract. +TEST(XnnpackEP, TestGemm_EmptyC_NoSegfault) { + const std::vector a_shape = {2, 3}; + const std::vector b_shape = {3, 4}; + auto modelBuilder = [&](ModelTestBuilder& builder) { + auto* input_a = builder.MakeInput(a_shape, -1.f, 1.f); + auto* input_b = builder.MakeInitializer(b_shape, -1.f, 1.f); + auto* input_c = builder.MakeEmptyInput(); + auto* output_arg = builder.MakeOutput(); + auto& gemm_node = builder.AddNode("Gemm", {input_a, input_b, input_c}, {output_arg}); + gemm_node.AddAttribute("alpha", 1.0f); + gemm_node.AddAttribute("beta", 1.0f); + gemm_node.AddAttribute("transA", static_cast(0)); + gemm_node.AddAttribute("transB", static_cast(0)); + }; + RunModelTest(modelBuilder, "xnnpack_test_graph_gemm_empty_c", + { + ExpectedEPNodeAssignment::All, + 1e-4f /* fp32_abs_err */, + }); +} + #endif } // namespace test From 6a8f021beeecfb2544519ad413fb31c81d5e7435 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Tue, 19 May 2026 11:42:30 -0700 Subject: [PATCH 12/13] Validate sparse tensor external file paths (#28408) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description Adds path traversal validation for sparse tensors with external data, closing a gap where `SparseTensorProtoToDenseTensorProto` would read external files without checking whether the path escapes the model directory. ### Bug fix (pre-existing) - **`CopySparseData` indices size check**: The `raw_data().size()` check was wrong for external data (where `raw_data` is empty). Fixed by adding a pre-unpack `raw_data` size guard for inline data and a post-unpack `unpack_buffer` size check for all data sources. ### Tests - **Security tests** (tensorutils_test.cc): Path traversal blocked (values, indices), absolute path blocked (values, indices), zero-element regression (zero dense elements, zero NNZ). All create escaping files and assert specifically for `"escapes"` error. - **Positive tests** (sparse_kernels_test.cc): 7 end-to-end tests for legitimate sparse tensors with external data — external values, external indices (INT64/INT32/INT16/INT8), both external (rank-1 and rank-2 COO). ### Known limitation (deferred) ORT_MEM_ADDR in-memory external data for sparse tensors can trigger arbitrary memory reads. This is a separate issue from path validation — `LoadSparseInitializerOrtFormat` legitimately uses in-memory markers for ORT-format models, so blanket rejection would break functionality. Should be addressed in a separate PR. ## Motivation and Context A malicious ONNX model could use `../` path traversal in sparse tensor external data locations to read arbitrary files outside the model directory. Dense tensors already had this validation; sparse tensors did not. --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../core/framework/tensorprotoutils.cc | 80 ++++- .../test/framework/sparse_kernels_test.cc | 278 +++++++++++++++ .../test/framework/tensorutils_test.cc | 318 ++++++++++++++++++ 3 files changed, 664 insertions(+), 12 deletions(-) diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 6f73456742160..275fa837a7257 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -2097,6 +2097,33 @@ void MakeCpuTensorCopy(const Tensor& src_tensor, Tensor& dst_tensor) { } #if !defined(DISABLE_SPARSE_TENSORS) + +// Validates that a TensorProto's external data path does not escape the model directory. +// Also validates that the file exists when filesystem access is available (skipped on WASM without a virtual FS). +// Returns Status::OK() (no-op) for tensors that do not use file-based external data. +static Status ValidateExternalDataPathForTensor(const ONNX_NAMESPACE::TensorProto& tensor_proto, + const std::filesystem::path& model_path) { + // Gates on data_location == EXTERNAL directly instead of using HasExternalData()/HasExternalDataInFile(), + // which also require data_type != UNDEFINED. That check is appropriate for data processing (can't unpack + // without a type), but too narrow for security validation: we must validate any declared external path + // regardless of data_type. + if (tensor_proto.data_location() != ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) { + return Status::OK(); + } + + std::unique_ptr external_data_info; + ORT_RETURN_IF_ERROR(ExternalDataInfo::Create(tensor_proto.external_data(), external_data_info)); + const auto& rel_path = external_data_info->GetRelPath(); + + // In-memory external data uses special marker locations — skip file path validation for those. + if (rel_path == kTensorProtoLittleEndianMemoryAddressTag || + rel_path == kTensorProtoNativeEndianMemoryAddressTag) { + return Status::OK(); + } + + return utils::ValidateExternalDataPath(model_path, rel_path); +} + static Status CopySparseData(const std::string& name, int64_t nnz_elements, const ONNX_NAMESPACE::TensorProto& indices, @@ -2115,10 +2142,18 @@ static Status CopySparseData(const std::string& name, switch (indices.data_type()) { case ONNX_NAMESPACE::TensorProto_DataType_INT64: if (needs_unpack) { - ORT_RETURN_IF_NOT(indices.raw_data().size() == SafeInt(indices_elements) * sizeof(int64_t), - "Sparse tensor: ", name, " indices raw data size does not match expected: ", - indices_elements * sizeof(int64_t)); + // For inline raw_data, validate size before unpacking to avoid a large allocation from a + // malformed tensor with small indices shape but oversized raw_data. For external data, + // raw_data is empty so we can only validate after unpacking. + if (!utils::HasExternalData(indices)) { + ORT_RETURN_IF_NOT(indices.raw_data().size() == SafeInt(indices_elements) * sizeof(int64_t), + "Sparse tensor: ", name, " indices raw data size does not match expected: ", + indices_elements * sizeof(int64_t)); + } ORT_RETURN_IF_ERROR(UnpackInitializerData(indices, model_path, unpack_buffer)); + ORT_RETURN_IF_NOT(unpack_buffer.size() == SafeInt(indices_elements) * sizeof(int64_t), + "Sparse tensor: ", name, " indices data size does not match expected: ", + indices_elements * sizeof(int64_t)); indices_data = ReinterpretAsSpan(gsl::make_span(unpack_buffer)); } else { ORT_RETURN_IF_NOT(indices.int64_data_size() == indices_elements, @@ -2129,10 +2164,15 @@ static Status CopySparseData(const std::string& name, break; case ONNX_NAMESPACE::TensorProto_DataType_INT32: { if (needs_unpack) { - ORT_RETURN_IF_NOT(indices.raw_data().size() == SafeInt(indices_elements) * sizeof(int32_t), - "Sparse tensor: ", name, " indices raw data size does not match expected: ", - indices_elements * sizeof(int32_t)); + if (!utils::HasExternalData(indices)) { + ORT_RETURN_IF_NOT(indices.raw_data().size() == SafeInt(indices_elements) * sizeof(int32_t), + "Sparse tensor: ", name, " indices raw data size does not match expected: ", + indices_elements * sizeof(int32_t)); + } ORT_RETURN_IF_ERROR(UnpackInitializerData(indices, model_path, unpack_buffer)); + ORT_RETURN_IF_NOT(unpack_buffer.size() == SafeInt(indices_elements) * sizeof(int32_t), + "Sparse tensor: ", name, " indices data size does not match expected: ", + indices_elements * sizeof(int32_t)); auto int32_span = ReinterpretAsSpan(gsl::make_span(unpack_buffer)); indices_values.insert(indices_values.cend(), int32_span.begin(), int32_span.end()); unpack_buffer.clear(); @@ -2148,10 +2188,15 @@ static Status CopySparseData(const std::string& name, } case ONNX_NAMESPACE::TensorProto_DataType_INT16: { if (needs_unpack) { - ORT_RETURN_IF_NOT(indices.raw_data().size() == SafeInt(indices_elements) * sizeof(int16_t), - "Sparse tensor: ", name, " indices raw data size does not match expected: ", - indices_elements * sizeof(int16_t)); + if (!utils::HasExternalData(indices)) { + ORT_RETURN_IF_NOT(indices.raw_data().size() == SafeInt(indices_elements) * sizeof(int16_t), + "Sparse tensor: ", name, " indices raw data size does not match expected: ", + indices_elements * sizeof(int16_t)); + } ORT_RETURN_IF_ERROR(UnpackInitializerData(indices, model_path, unpack_buffer)); + ORT_RETURN_IF_NOT(unpack_buffer.size() == SafeInt(indices_elements) * sizeof(int16_t), + "Sparse tensor: ", name, " indices data size does not match expected: ", + indices_elements * sizeof(int16_t)); auto int16_span = ReinterpretAsSpan(gsl::make_span(unpack_buffer)); indices_values.insert(indices_values.cend(), int16_span.begin(), int16_span.end()); unpack_buffer.clear(); @@ -2167,10 +2212,15 @@ static Status CopySparseData(const std::string& name, } case ONNX_NAMESPACE::TensorProto_DataType_INT8: { if (needs_unpack) { - ORT_RETURN_IF_NOT(indices.raw_data().size() == narrow(indices_elements), - "Sparse tensor: ", name, " indices raw data size does not match expected: ", - indices_elements * sizeof(int8_t)); + if (!utils::HasExternalData(indices)) { + ORT_RETURN_IF_NOT(indices.raw_data().size() == narrow(indices_elements), + "Sparse tensor: ", name, " indices raw data size does not match expected: ", + indices_elements * sizeof(int8_t)); + } ORT_RETURN_IF_ERROR(UnpackInitializerData(indices, model_path, unpack_buffer)); + ORT_RETURN_IF_NOT(unpack_buffer.size() == narrow(indices_elements), + "Sparse tensor: ", name, " indices data size does not match expected: ", + indices_elements * sizeof(int8_t)); auto int8_span = ReinterpretAsSpan(gsl::make_span(unpack_buffer)); indices_values.insert(indices_values.cend(), int8_span.begin(), int8_span.end()); unpack_buffer.clear(); @@ -2318,6 +2368,12 @@ common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseT } } + // Validate external data paths before any early returns or allocations. + // This ensures malicious paths are rejected even for zero-element tensors, + // and prevents large allocations before an invalid path is caught. + ORT_RETURN_IF_ERROR(ValidateExternalDataPathForTensor(sparse_values, model_path)); + ORT_RETURN_IF_ERROR(ValidateExternalDataPathForTensor(indices, model_path)); + if (dense_elements == 0) { // if there are no elements in the dense tensor, we can return early with an empty tensor proto return status; diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc index 59ec8f51b4f4e..9efaed8ac7bd6 100644 --- a/onnxruntime/test/framework/sparse_kernels_test.cc +++ b/onnxruntime/test/framework/sparse_kernels_test.cc @@ -2539,6 +2539,284 @@ TEST(SparseTensorConversionTests, SparseCooToDense_2DRowOutOfRange) { EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid COO 2D index")); } +// Positive tests for SparseTensorProtoToDenseTensorProto with external data. +// These verify end-to-end conversion succeeds when values and/or indices are stored +// in legitimate external files within the model directory. + +// Helper: write data to a temp file and configure a TensorProto to reference it as external data. +// The file is created in the current working directory using CreateTestFile. +// The ScopedFileDeleter is assigned immediately after file creation to ensure cleanup on any failure. +template +static void SetupExternalDataTensor(TensorProto_DataType type, + const std::vector& data, + PathString& filename, + TensorProto& tensor_proto, + ScopedFileDeleter& file_deleter) { + size_t size_in_bytes = data.size() * sizeof(T); + std::vector le_data(size_in_bytes); + + auto src_span = gsl::make_span(data.data(), data.size()); + auto dst_span = gsl::make_span(le_data.data(), le_data.size()); + ASSERT_STATUS_OK(onnxruntime::utils::WriteLittleEndian(src_span, dst_span)); + + FILE* fp; + CreateTestFile(fp, filename); + file_deleter = ScopedFileDeleter(filename); + ASSERT_EQ(size_in_bytes, fwrite(le_data.data(), 1, size_in_bytes, fp)); + ASSERT_EQ(0, fclose(fp)); + + tensor_proto.set_data_type(type); + tensor_proto.set_data_location(TensorProto_DataLocation_EXTERNAL); + + auto* loc = tensor_proto.mutable_external_data()->Add(); + loc->set_key("location"); + loc->set_value(ToUTF8String(filename)); + + auto* len = tensor_proto.mutable_external_data()->Add(); + len->set_key("length"); + len->set_value(std::to_string(size_in_bytes)); +} + +// External values + inline indices (INT64), rank-1 COO. +TEST(SparseTensorConversionTests, SparseTensorProtoToDense_ExternalValues_InlineIndices) { + // Dense shape [2, 3] = 6 elements. + // NNZ=3 values at linear indices [0, 2, 5]. + // Expected dense: [1.0, 0, 2.0, 0, 0, 3.0] + std::vector values = {1.0f, 2.0f, 3.0f}; + PathString values_file(ORT_TSTR("ext_val_XXXXXX")); + + SparseTensorProto sparse; + sparse.add_dims(2); + sparse.add_dims(3); + + ScopedFileDeleter values_deleter; + SetupExternalDataTensor(TensorProto_DataType_FLOAT, values, values_file, *sparse.mutable_values(), + values_deleter); + sparse.mutable_values()->set_name("ext_values_test"); + sparse.mutable_values()->add_dims(3); // NNZ + + auto* indices = sparse.mutable_indices(); + indices->set_data_type(TensorProto_DataType_INT64); + indices->add_dims(3); + indices->add_int64_data(0); + indices->add_int64_data(2); + indices->add_int64_data(5); + + // model_path in CWD so external files are within the model directory + std::filesystem::path model_path = std::filesystem::current_path() / "model.onnx"; + TensorProto dense; + ASSERT_STATUS_OK(utils::SparseTensorProtoToDenseTensorProto(sparse, model_path, dense)); + + ASSERT_EQ(dense.dims_size(), 2); + EXPECT_EQ(dense.dims(0), 2); + EXPECT_EQ(dense.dims(1), 3); + + std::vector unpacked(6); + ASSERT_STATUS_OK(utils::UnpackTensor(dense, model_path, unpacked.data(), unpacked.size())); + std::vector expected = {1.0f, 0.0f, 2.0f, 0.0f, 0.0f, 3.0f}; + EXPECT_EQ(unpacked, expected); +} + +// Inline values + external indices (INT64), rank-1 COO. +TEST(SparseTensorConversionTests, SparseTensorProtoToDense_InlineValues_ExternalIndicesInt64) { + // Dense shape [4] = 4 elements. + // NNZ=2 at indices [1, 3]. + // Expected dense: [0, 10.0, 0, 20.0] + std::vector indices_data = {1, 3}; + PathString indices_file(ORT_TSTR("ext_idx_XXXXXX")); + + SparseTensorProto sparse; + sparse.add_dims(4); + + auto* values = sparse.mutable_values(); + values->set_name("ext_indices_test"); + values->set_data_type(TensorProto_DataType_FLOAT); + values->add_dims(2); + values->add_float_data(10.0f); + values->add_float_data(20.0f); + + ScopedFileDeleter indices_deleter; + SetupExternalDataTensor(TensorProto_DataType_INT64, indices_data, indices_file, + *sparse.mutable_indices(), indices_deleter); + sparse.mutable_indices()->add_dims(2); + + std::filesystem::path model_path = std::filesystem::current_path() / "model.onnx"; + TensorProto dense; + ASSERT_STATUS_OK(utils::SparseTensorProtoToDenseTensorProto(sparse, model_path, dense)); + + std::vector unpacked(4); + ASSERT_STATUS_OK(utils::UnpackTensor(dense, model_path, unpacked.data(), unpacked.size())); + std::vector expected = {0.0f, 10.0f, 0.0f, 20.0f}; + EXPECT_EQ(unpacked, expected); +} + +// Inline values + external indices (INT32), rank-1 COO. +TEST(SparseTensorConversionTests, SparseTensorProtoToDense_InlineValues_ExternalIndicesInt32) { + std::vector indices_data = {0, 3}; + PathString indices_file(ORT_TSTR("ext_i32_XXXXXX")); + + SparseTensorProto sparse; + sparse.add_dims(2); + sparse.add_dims(2); + + auto* values = sparse.mutable_values(); + values->set_name("ext_int32_idx_test"); + values->set_data_type(TensorProto_DataType_FLOAT); + values->add_dims(2); + values->add_float_data(5.0f); + values->add_float_data(6.0f); + + ScopedFileDeleter indices_deleter; + SetupExternalDataTensor(TensorProto_DataType_INT32, indices_data, indices_file, + *sparse.mutable_indices(), indices_deleter); + sparse.mutable_indices()->add_dims(2); + + std::filesystem::path model_path = std::filesystem::current_path() / "model.onnx"; + TensorProto dense; + ASSERT_STATUS_OK(utils::SparseTensorProtoToDenseTensorProto(sparse, model_path, dense)); + + std::vector unpacked(4); + ASSERT_STATUS_OK(utils::UnpackTensor(dense, model_path, unpacked.data(), unpacked.size())); + std::vector expected = {5.0f, 0.0f, 0.0f, 6.0f}; + EXPECT_EQ(unpacked, expected); +} + +// Inline values + external indices (INT16), rank-1 COO. +TEST(SparseTensorConversionTests, SparseTensorProtoToDense_InlineValues_ExternalIndicesInt16) { + std::vector indices_data = {1, 2}; + PathString indices_file(ORT_TSTR("ext_i16_XXXXXX")); + + SparseTensorProto sparse; + sparse.add_dims(4); + + auto* values = sparse.mutable_values(); + values->set_name("ext_int16_idx_test"); + values->set_data_type(TensorProto_DataType_FLOAT); + values->add_dims(2); + values->add_float_data(7.0f); + values->add_float_data(8.0f); + + ScopedFileDeleter indices_deleter; + SetupExternalDataTensor(TensorProto_DataType_INT16, indices_data, indices_file, + *sparse.mutable_indices(), indices_deleter); + sparse.mutable_indices()->add_dims(2); + + std::filesystem::path model_path = std::filesystem::current_path() / "model.onnx"; + TensorProto dense; + ASSERT_STATUS_OK(utils::SparseTensorProtoToDenseTensorProto(sparse, model_path, dense)); + + std::vector unpacked(4); + ASSERT_STATUS_OK(utils::UnpackTensor(dense, model_path, unpacked.data(), unpacked.size())); + std::vector expected = {0.0f, 7.0f, 8.0f, 0.0f}; + EXPECT_EQ(unpacked, expected); +} + +// Inline values + external indices (INT8), rank-1 COO. +TEST(SparseTensorConversionTests, SparseTensorProtoToDense_InlineValues_ExternalIndicesInt8) { + std::vector indices_data = {0, 2}; + PathString indices_file(ORT_TSTR("ext_i8_XXXXXX")); + + SparseTensorProto sparse; + sparse.add_dims(3); + + auto* values = sparse.mutable_values(); + values->set_name("ext_int8_idx_test"); + values->set_data_type(TensorProto_DataType_FLOAT); + values->add_dims(2); + values->add_float_data(9.0f); + values->add_float_data(11.0f); + + ScopedFileDeleter indices_deleter; + SetupExternalDataTensor(TensorProto_DataType_INT8, indices_data, indices_file, + *sparse.mutable_indices(), indices_deleter); + sparse.mutable_indices()->add_dims(2); + + std::filesystem::path model_path = std::filesystem::current_path() / "model.onnx"; + TensorProto dense; + ASSERT_STATUS_OK(utils::SparseTensorProtoToDenseTensorProto(sparse, model_path, dense)); + + std::vector unpacked(3); + ASSERT_STATUS_OK(utils::UnpackTensor(dense, model_path, unpacked.data(), unpacked.size())); + std::vector expected = {9.0f, 0.0f, 11.0f}; + EXPECT_EQ(unpacked, expected); +} + +// Both external values and external indices (INT64), rank-1 COO. +TEST(SparseTensorConversionTests, SparseTensorProtoToDense_ExternalValues_ExternalIndicesInt64) { + // Dense shape [3, 2] = 6 elements. + // NNZ=2 at linear indices [1, 4]. + // Expected dense: [0, 100.0, 0, 0, 200.0, 0] + std::vector values_data = {100.0f, 200.0f}; + std::vector indices_data = {1, 4}; + PathString values_file(ORT_TSTR("ext_bv_XXXXXX")); + PathString indices_file(ORT_TSTR("ext_bi_XXXXXX")); + + SparseTensorProto sparse; + sparse.add_dims(3); + sparse.add_dims(2); + + ScopedFileDeleter values_deleter; + SetupExternalDataTensor(TensorProto_DataType_FLOAT, values_data, values_file, *sparse.mutable_values(), + values_deleter); + sparse.mutable_values()->set_name("ext_both_test"); + sparse.mutable_values()->add_dims(2); + + ScopedFileDeleter indices_deleter; + SetupExternalDataTensor(TensorProto_DataType_INT64, indices_data, indices_file, + *sparse.mutable_indices(), indices_deleter); + sparse.mutable_indices()->add_dims(2); + + std::filesystem::path model_path = std::filesystem::current_path() / "model.onnx"; + TensorProto dense; + ASSERT_STATUS_OK(utils::SparseTensorProtoToDenseTensorProto(sparse, model_path, dense)); + + ASSERT_EQ(dense.dims_size(), 2); + EXPECT_EQ(dense.dims(0), 3); + EXPECT_EQ(dense.dims(1), 2); + + std::vector unpacked(6); + ASSERT_STATUS_OK(utils::UnpackTensor(dense, model_path, unpacked.data(), unpacked.size())); + std::vector expected = {0.0f, 100.0f, 0.0f, 0.0f, 200.0f, 0.0f}; + EXPECT_EQ(unpacked, expected); +} + +// Both external values and external indices (INT64), rank-2 COO indices. +TEST(SparseTensorConversionTests, SparseTensorProtoToDense_ExternalValues_ExternalIndicesInt64_Rank2) { + // Dense shape [3, 3] = 9 elements. + // NNZ=2 with 2D indices: [[0, 2], [2, 0]] -> positions (0,2)=2, (2,0)=6. + // Expected dense: [0, 0, 50.0, 0, 0, 0, 60.0, 0, 0] + std::vector values_data = {50.0f, 60.0f}; + // Rank-2 indices: flattened as [row0, col0, row1, col1] + std::vector indices_data = {0, 2, 2, 0}; + PathString values_file(ORT_TSTR("ext_r2v_XXXXXX")); + PathString indices_file(ORT_TSTR("ext_r2i_XXXXXX")); + + SparseTensorProto sparse; + sparse.add_dims(3); + sparse.add_dims(3); + + ScopedFileDeleter values_deleter; + SetupExternalDataTensor(TensorProto_DataType_FLOAT, values_data, values_file, *sparse.mutable_values(), + values_deleter); + sparse.mutable_values()->set_name("ext_rank2_test"); + sparse.mutable_values()->add_dims(2); // NNZ + + ScopedFileDeleter indices_deleter; + SetupExternalDataTensor(TensorProto_DataType_INT64, indices_data, indices_file, + *sparse.mutable_indices(), indices_deleter); + sparse.mutable_indices()->add_dims(2); // NNZ + sparse.mutable_indices()->add_dims(2); // rank of dense tensor + + std::filesystem::path model_path = std::filesystem::current_path() / "model.onnx"; + TensorProto dense; + ASSERT_STATUS_OK(utils::SparseTensorProtoToDenseTensorProto(sparse, model_path, dense)); + + std::vector unpacked(9); + ASSERT_STATUS_OK(utils::UnpackTensor(dense, model_path, unpacked.data(), unpacked.size())); + std::vector expected = {0.0f, 0.0f, 50.0f, 0.0f, 0.0f, 0.0f, 60.0f, 0.0f, 0.0f}; + EXPECT_EQ(unpacked, expected); +} + #endif // !defined(DISABLE_SPARSE_TENSORS) } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/framework/tensorutils_test.cc b/onnxruntime/test/framework/tensorutils_test.cc index 71ac5b49e9718..06cc3ea6ad8d2 100644 --- a/onnxruntime/test/framework/tensorutils_test.cc +++ b/onnxruntime/test/framework/tensorutils_test.cc @@ -835,6 +835,324 @@ TEST_F(PathValidationTest, WeaklyCanonicalPathNtVolumeFallback_ResolvesDotDot) { } #endif // defined(_WIN32) +#if !defined(DISABLE_SPARSE_TENSORS) +// Regression test: SparseTensorProtoToDenseTensorProto must reject external_data paths +// that escape the model directory (path traversal via "../" in location). +TEST_F(PathValidationTest, SparseTensorExternalDataPathTraversalBlocked_Values) { + // Create model directory and a "secret" file outside it. + auto model_dir = base_dir_ / "model_dir"; + std::error_code ec; + std::filesystem::create_directories(model_dir, ec); + ASSERT_FALSE(ec) << "Failed to create model_dir: " << ec.message(); + + // Write known float data to a file outside the model directory. + auto secret_file = base_dir_ / "secret.bin"; + { + std::ofstream ofs(secret_file, std::ios::binary); + ASSERT_TRUE(ofs.is_open()) << "Failed to open " << secret_file; + float secret_data[] = {42.0f, 99.0f}; + ofs.write(reinterpret_cast(secret_data), sizeof(secret_data)); + ASSERT_TRUE(ofs.good()) << "Failed to write to " << secret_file; + } + + // Construct a SparseTensorProto whose values use external data with a path-traversal location. + ONNX_NAMESPACE::SparseTensorProto sparse; + sparse.add_dims(4); // dense shape: [4] + + // Values tensor: 2 non-zero float values stored in external file. + auto* values = sparse.mutable_values(); + values->set_name("sparse_test"); + values->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + values->add_dims(2); // 2 non-zero elements + values->set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL); + + auto* loc = values->add_external_data(); + loc->set_key("location"); + loc->set_value("../secret.bin"); // path traversal! + + auto* len_entry = values->add_external_data(); + len_entry->set_key("length"); + len_entry->set_value(std::to_string(2 * sizeof(float))); + + // Indices: positions 0 and 1 in the dense tensor. + auto* indices = sparse.mutable_indices(); + indices->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + indices->add_dims(2); + indices->add_int64_data(0); + indices->add_int64_data(1); + + // Attempt to convert — this should fail with a path validation error. + ONNX_NAMESPACE::TensorProto dense; + std::filesystem::path model_path = model_dir / "model.onnx"; + Status status = utils::SparseTensorProtoToDenseTensorProto(sparse, model_path, dense); + ASSERT_FALSE(status.IsOK()) << "SparseTensorProtoToDenseTensorProto should reject path-traversal " + "in values external_data location, but it succeeded (reading " + "arbitrary file outside model directory)."; + EXPECT_THAT(status.ErrorMessage(), ::testing::HasSubstr("escapes")); +} + +// Same as above but for path traversal in the indices external data. +TEST_F(PathValidationTest, SparseTensorExternalDataPathTraversalBlocked_Indices) { + auto model_dir = base_dir_ / "model_dir"; + std::error_code ec; + std::filesystem::create_directories(model_dir, ec); + ASSERT_FALSE(ec) << "Failed to create model_dir: " << ec.message(); + + // Write indices data (2 x int64) to a file outside the model directory. + auto secret_file = base_dir_ / "indices_secret.bin"; + { + std::ofstream ofs(secret_file, std::ios::binary); + ASSERT_TRUE(ofs.is_open()) << "Failed to open " << secret_file; + int64_t idx_data[] = {0, 1}; + ofs.write(reinterpret_cast(idx_data), sizeof(idx_data)); + ASSERT_TRUE(ofs.good()) << "Failed to write to " << secret_file; + } + + // Also need a valid values file inside the model directory. + auto values_file = model_dir / "values.bin"; + { + std::ofstream ofs(values_file, std::ios::binary); + ASSERT_TRUE(ofs.is_open()) << "Failed to open " << values_file; + float val_data[] = {1.0f, 2.0f}; + ofs.write(reinterpret_cast(val_data), sizeof(val_data)); + ASSERT_TRUE(ofs.good()) << "Failed to write to " << values_file; + } + + ONNX_NAMESPACE::SparseTensorProto sparse; + sparse.add_dims(4); + + // Values: legitimate external data within model directory. + auto* values = sparse.mutable_values(); + values->set_name("sparse_idx_test"); + values->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + values->add_dims(2); + values->set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL); + + auto* val_loc = values->add_external_data(); + val_loc->set_key("location"); + val_loc->set_value("values.bin"); + + auto* val_len = values->add_external_data(); + val_len->set_key("length"); + val_len->set_value(std::to_string(2 * sizeof(float))); + + // Indices: external data with path traversal. + auto* indices = sparse.mutable_indices(); + indices->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + indices->add_dims(2); + indices->set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL); + + auto* idx_loc = indices->add_external_data(); + idx_loc->set_key("location"); + idx_loc->set_value("../indices_secret.bin"); // path traversal! + + auto* idx_len = indices->add_external_data(); + idx_len->set_key("length"); + idx_len->set_value(std::to_string(2 * sizeof(int64_t))); + + ONNX_NAMESPACE::TensorProto dense; + std::filesystem::path model_path = model_dir / "model.onnx"; + Status status = utils::SparseTensorProtoToDenseTensorProto(sparse, model_path, dense); + ASSERT_FALSE(status.IsOK()) << "SparseTensorProtoToDenseTensorProto should reject path-traversal " + "in indices external_data location, but it succeeded."; + EXPECT_THAT(status.ErrorMessage(), ::testing::HasSubstr("escapes")); +} + +// Regression test: SparseTensorProtoToDenseTensorProto must reject absolute paths +// in values external_data location. +TEST_F(PathValidationTest, SparseTensorExternalDataAbsolutePathBlocked_Values) { + ONNX_NAMESPACE::SparseTensorProto sparse; + sparse.add_dims(4); + + auto* values = sparse.mutable_values(); + values->set_name("abs_path_test"); + values->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + values->add_dims(2); + values->set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL); + + auto* loc = values->add_external_data(); + loc->set_key("location"); + loc->set_value("/data.bin"); // absolute path + + auto* len_entry = values->add_external_data(); + len_entry->set_key("length"); + len_entry->set_value(std::to_string(2 * sizeof(float))); + + auto* indices = sparse.mutable_indices(); + indices->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + indices->add_dims(2); + indices->add_int64_data(0); + indices->add_int64_data(1); + + ONNX_NAMESPACE::TensorProto dense; + std::filesystem::path model_path = base_dir_ / "model.onnx"; + Status status = utils::SparseTensorProtoToDenseTensorProto(sparse, model_path, dense); + ASSERT_FALSE(status.IsOK()) << "SparseTensorProtoToDenseTensorProto should reject absolute path " + "in values external_data location."; + EXPECT_THAT(status.ErrorMessage(), ::testing::HasSubstr("Absolute path not allowed")); + +#ifdef _WIN32 + // Also verify Windows-style absolute path. + loc->set_value("C:\\data.bin"); + status = utils::SparseTensorProtoToDenseTensorProto(sparse, model_path, dense); + ASSERT_FALSE(status.IsOK()) << "SparseTensorProtoToDenseTensorProto should reject Windows absolute path " + "in values external_data location."; + EXPECT_THAT(status.ErrorMessage(), ::testing::HasSubstr("Absolute path not allowed")); +#endif +} + +// Regression test: SparseTensorProtoToDenseTensorProto must reject absolute paths +// in indices external_data location. +TEST_F(PathValidationTest, SparseTensorExternalDataAbsolutePathBlocked_Indices) { + // Create a valid values file inside base_dir_ so values validation passes. + auto values_file = base_dir_ / "values.bin"; + { + std::ofstream ofs(values_file, std::ios::binary); + ASSERT_TRUE(ofs.is_open()) << "Failed to open " << values_file; + float val_data[] = {1.0f, 2.0f}; + ofs.write(reinterpret_cast(val_data), sizeof(val_data)); + ASSERT_TRUE(ofs.good()) << "Failed to write to " << values_file; + } + + ONNX_NAMESPACE::SparseTensorProto sparse; + sparse.add_dims(4); + + // Values: legitimate external data within base_dir_. + auto* values = sparse.mutable_values(); + values->set_name("abs_path_idx_test"); + values->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + values->add_dims(2); + values->set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL); + + auto* val_loc = values->add_external_data(); + val_loc->set_key("location"); + val_loc->set_value("values.bin"); + + auto* val_len = values->add_external_data(); + val_len->set_key("length"); + val_len->set_value(std::to_string(2 * sizeof(float))); + + // Indices: external data with absolute path. + auto* indices = sparse.mutable_indices(); + indices->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + indices->add_dims(2); + indices->set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL); + + auto* idx_loc = indices->add_external_data(); + idx_loc->set_key("location"); + idx_loc->set_value("/data.bin"); // absolute path + + auto* idx_len = indices->add_external_data(); + idx_len->set_key("length"); + idx_len->set_value(std::to_string(2 * sizeof(int64_t))); + + ONNX_NAMESPACE::TensorProto dense; + std::filesystem::path model_path = base_dir_ / "model.onnx"; + Status status = utils::SparseTensorProtoToDenseTensorProto(sparse, model_path, dense); + ASSERT_FALSE(status.IsOK()) << "SparseTensorProtoToDenseTensorProto should reject absolute path " + "in indices external_data location."; + EXPECT_THAT(status.ErrorMessage(), ::testing::HasSubstr("Absolute path not allowed")); + +#ifdef _WIN32 + idx_loc->set_value("C:\\data.bin"); + status = utils::SparseTensorProtoToDenseTensorProto(sparse, model_path, dense); + ASSERT_FALSE(status.IsOK()) << "SparseTensorProtoToDenseTensorProto should reject Windows absolute path " + "in indices external_data location."; + EXPECT_THAT(status.ErrorMessage(), ::testing::HasSubstr("Absolute path not allowed")); +#endif +} + +// Regression test: validation must still reject escaping paths for zero-element dense tensors, +// which previously returned early before path validation ran. +TEST_F(PathValidationTest, SparseTensorExternalDataPathTraversalBlocked_ZeroDenseElements) { + auto model_dir = base_dir_ / "model_dir"; + std::error_code ec; + std::filesystem::create_directories(model_dir, ec); + ASSERT_FALSE(ec) << "Failed to create model_dir: " << ec.message(); + + // Create the escaping file so that a "file not found" error would NOT be raised. + auto secret_file = base_dir_ / "secret.bin"; + { + std::ofstream ofs(secret_file, std::ios::binary); + ASSERT_TRUE(ofs.is_open()) << "Failed to open " << secret_file; + ofs.put('\0'); + ASSERT_TRUE(ofs.good()) << "Failed to write to " << secret_file; + } + + ONNX_NAMESPACE::SparseTensorProto sparse; + sparse.add_dims(0); // dense shape [0] → dense_elements == 0 + + auto* values = sparse.mutable_values(); + values->set_name("zero_dense_test"); + values->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + values->add_dims(0); // NNZ=0 + values->set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL); + + auto* loc = values->add_external_data(); + loc->set_key("location"); + loc->set_value("../secret.bin"); // path traversal + + auto* len_entry = values->add_external_data(); + len_entry->set_key("length"); + len_entry->set_value("0"); + + auto* indices = sparse.mutable_indices(); + indices->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + indices->add_dims(0); + + ONNX_NAMESPACE::TensorProto dense; + std::filesystem::path model_path = model_dir / "model.onnx"; + Status status = utils::SparseTensorProtoToDenseTensorProto(sparse, model_path, dense); + ASSERT_FALSE(status.IsOK()) << "Should reject path-traversal in values even when dense_elements == 0."; + EXPECT_THAT(status.ErrorMessage(), ::testing::HasSubstr("escapes")); +} + +// Regression test: validation must reject escaping paths in indices even when NNZ == 0. +TEST_F(PathValidationTest, SparseTensorExternalDataPathTraversalBlocked_ZeroNNZ) { + auto model_dir = base_dir_ / "model_dir"; + std::error_code ec; + std::filesystem::create_directories(model_dir, ec); + ASSERT_FALSE(ec) << "Failed to create model_dir: " << ec.message(); + + // Create the escaping file so that a "file not found" error would NOT be raised. + auto secret_file = base_dir_ / "indices_secret.bin"; + { + std::ofstream ofs(secret_file, std::ios::binary); + ASSERT_TRUE(ofs.is_open()) << "Failed to open " << secret_file; + ofs.put('\0'); + ASSERT_TRUE(ofs.good()) << "Failed to write to " << secret_file; + } + + ONNX_NAMESPACE::SparseTensorProto sparse; + sparse.add_dims(4); // dense shape [4] → non-zero dense_elements + + auto* values = sparse.mutable_values(); + values->set_name("zero_nnz_test"); + values->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + values->add_dims(0); // NNZ=0 + + auto* indices = sparse.mutable_indices(); + indices->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + indices->add_dims(0); + indices->set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL); + + auto* idx_loc = indices->add_external_data(); + idx_loc->set_key("location"); + idx_loc->set_value("../indices_secret.bin"); // path traversal + + auto* idx_len = indices->add_external_data(); + idx_len->set_key("length"); + idx_len->set_value("0"); + + ONNX_NAMESPACE::TensorProto dense; + std::filesystem::path model_path = model_dir / "model.onnx"; + Status status = utils::SparseTensorProtoToDenseTensorProto(sparse, model_path, dense); + ASSERT_FALSE(status.IsOK()) << "Should reject path-traversal in indices even when NNZ == 0."; + EXPECT_THAT(status.ErrorMessage(), ::testing::HasSubstr("escapes")); +} + +#endif // !defined(DISABLE_SPARSE_TENSORS) + TEST(TensorProtoUtilsTest, GetNodeProtoLayeringAnnotation) { // Case 1: Annotation exists { From b38731bf4dcf2406063b7f4ecb223048d62e3449 Mon Sep 17 00:00:00 2001 From: qiurui144 <39214303+qiurui144@users.noreply.github.com> Date: Wed, 20 May 2026 03:42:52 +0800 Subject: [PATCH 13/13] [MLAS] test: accept canonical NaN in activation NaN round-trip check (#28538) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Description `MlasActivationTest.ExecuteShort` (`test_activation.cpp`) feeds NaN inputs through `MlasActivation` and asserts the output matches the expected value bit-for-bit. This change adds one accepted case: when the expected value is a NaN, any NaN output passes. Non-NaN comparisons are unchanged — a finite output where a NaN is expected (or the reverse) still fails. Test-only change, no library behavior impact. Verified: `onnxruntime_mlas_test --gtest_filter=Activation.ShortExecute` on SpacemiT K3 (riscv64, RVV VLEN=256), rv-gcc 15.2 — FAILED before, PASSED after (re-run x3). x86/x64 behavior unaffected. ### Motivation and Context The bit-exact assertion (`Buffer[i].u == TestData[i][kind].u`) implicitly assumes the input NaN payload survives the activation. For kinds evaluated by floating-point arithmetic — LeakyRelu (`alpha * x`), HardSigmoid (`alpha * x + beta`) — that only holds on ISAs that propagate NaN payloads (x86, ARM). IEEE-754 does not require NaN payload propagation. RISC-V's `F` extension mandates that any FP operation producing a NaN yields the canonical quiet NaN (`0x7fc00000` for f32), discarding the payload. So on riscv64 these kinds emit `0x7fc00000` for a NaN input — a correct "NaN in → NaN out" result whose bit pattern simply differs from the input — and the bit-exact check fails. Accepting any NaN where a NaN is expected restores the test to the portable IEEE-754 **contract.** Signed-off-by: qiurui144 --- onnxruntime/test/mlas/unittest/test_activation.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/onnxruntime/test/mlas/unittest/test_activation.cpp b/onnxruntime/test/mlas/unittest/test_activation.cpp index a4334c6c80477..73d18d8a7dc38 100644 --- a/onnxruntime/test/mlas/unittest/test_activation.cpp +++ b/onnxruntime/test/mlas/unittest/test_activation.cpp @@ -247,7 +247,8 @@ class MlasActivationTest : public MlasTestBase { for (unsigned i = 0; i < _countof(TestData); i++) { // Sensitive to comparing positive/negative zero and NaNs. float error = std::min(std::fabs((Buffer[i].f - TestData[i][kind].f) / TestData[i][kind].f), std::fabs(Buffer[i].f - TestData[i][kind].f)); - EXPECT_TRUE(Buffer[i].u == TestData[i][kind].u || Buffer[i].f == TestData[i][kind].f || error < 0.000001f) + EXPECT_TRUE(Buffer[i].u == TestData[i][kind].u || Buffer[i].f == TestData[i][kind].f || error < 0.000001f || + (std::isnan(Buffer[i].f) && std::isnan(TestData[i][kind].f))) << ", Scalar Activation Kind:" << (int)kind << ", i=" << i << ", value:" << std::setw(8) << std::setfill('0') << std::hex << Buffer[i].u << ", expecting:" << std::setw(8) << std::setfill('0') << std::hex << TestData[i][kind].u;