From 4f620c38a402f37e98b0f115ab7ae139a205b5c7 Mon Sep 17 00:00:00 2001
From: Luis Rios <lhrios@gmail.com>
Date: Mon, 18 May 2026 03:51:00 -0700
Subject: [PATCH 1/4] fix: avoid vector copies in CheckIfSubtreesAreEqual
 (#27854)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`indices` is built once and then only read during recursive calls to
`CheckIfSubtreesAreEqual`. However, it was passed by value, causing a
full copy on every recursive call. It is now passed by `const&`.

## Data from the profiler:
To collect the following data, a model with a single
TreeEnsembleClassifier node (5000 trees and 3.3 million nodes) was
used. The loading time dropped from 18 minutes to about 4 seconds.

### After
<img width="1793" height="547" alt="Screenshot 2026-03-25 at 6 40 25 PM"
src="https://github.com/user-attachments/assets/d7c00335-8246-4bd1-9e4d-b0e956d48cdd"
/>


### Before
<img width="1763" height="548" alt="Screenshot 2026-03-25 at 6 40 40 PM"
src="https://github.com/user-attachments/assets/35683112-2919-4031-955c-922937f2df8f"
/>
---
 onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h b/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h
index 8ed9a40097d4b..2530a1f73f81a 100644
--- a/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h
+++ b/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h
@@ -115,7 +115,7 @@ class TreeEnsembleCommon : public TreeEnsembleCommonAttributes {
                                const InlinedVector<size_t>& truenode_ids, const InlinedVector<size_t>& falsenode_ids, gsl::span<const int64_t> nodes_featureids,
                                gsl::span<const ThresholdType> nodes_values_as_tensor, gsl::span<const float> node_values,
                                gsl::span<const float> target_class_weights, gsl::span<const ThresholdType> target_class_weights_as_tensor,
-                               const InlinedVector<TreeNodeElementId>& node_tree_ids, InlinedVector<std::pair<TreeNodeElementId, uint32_t>> indices);
+                               const InlinedVector<TreeNodeElementId>& node_tree_ids, const InlinedVector<std::pair<TreeNodeElementId, uint32_t>>& indices);
   size_t AddNodes(const size_t i, const InlinedVector<NODE_MODE_ONNX>& cmodes, const InlinedVector<size_t>& truenode_ids,
                   const InlinedVector<size_t>& falsenode_ids, gsl::span<const int64_t> nodes_featureids,
                   gsl::span<const ThresholdType> nodes_values_as_tensor, gsl::span<const float> node_values,
@@ -383,7 +383,7 @@ bool TreeEnsembleCommon<InputType, ThresholdType, OutputType>::CheckIfSubtreesAr
     const InlinedVector<size_t>& truenode_ids, const InlinedVector<size_t>& falsenode_ids, gsl::span<const int64_t> nodes_featureids,
     gsl::span<const ThresholdType> nodes_values_as_tensor, gsl::span<const float> node_values,
     gsl::span<const float> target_class_weights, gsl::span<const ThresholdType> target_class_weights_as_tensor,
-    const InlinedVector<TreeNodeElementId>& node_tree_ids, InlinedVector<std::pair<TreeNodeElementId, uint32_t>> indices) {
+    const InlinedVector<TreeNodeElementId>& node_tree_ids, const InlinedVector<std::pair<TreeNodeElementId, uint32_t>>& indices) {
   if (left_id == right_id) {
     return true;
   }

From aa416f5cec6ab3ed2fc269fc3e07812ab3fa0307 Mon Sep 17 00:00:00 2001
From: Jiajia Qin <jiajiaqin@microsoft.com>
Date: Mon, 18 May 2026 23:26:19 +0800
Subject: [PATCH 2/4] webgpu: Generalize FlashAttention prefill shared-memory
 path (#28520)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

- Remove the `Subgroups` feature requirement from
`CanApplyFlashAttention`, enabling flash attention on devices without
subgroup support
- Generalize the Apple-specific shared-memory prefill path into a
`use_shm_path` flag that activates for Apple, NVIDIA, or any device
lacking subgroups
- Replace `is_apple` shader parameter with `use_shm_path` throughout the
WGSL template

## Motivation

Two issues exist on the current main branch:

1. **NVIDIA prefill produces incorrect results (regression from
#28511):** PR #28511 increased `max_k_step` to 32 for NVIDIA in C++, but
the shader's subgroup-based path only has `qk_1..qk_4` (16 hardcoded key
indices). When `sg_size=32` (e.g. RTX 5080), the loop steps by 32 but
only computes QK for keys 0-15, silently skipping keys 16-31. This
produces incorrect attention output for models like phi4.

2. **Flash attention prefill unavailable without Subgroups:**
`CanApplyFlashAttention` gates on
`context.HasFeature(wgpu::FeatureName::Subgroups)`, forcing devices
without subgroup support to fall back to the slower split-reduce
2-kernel path for prefill, even though the Apple shared-memory path in
the shader is fully subgroup-free.

This PR fixes both issues by routing Apple, NVIDIA, and no-subgroup
devices through the loop-based shared-memory path (`use_shm_path`),
which naturally handles any `max_k_step` value via `array<q_element_t,
max_k_step>` and loop iteration — no hardcoded key count.

## Test plan

- [x] Built ORT with WebGPU EP on Windows (Release, VS 2022)
- [x] Deployed and ran phi4-graph-prune model: output verified correct
("1+1 equals 2.")
- [x] Lint check passed (`lintrunner -a`)
---
 onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc   | 9 +++++----
 onnxruntime/contrib_ops/webgpu/bert/flash_attention.h    | 9 +++++----
 .../webgpu/bert/flash_attention.wgsl.template            | 6 +++---
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
index f1391ba1e3528..8217a07448266 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
+++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
@@ -212,7 +212,6 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const {
   return WGSL_TEMPLATE_APPLY(shader, "bert/flash_attention.wgsl.template",
                              WGSL_TEMPLATE_PARAMETER(has_attention_bias, has_attention_bias_),
                              WGSL_TEMPLATE_PARAMETER(has_head_sink, has_head_sink_),
-                             WGSL_TEMPLATE_PARAMETER(is_apple, is_apple_),
                              WGSL_TEMPLATE_PARAMETER(is_fp16, is_fp16_),
                              WGSL_TEMPLATE_PARAMETER(is_qualcomm, is_qualcomm_),
                              WGSL_TEMPLATE_PARAMETER(is_unidirectional, is_unidirectional_),
@@ -221,7 +220,8 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const {
                              WGSL_TEMPLATE_PARAMETER(q_BNSH, q_BNSH_),
                              WGSL_TEMPLATE_PARAMETER(qkv_head_size, qkv_head_size_),
                              WGSL_TEMPLATE_PARAMETER(qkv_num_heads, qkv_num_heads_),
-                             WGSL_TEMPLATE_PARAMETER(use_seqlen_k, use_seqlen_k_));
+                             WGSL_TEMPLATE_PARAMETER(use_seqlen_k, use_seqlen_k_),
+                             WGSL_TEMPLATE_PARAMETER(use_shm_path, use_shm_path_));
 }
 
 Status FlashAttentionDecodeQKTProgram::GenerateShaderCode(ShaderHelper& shader) const {
@@ -486,6 +486,7 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co
     bool is_qualcomm = context.AdapterInfo().vendor == std::string_view{"qualcomm"};
     bool is_nvidia = context.AdapterInfo().vendor == std::string_view{"nvidia"};
     bool is_apple = context.AdapterInfo().vendor == std::string_view{"apple"};
+    bool has_subgroups = context.HasFeature(wgpu::FeatureName::Subgroups);
     bool is_fp16 = (Q->GetElementType() == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16);
     bool q_BNSH = parameters.qkv_format_ == Q_K_V_BNSH;
     bool has_head_sink = head_sink != nullptr;
@@ -498,6 +499,7 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co
                                   parameters.is_unidirectional_,
                                   is_nvidia,
                                   is_apple,
+                                  has_subgroups,
                                   q_BNSH,
                                   use_seqlen_k,
                                   has_head_sink};
@@ -532,7 +534,7 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co
 
     program.SetDispatchGroupSize(parameters.batch_size_ * parameters.num_heads_ * num_seq_tile)
         .SetWorkgroupSize(prefill_tile_size)
-        .CacheHint(has_attention_bias, parameters.head_size_, parameters.num_heads_, parameters.is_unidirectional_, is_qualcomm, is_nvidia, is_apple, q_BNSH, use_seqlen_k, has_head_sink, program.max_k_step())
+        .CacheHint(has_attention_bias, parameters.head_size_, parameters.num_heads_, parameters.is_unidirectional_, is_qualcomm, is_nvidia, is_apple, has_subgroups, q_BNSH, use_seqlen_k, has_head_sink, program.max_k_step())
         .AddUniformVariables({{static_cast<uint32_t>(parameters.sequence_length_)},
                               {static_cast<uint32_t>(parameters.total_sequence_length_)},
                               {static_cast<uint32_t>(present_sequence_length)},
@@ -584,7 +586,6 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co
 bool CanApplyFlashAttention(const WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context) {
   return !parameters.is_packed_qkv_ &&
          parameters.head_size_ == parameters.v_head_size_ &&
-         context.HasFeature(wgpu::FeatureName::Subgroups) &&
          ((context.AdapterInfo().vendor == std::string_view{"qualcomm"} && parameters.head_size_ % 8 == 0) || parameters.head_size_ % 4 == 0);
 }
 
diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h
index 27fa56e333874..e75b6378f67c6 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h
+++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h
@@ -77,6 +77,7 @@ class FlashAttentionProgram final : public Program<FlashAttentionProgram> {
                         bool is_unidirectional,
                         bool is_nvidia,
                         bool is_apple,
+                        bool has_subgroups,
                         bool q_BNSH,
                         bool use_seqlen_k = false,
                         bool has_head_sink = false)
@@ -88,12 +89,12 @@ class FlashAttentionProgram final : public Program<FlashAttentionProgram> {
         qkv_num_heads_(qkv_num_heads),
         is_unidirectional_(is_unidirectional),
         is_nvidia_(is_nvidia),
-        is_apple_(is_apple),
+        use_shm_path_(is_apple || is_nvidia || !has_subgroups),
         q_BNSH_(q_BNSH),
         use_seqlen_k_(use_seqlen_k),
         has_head_sink_(has_head_sink) {
-    if (is_apple || is_nvidia) {
-      // On Apple and NVIDIA, use an optimized loop-based path with dynamic max_k_step.
+    if (use_shm_path_) {
+      // Use shared-memory loop-based path with dynamic max_k_step.
       // Compute max_k_step from workgroup shared memory budget: k_tile + v_tile = 2 * element_size * head_size * max_k_step
       const int element_size = is_fp16 ? 2 : 4;
       constexpr int kMinWorkgroupStorageBudgetBytes = 16384;
@@ -130,7 +131,7 @@ class FlashAttentionProgram final : public Program<FlashAttentionProgram> {
   int qkv_num_heads_;
   bool is_unidirectional_;
   bool is_nvidia_;
-  bool is_apple_;
+  bool use_shm_path_;
   bool q_BNSH_;
   bool use_seqlen_k_;
   bool has_head_sink_;
diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.wgsl.template b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.wgsl.template
index db41ac12ce268..6b620043413e3 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.wgsl.template
+++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.wgsl.template
@@ -1,7 +1,6 @@
 
 #param has_attention_bias
 #param has_head_sink
-#param is_apple
 #param is_fp16
 #param is_qualcomm
 #param is_unidirectional
@@ -10,6 +9,7 @@
 #param qkv_head_size
 #param qkv_num_heads
 #param use_seqlen_k
+#param use_shm_path
 #param max_k_step_param
 
 const head_size : u32 = qkv_head_size;
@@ -61,7 +61,7 @@ fn loadq(batch_idx : u32, q_idx_global : u32, head_idx : u32, alpha : q_element_
   }
 }
 
-#if is_apple
+#if use_shm_path
 
 var<private> qk_scores : array<q_element_t, max_k_step>;
 
@@ -240,7 +240,7 @@ $MAIN {
   let seq_causal_length = total_sequence_length;
 #endif
 
-#if is_apple
+#if use_shm_path
 
   for (var k_start = 0u; k_start < loop_bound; k_start += max_k_step) {
     workgroupBarrier();

From 770da7dd224b7d8ca2720863b609c5fe22f06222 Mon Sep 17 00:00:00 2001
From: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Date: Mon, 18 May 2026 09:14:54 -0700
Subject: [PATCH 3/4] [WebGPU plugin EP] Package improvements (#28525)

### Description
<!-- Describe your changes. -->

- Add copyright headers to source files
- Enrich Python and NuGet package metadata
- Add ORT license files to packages
- Clean up readme files

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->

WebGPU plugin EP packaging improvements.

Note: Similar updates can be considered for the CUDA plugin EP, but this
PR is scoped to just the WebGPU EP for ease of cherry-picking into the
WebGPU plugin EP release branch.
---
 plugin-ep-webgpu/_packaging_utils.py          |  1 +
 .../Microsoft.ML.OnnxRuntime.EP.WebGpu.csproj |  6 +++-
 .../WebGpuEp.cs                               |  3 ++
 plugin-ep-webgpu/csharp/README.md             | 23 --------------
 plugin-ep-webgpu/csharp/pack_nuget.py         | 21 +++++++++++++
 .../csharp/test/WebGpuEpNuGetTest/Program.cs  |  3 ++
 plugin-ep-webgpu/python/README.md             |  7 ++---
 plugin-ep-webgpu/python/build_wheel.py        | 13 ++++++++
 .../python/onnxruntime_ep_webgpu/__init__.py  |  3 ++
 plugin-ep-webgpu/python/pyproject.toml.in     | 31 +++++++++++++++++--
 .../python/requirements-build-wheel.txt       |  2 +-
 plugin-ep-webgpu/python/setup.py              |  3 ++
 .../python/test/test_webgpu_plugin_ep.py      |  4 +++
 13 files changed, 88 insertions(+), 32 deletions(-)

diff --git a/plugin-ep-webgpu/_packaging_utils.py b/plugin-ep-webgpu/_packaging_utils.py
index 201b3342ff39c..84850e4dee5fe 100644
--- a/plugin-ep-webgpu/_packaging_utils.py
+++ b/plugin-ep-webgpu/_packaging_utils.py
@@ -1,5 +1,6 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
+
 """Shared utilities for the WebGPU plugin EP packaging scripts. Not a public API."""
 
 from __future__ import annotations
diff --git a/plugin-ep-webgpu/csharp/Microsoft.ML.OnnxRuntime.EP.WebGpu/Microsoft.ML.OnnxRuntime.EP.WebGpu.csproj b/plugin-ep-webgpu/csharp/Microsoft.ML.OnnxRuntime.EP.WebGpu/Microsoft.ML.OnnxRuntime.EP.WebGpu.csproj
index 58860c46b9c16..5bfbac0308e01 100644
--- a/plugin-ep-webgpu/csharp/Microsoft.ML.OnnxRuntime.EP.WebGpu/Microsoft.ML.OnnxRuntime.EP.WebGpu.csproj
+++ b/plugin-ep-webgpu/csharp/Microsoft.ML.OnnxRuntime.EP.WebGpu/Microsoft.ML.OnnxRuntime.EP.WebGpu.csproj
@@ -16,7 +16,8 @@
     <PackageTags>ONNX;ONNX Runtime;Machine Learning;AI;Deep Learning;WebGPU</PackageTags>
 
     <!-- License/Repository -->
-    <PackageLicenseExpression>MIT</PackageLicenseExpression>
+    <PackageLicenseFile>LICENSE</PackageLicenseFile>
+    <PackageProjectUrl>https://onnxruntime.ai</PackageProjectUrl>
     <RepositoryUrl>https://github.com/microsoft/onnxruntime</RepositoryUrl>
     <RepositoryType>git</RepositoryType>
     <Copyright>© Microsoft Corporation. All rights reserved.</Copyright>
@@ -29,6 +30,9 @@
   <ItemGroup>
     <!-- Ensure README is included in the package -->
     <None Include="README.md" Pack="true" PackagePath="" />
+    <!-- LICENSE and ThirdPartyNotices.txt are staged here from the repo root by pack_nuget.py. -->
+    <None Include="LICENSE" Pack="true" PackagePath="" Visible="false" />
+    <None Include="ThirdPartyNotices.txt" Pack="true" PackagePath="" Visible="false" />
   </ItemGroup>
 
   <!-- Native binaries per platform. Each ItemGroup is conditioned on the runtimes/ directory
diff --git a/plugin-ep-webgpu/csharp/Microsoft.ML.OnnxRuntime.EP.WebGpu/WebGpuEp.cs b/plugin-ep-webgpu/csharp/Microsoft.ML.OnnxRuntime.EP.WebGpu/WebGpuEp.cs
index 2a5ec106aad0d..6e797494500e6 100644
--- a/plugin-ep-webgpu/csharp/Microsoft.ML.OnnxRuntime.EP.WebGpu/WebGpuEp.cs
+++ b/plugin-ep-webgpu/csharp/Microsoft.ML.OnnxRuntime.EP.WebGpu/WebGpuEp.cs
@@ -1,3 +1,6 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
 using System;
 using System.IO;
 using System.Runtime.InteropServices;
diff --git a/plugin-ep-webgpu/csharp/README.md b/plugin-ep-webgpu/csharp/README.md
index 7a2b2041e364f..0cac847a30837 100644
--- a/plugin-ep-webgpu/csharp/README.md
+++ b/plugin-ep-webgpu/csharp/README.md
@@ -61,29 +61,6 @@ python pack_nuget.py --version 0.1.0-dev `
 The package version is supplied to `pack_nuget.py` via `--version`. In the packaging pipeline, the release or
 pre-release version is derived from [`plugin-ep-webgpu/VERSION_NUMBER`](../VERSION_NUMBER).
 
-## Inspecting the Package
-
-The `.nupkg` is a ZIP file. To verify its contents:
-
-```powershell
-Expand-Archive nuget_output/Microsoft.ML.OnnxRuntime.EP.WebGpu.0.1.0-dev.nupkg `
-  -DestinationPath nuget_output/inspect -Force
-
-Get-ChildItem nuget_output/inspect -Recurse | Select-Object FullName
-```
-
-Expected layout inside the package:
-
-```
-lib/netstandard2.0/Microsoft.ML.OnnxRuntime.EP.WebGpu.dll
-runtimes/win-x64/native/onnxruntime_providers_webgpu.dll
-runtimes/win-x64/native/dxil.dll
-runtimes/win-x64/native/dxcompiler.dll
-runtimes/win-arm64/native/...
-runtimes/linux-x64/native/libonnxruntime_providers_webgpu.so
-runtimes/osx-arm64/native/libonnxruntime_providers_webgpu.dylib
-```
-
 ## Testing the Package
 
 The test app registers the WebGPU EP, creates a session, runs a simple Mul model, and validates the output.
diff --git a/plugin-ep-webgpu/csharp/pack_nuget.py b/plugin-ep-webgpu/csharp/pack_nuget.py
index b1ce61c0480e2..cf45768e2f0c7 100644
--- a/plugin-ep-webgpu/csharp/pack_nuget.py
+++ b/plugin-ep-webgpu/csharp/pack_nuget.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python3
+
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
+
 """Build the Microsoft.ML.OnnxRuntime.EP.WebGpu NuGet package.
 
 Stages native binaries from build artifacts into the runtimes/ layout expected
@@ -44,6 +46,11 @@
 PROJECT_DIR = SCRIPT_DIR / "Microsoft.ML.OnnxRuntime.EP.WebGpu"
 CSPROJ = PROJECT_DIR / "Microsoft.ML.OnnxRuntime.EP.WebGpu.csproj"
 MIN_ORT_VERSION_FILE = SCRIPT_DIR.parent / "MIN_ONNXRUNTIME_VERSION"
+REPO_ROOT = SCRIPT_DIR.parents[1]
+
+# License-related files to bundle into the .nupkg. Sourced from the repo root and copied into
+# the staging directory so the staged csproj can reference them by simple relative paths.
+LICENSE_FILES: tuple[Path, ...] = (REPO_ROOT / "LICENSE", REPO_ROOT / "ThirdPartyNotices.txt")
 
 # Import the shared template helper from _packaging_utils.py in the parent directory.
 sys.path.insert(0, str(SCRIPT_DIR.parent))
@@ -144,6 +151,19 @@ def stage_sources(staging_dir: Path) -> None:
     )
 
 
+def stage_license_files(staging_dir: Path) -> None:
+    """Copy LICENSE and ThirdPartyNotices.txt from the repo root into the staging directory.
+
+    The staged csproj references these via <None Include="LICENSE" .../> and
+    <None Include="ThirdPartyNotices.txt" .../> so they are bundled into the .nupkg at its root.
+    """
+    for src in LICENSE_FILES:
+        if not src.is_file():
+            raise PackError(f"expected license file not found: {src}")
+        shutil.copy2(src, staging_dir / src.name)
+        print(f"Staged {src.name}")
+
+
 def resolve_platform_source(
     name: str,
     binary_dir_override: Path | None,
@@ -297,6 +317,7 @@ def run_in_staging(args: argparse.Namespace, staging_dir: Path, min_ort_version_
         print(f"Reusing existing staging directory: {staging_dir}")
     else:
         stage_sources(staging_dir)
+        stage_license_files(staging_dir)
         stage_binaries(staging_dir, args, required_platforms)
         min_ort_version = min_ort_version_file.read_text(encoding="utf-8").strip()
         if not min_ort_version:
diff --git a/plugin-ep-webgpu/csharp/test/WebGpuEpNuGetTest/Program.cs b/plugin-ep-webgpu/csharp/test/WebGpuEpNuGetTest/Program.cs
index f5d1f0628c831..2001a99be5c43 100644
--- a/plugin-ep-webgpu/csharp/test/WebGpuEpNuGetTest/Program.cs
+++ b/plugin-ep-webgpu/csharp/test/WebGpuEpNuGetTest/Program.cs
@@ -1,3 +1,6 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
 using Microsoft.ML.OnnxRuntime;
 using Microsoft.ML.OnnxRuntime.EP.WebGpu;
 
diff --git a/plugin-ep-webgpu/python/README.md b/plugin-ep-webgpu/python/README.md
index 849105a439396..4767f6ccbea48 100644
--- a/plugin-ep-webgpu/python/README.md
+++ b/plugin-ep-webgpu/python/README.md
@@ -37,14 +37,11 @@ Install the wheel and dependencies in a clean environment, then run the smoke te
 ```bash
 python -m venv test_venv
 source test_venv/bin/activate  # or test_venv\Scripts\Activate.ps1 on Windows
-pip install onnx numpy
-pip install dist/onnxruntime_ep_webgpu-*.whl  # pulls in the minimum compatible onnxruntime
+pip install onnx numpy onnxruntime
+pip install dist/onnxruntime_ep_webgpu-*.whl
 python test/test_webgpu_plugin_ep.py
 ```
 
-The wheel declares a runtime dependency on the minimum compatible `onnxruntime` package, so pip will install (or
-verify) a compatible core runtime automatically.
-
 The test validates import, EP registration, device discovery, and inference (requires WebGPU-capable hardware for the
 inference portion). Set the environment variable `ORT_TEST_VERBOSE=1` to print additional diagnostic information
 (environment, available providers, discovered devices, etc.).
diff --git a/plugin-ep-webgpu/python/build_wheel.py b/plugin-ep-webgpu/python/build_wheel.py
index 6f19b88838bf9..bc361d30f9fda 100644
--- a/plugin-ep-webgpu/python/build_wheel.py
+++ b/plugin-ep-webgpu/python/build_wheel.py
@@ -1,4 +1,8 @@
 #!/usr/bin/env python3
+
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
 """Build a wheel for the onnxruntime-ep-webgpu package.
 
 Combines pre-built plugin EP binaries with the Python package source to produce
@@ -52,6 +56,15 @@ def prepare_staging_dir(staging_dir: Path, binary_dir: Path, version: str):
     shutil.copy2(SCRIPT_DIR / "setup.py", staging_dir / "setup.py")
     shutil.copytree(SCRIPT_DIR / "onnxruntime_ep_webgpu", staging_dir / "onnxruntime_ep_webgpu")
 
+    # Stage the repo-root LICENSE and ThirdPartyNotices.txt next to setup.py so setuptools
+    # can bundle them via the `license-files` entry in pyproject.toml (PEP 639).
+    repo_root = SCRIPT_DIR.parents[1]
+    for license_filename in ("LICENSE", "ThirdPartyNotices.txt"):
+        src = repo_root / license_filename
+        if not src.is_file():
+            raise FileNotFoundError(f"Expected license file not found: {src}")
+        shutil.copy2(src, staging_dir / license_filename)
+
     # Copy plugin binaries into the package directory
     # Note: The binaries are assumed to be directly under `binary_dir`.
     package_dir = staging_dir / "onnxruntime_ep_webgpu"
diff --git a/plugin-ep-webgpu/python/onnxruntime_ep_webgpu/__init__.py b/plugin-ep-webgpu/python/onnxruntime_ep_webgpu/__init__.py
index 284269eb0356a..136d780c3fc0a 100644
--- a/plugin-ep-webgpu/python/onnxruntime_ep_webgpu/__init__.py
+++ b/plugin-ep-webgpu/python/onnxruntime_ep_webgpu/__init__.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
 """ONNX Runtime WebGPU Plugin Execution Provider Python Package.
 
 Provides helper functions to locate the plugin EP shared library and
diff --git a/plugin-ep-webgpu/python/pyproject.toml.in b/plugin-ep-webgpu/python/pyproject.toml.in
index 83ce01f38d1c8..077843eafc5bf 100644
--- a/plugin-ep-webgpu/python/pyproject.toml.in
+++ b/plugin-ep-webgpu/python/pyproject.toml.in
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=68.0", "wheel"]
+requires = ["setuptools>=77.0", "wheel"]
 build-backend = "setuptools.build_meta"
 
 [project]
@@ -7,8 +7,35 @@ name = "onnxruntime-ep-webgpu"
 version = "@version@"
 description = "ONNX Runtime WebGPU Plugin Execution Provider"
 readme = "onnxruntime_ep_webgpu/README.md"
-license = {text = "MIT"}
 requires-python = ">=3.11"
+license = "MIT"
+license-files = ["LICENSE", "ThirdPartyNotices.txt"]
+authors = [
+    { name = "Microsoft Corporation", email = "onnxruntime@microsoft.com" },
+]
+keywords = ["onnx", "machine learning", "webgpu", "execution provider"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "Operating System :: POSIX :: Linux",
+    "Operating System :: Microsoft :: Windows",
+    "Operating System :: MacOS",
+    "Topic :: Scientific/Engineering",
+    "Topic :: Scientific/Engineering :: Mathematics",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Software Development",
+    "Topic :: Software Development :: Libraries",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3 :: Only",
+]
+
+[project.urls]
+Homepage = "https://onnxruntime.ai"
+Source = "https://github.com/microsoft/onnxruntime"
+Issues = "https://github.com/microsoft/onnxruntime/issues"
+Download = "https://github.com/microsoft/onnxruntime/tags"
 
 [tool.setuptools.packages.find]
 include = ["onnxruntime_ep_webgpu*"]
diff --git a/plugin-ep-webgpu/python/requirements-build-wheel.txt b/plugin-ep-webgpu/python/requirements-build-wheel.txt
index 3a91de5aa06f2..5bc4fa50549f0 100644
--- a/plugin-ep-webgpu/python/requirements-build-wheel.txt
+++ b/plugin-ep-webgpu/python/requirements-build-wheel.txt
@@ -1,4 +1,4 @@
-setuptools>=70.1
+setuptools>=77.0
 wheel
 # Linux-only (auditwheel + patchelf are needed for manylinux compliance)
 auditwheel; sys_platform == "linux"
diff --git a/plugin-ep-webgpu/python/setup.py b/plugin-ep-webgpu/python/setup.py
index ca55ba04701bd..bd5b8104b01c9 100644
--- a/plugin-ep-webgpu/python/setup.py
+++ b/plugin-ep-webgpu/python/setup.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
 """Minimal setup.py to produce a platform-specific wheel.
 
 The package contains pre-built native libraries (not CPython extension modules),
diff --git a/plugin-ep-webgpu/python/test/test_webgpu_plugin_ep.py b/plugin-ep-webgpu/python/test/test_webgpu_plugin_ep.py
index 9f3d23230dce4..87d72826d83bb 100644
--- a/plugin-ep-webgpu/python/test/test_webgpu_plugin_ep.py
+++ b/plugin-ep-webgpu/python/test/test_webgpu_plugin_ep.py
@@ -1,4 +1,8 @@
 #!/usr/bin/env python3
+
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
 """Smoke test for the onnxruntime-ep-webgpu Python package.
 
 Tests:

From 22e537b870be9a25c6e7c812daa3f59d015c211d Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Mon, 18 May 2026 13:07:00 -0700
Subject: [PATCH 4/4] Reject recursive local function definitions during model
 load (#28187)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Description

Detect and reject recursive cycles in model local function definitions
during model loading, preventing stack overflow from unbounded recursion
during function inlining.

### Changes

**Call-graph construction and cycle detection** (`model_helpers.cc`,
`model_helpers.h`)
- `BuildLocalFunctionCallGraph()` builds an adjacency-list call graph
from model local functions using iterative subgraph traversal (no
recursion, safe against deeply nested subgraph attributes).
- `ValidateCallGraphAcyclic()` performs iterative DFS cycle detection.
Uses `find()` throughout (no `operator[]`) to prevent accidental map
insertions.
- `ValidateModelLocalFunctionAcyclic()` convenience wrapper.
- On cycle detection, returns a descriptive error showing the full cycle
path (e.g., `"local:first -> local:second -> local:first"`).

**Integration** (`model.cc`)
- Applied in both `Model` constructors that process local functions.

**Test coverage** (`function_test.cc`)

Integration tests (full model load):
- `RejectsSelfRecursiveLocalFunction` — function calls itself
- `RejectsMutuallyRecursiveLocalFunctions` — A→B→A cycle
- `RejectsRecursionThroughSubgraph` — recursion via subgraph attribute
(e.g., inside If node)
- `RejectsLongerCycle` — A→B→C→A cycle, verifies cycle path reports all
participants
- `RejectsMultipleIndependentCycles` — two disjoint cycles in one model
- `AcceptsAcyclicDiamond` — diamond shape (A→B, A→C, B→D, C→D), no false
positive
- `AcceptsTrivialSingleNodeFunction` — single-Identity-node function
passes validation

Unit tests (call graph validation directly):
- `CallGraphAcyclic_EmptyGraph` — empty graph
- `CallGraphAcyclic_SingleNodeNoCalls` — single function, no callees
- `CallGraphAcyclic_SelfCycle` — self-loop
- `CallGraphAcyclic_MutualCycle` — A↔B
- `CallGraphAcyclic_LongerCycle` — A→B→C→A
- `CallGraphAcyclic_DiamondNoCycle` — diamond, no false positive
- `CallGraphAcyclic_DeepChainNoCycle` — long acyclic chain
- `CallGraphAcyclic_MultipleIndependentCycles` — two independent cycles
- `CallGraphAcyclic_SharedCallsDiamondNoCycle` — shared callees, no
false positive

### Motivation

A malicious or malformed ONNX model with recursive local function
definitions would cause the runtime to recurse until stack overflow
during function inlining. This check fails model loading early with a
clear error message.

### Testing

- Incremental build succeeds
- All new integration and unit tests pass
---
 onnxruntime/core/graph/model.cc             |  11 +
 onnxruntime/core/graph/model.h              |   2 +-
 onnxruntime/core/graph/model_helpers.cc     | 189 ++++++++++
 onnxruntime/core/graph/model_helpers.h      |  41 ++
 onnxruntime/test/framework/function_test.cc | 397 ++++++++++++++++++++
 5 files changed, 639 insertions(+), 1 deletion(-)
 create mode 100644 onnxruntime/core/graph/model_helpers.cc
 create mode 100644 onnxruntime/core/graph/model_helpers.h

diff --git a/onnxruntime/core/graph/model.cc b/onnxruntime/core/graph/model.cc
index fd4d266dc51f0..bfa25a5cb2e9a 100644
--- a/onnxruntime/core/graph/model.cc
+++ b/onnxruntime/core/graph/model.cc
@@ -1,13 +1,20 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include <algorithm>
 #include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "core/common/inlined_containers.h"
 #include "core/common/logging/logging.h"
 #include "core/flatbuffers/schema/ort.fbs.h"
 #include "core/flatbuffers/flatbuffers_utils.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/graph/model.h"
 #include "core/graph/model_editor_api_types.h"
+#include "core/graph/model_helpers.h"
 #include "core/graph/model_load_utils.h"
 
 #ifdef _MSC_VER
@@ -129,6 +136,8 @@ Model::Model(const std::string& graph_name,
                                             func_ptr);
   }
 
+  ORT_THROW_IF_ERROR(ValidateModelLocalFunctionAcyclic(model_local_functions_));
+
   model_local_function_templates_maps_.reserve(model_proto_.functions().size());
   for (auto& func : model_proto_.functions()) {
     auto func_schema_ptr = function_utils::CreateSchema(func.domain(),
@@ -261,6 +270,8 @@ Model::Model(ModelProto&& model_proto, const PathString& model_path,
     model_local_functions_.insert_or_assign(function_utils::GetFunctionIdentifier(func.domain(), func.name(), func.overload()), &func);
   }
 
+  ORT_THROW_IF_ERROR(ValidateModelLocalFunctionAcyclic(model_local_functions_));
+
   model_local_function_templates_maps_.reserve(model_proto_.functions().size());
   for (auto& func : model_proto_.functions()) {
     auto func_schema_ptr = function_utils::CreateSchema(func.domain(),
diff --git a/onnxruntime/core/graph/model.h b/onnxruntime/core/graph/model.h
index c86aac44806bd..9a877ac6bba95 100644
--- a/onnxruntime/core/graph/model.h
+++ b/onnxruntime/core/graph/model.h
@@ -343,7 +343,7 @@ class Model {
   // map from function id to pointer of model local function proto
   // FunctionProto is hosted in ModelProto.
   // this map will be used for the local functions' schema's type/shape inference.
-  // This container is used by ONNX code and must be an std::unordered_map.
+  // Must be std::unordered_map to match ONNX_NAMESPACE::shape_inference::ModelLocalFunctionsMap.
   std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*> model_local_functions_;
   // this is the map from function id to the local function template.
   // this map will be used by graph to instantiate the function body.
diff --git a/onnxruntime/core/graph/model_helpers.cc b/onnxruntime/core/graph/model_helpers.cc
new file mode 100644
index 0000000000000..c3214d488ff0d
--- /dev/null
+++ b/onnxruntime/core/graph/model_helpers.cc
@@ -0,0 +1,189 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#if !defined(ORT_MINIMAL_BUILD)
+
+#include "core/graph/model_helpers.h"
+
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "core/graph/function_utils.h"
+#include "core/graph/onnx_protobuf.h"
+
+namespace onnxruntime {
+
+namespace {
+
+// Gathers, without recursion, every model-local function invoked by `nodes` or by any
+// node nested inside their graph-valued attributes. An explicit work list replaces
+// recursion so that maliciously deep subgraph nesting cannot exhaust the call stack.
+template <typename NodeRange>
+void CollectLocalFunctionCalls(
+    const NodeRange& nodes,
+    const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_local_functions,
+    InlinedHashSet<std::string_view>& seen_calls,
+    InlinedVector<std::string_view>& called_functions) {
+  InlinedVector<const ONNX_NAMESPACE::GraphProto*> graph_worklist;
+
+  const auto scan_nodes = [&](const auto& range) {
+    for (const auto& current : range) {
+      const auto callee_id = function_utils::GetFunctionIdentifier(
+          current.domain(), current.op_type(), current.overload());
+      const auto match = model_local_functions.find(callee_id);
+      if (match != model_local_functions.end()) {
+        // View the map's own key rather than the local `callee_id`, which dies each iteration.
+        const std::string_view stable_id = match->first;
+        if (seen_calls.insert(stable_id).second) {
+          called_functions.push_back(stable_id);
+        }
+      }
+
+      for (const auto& attribute : current.attribute()) {
+        if (attribute.has_g()) {
+          graph_worklist.push_back(&attribute.g());
+        }
+        for (const auto& nested : attribute.graphs()) {
+          graph_worklist.push_back(&nested);
+        }
+      }
+    }
+  };
+
+  scan_nodes(nodes);
+  // Drain the work list; scanning a graph may enqueue further nested graphs.
+  while (!graph_worklist.empty()) {
+    const ONNX_NAMESPACE::GraphProto* next_graph = graph_worklist.back();
+    graph_worklist.pop_back();
+    scan_nodes(next_graph->node());
+  }
+}
+
+}  // namespace
+
+Status BuildLocalFunctionCallGraph(
+    const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_local_functions,
+    LocalFunctionCallGraph& call_graph) {
+  call_graph.reserve(model_local_functions.size());  // one adjacency list per local function
+  for (const auto& entry : model_local_functions) {
+    const std::string& caller_id = entry.first;
+    const ONNX_NAMESPACE::FunctionProto* caller_proto = entry.second;
+    if (caller_proto == nullptr) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "Null function proto for function id: ", caller_id);
+    }
+    // Fresh containers per caller: `dedup` only keeps `direct_callees` free of repeats.
+    InlinedHashSet<std::string_view> dedup;
+    InlinedVector<std::string_view> direct_callees;
+    CollectLocalFunctionCalls(caller_proto->node(), model_local_functions, dedup, direct_callees);
+    call_graph.emplace(std::string_view(caller_id), std::move(direct_callees));
+  }
+
+  return Status::OK();
+}
+
+Status ValidateCallGraphAcyclic(const LocalFunctionCallGraph& call_graph) {
+  enum class VisitState { kNotVisited,  // not reached by the search yet
+                          kVisiting,    // currently on the DFS stack
+                          kVisited };   // popped: all of its callees have been processed
+
+  InlinedHashMap<std::string_view, VisitState> visit_states;
+  visit_states.reserve(call_graph.size());
+  for (const auto& [function_id, _] : call_graph) {
+    ORT_UNUSED_PARAMETER(_);
+    visit_states.emplace(function_id, VisitState::kNotVisited);
+  }
+
+  // Entry of the explicit DFS stack (the search is iterative, not recursive): the function being
+  // visited, a pointer to its callee list inside call_graph (no copy), and the next callee to try.
+  struct DfsFrame {
+    std::string_view function_id;
+    const InlinedVector<std::string_view>* callees;
+    size_t next_callee_index;
+  };
+
+  std::vector<DfsFrame> dfs_stack;
+
+  for (const auto& [root_id, root_callees] : call_graph) {  // start a search from each unfinished function
+    auto root_state_it = visit_states.find(root_id);
+    if (root_state_it == visit_states.end() || root_state_it->second == VisitState::kVisited) {
+      continue;
+    }
+
+    root_state_it->second = VisitState::kVisiting;
+    dfs_stack.push_back({root_id, &root_callees, 0});
+
+    while (!dfs_stack.empty()) {
+      auto& frame = dfs_stack.back();  // re-fetched every iteration; the push_back below may reallocate
+
+      if (frame.next_callee_index >= frame.callees->size()) {
+        // All callees processed — mark as fully visited and pop.
+        auto it = visit_states.find(frame.function_id);
+        ORT_ENFORCE(it != visit_states.end());
+        it->second = VisitState::kVisited;
+        dfs_stack.pop_back();
+        continue;
+      }
+
+      std::string_view callee_id = (*frame.callees)[frame.next_callee_index];
+      frame.next_callee_index++;  // advance first: `frame` must not be touched after a push onto dfs_stack
+
+      auto callee_state_it = visit_states.find(callee_id);
+      if (callee_state_it == visit_states.end()) {
+        // Callee is not a key of call_graph, so there are no outgoing edges to follow — skip.
+        continue;
+      }
+
+      if (callee_state_it->second == VisitState::kVisited) {
+        continue;  // already fully explored via an earlier path
+      }
+
+      if (callee_state_it->second == VisitState::kVisiting) {
+        // Edge back to a function still on the stack: a cycle. Describe it from that frame onward.
+        std::string cycle;
+        bool in_cycle = false;
+        for (const auto& f : dfs_stack) {
+          if (f.function_id == callee_id) {
+            in_cycle = true;
+          }
+          if (in_cycle) {
+            if (!cycle.empty()) {
+              cycle.append(" -> ");
+            }
+            cycle.append(f.function_id);
+          }
+        }
+        cycle.append(" -> ");
+        cycle.append(callee_id);  // repeat the callee so the printed path visibly closes the loop
+
+        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_GRAPH,
+                               "Model local function definitions must not be recursive. Cycle detected: ", cycle);
+      }
+
+      // First visit: look up the callee's own callee list, mark it in progress and push it.
+      auto callee_graph_it = call_graph.find(callee_id);
+      if (callee_graph_it == call_graph.end()) {
+        continue;
+      }
+
+      callee_state_it->second = VisitState::kVisiting;
+      dfs_stack.push_back({callee_id, &callee_graph_it->second, 0});
+    }
+  }
+
+  return Status::OK();
+}
+
+Status ValidateModelLocalFunctionAcyclic(
+    const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_local_functions) {
+  LocalFunctionCallGraph adjacency;  // function id -> local functions called from its body
+  ORT_RETURN_IF_ERROR(BuildLocalFunctionCallGraph(model_local_functions, adjacency));
+  return ValidateCallGraphAcyclic(adjacency);
+}
+
+}  // namespace onnxruntime
+
+#endif  // !defined(ORT_MINIMAL_BUILD)
diff --git a/onnxruntime/core/graph/model_helpers.h b/onnxruntime/core/graph/model_helpers.h
new file mode 100644
index 0000000000000..777f2ac611c15
--- /dev/null
+++ b/onnxruntime/core/graph/model_helpers.h
@@ -0,0 +1,41 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#if !defined(ORT_MINIMAL_BUILD)
+
+#include <string>
+#include <string_view>
+#include <unordered_map>
+
+#include "core/common/common.h"
+#include "core/common/inlined_containers.h"
+
+namespace ONNX_NAMESPACE {
+class FunctionProto;
+}
+
+namespace onnxruntime {
+
+/// Adjacency list of a local function call graph: maps a function id to the ids of its callees.
+/// Keys and values are non-owning string_views; the viewed strings must outlive this structure.
+using LocalFunctionCallGraph = InlinedHashMap<std::string_view, InlinedVector<std::string_view>>;
+
+/// Fill @p call_graph with one entry per model local function (INVALID_ARGUMENT on a null proto).
+/// The string views stored in @p call_graph point into the keys of @p model_local_functions.
+Status BuildLocalFunctionCallGraph(
+    const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_local_functions,
+    LocalFunctionCallGraph& call_graph);
+
+/// Validate that a call graph contains no cycles; callees that are not keys of the graph are ignored.
+/// Returns an INVALID_GRAPH error describing the cycle path if a cycle is detected.
+Status ValidateCallGraphAcyclic(const LocalFunctionCallGraph& call_graph);
+
+/// Convenience: build the call graph from model local functions and validate acyclicity.
+Status ValidateModelLocalFunctionAcyclic(
+    const std::unordered_map<std::string, const ONNX_NAMESPACE::FunctionProto*>& model_local_functions);
+
+}  // namespace onnxruntime
+
+#endif  // !defined(ORT_MINIMAL_BUILD)
diff --git a/onnxruntime/test/framework/function_test.cc b/onnxruntime/test/framework/function_test.cc
index 93f2ea704a729..ee3b0a6ec2133 100644
--- a/onnxruntime/test/framework/function_test.cc
+++ b/onnxruntime/test/framework/function_test.cc
@@ -1,15 +1,20 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+#include <sstream>
+
 #include "core/graph/onnx_protobuf.h"
+#include "onnx/checker.h"
 #include "onnx/defs/parser.h"
 
 #include "core/common/span_utils.h"
 #include "core/framework/customregistry.h"
 #include "core/framework/op_kernel.h"
 #include "core/graph/model.h"
+#include "core/graph/model_helpers.h"
 #include "core/providers/cpu/cpu_execution_provider.h"
 #include "core/session/inference_session.h"
 
@@ -87,6 +92,34 @@ static void Check(const char* source,
   }
 }
 
+static Status LoadModel(const char* source) {
+  // Parse the textual model; input left over after the model counts as a failure.
+  ONNX_NAMESPACE::OnnxParser text_parser(source);
+  ONNX_NAMESPACE::ModelProto proto;
+  if (const auto parsed = text_parser.Parse(proto); !parsed.IsOK()) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to parse test model: ", parsed.ErrorMessage());
+  }
+  if (!text_parser.EndOfInput()) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Extra unparsed input unexpected.");
+  }
+  // Run the ONNX checker and report a std::exception thrown by it as a failed Status.
+  try {
+    ONNX_NAMESPACE::checker::check_model(proto);
+  } catch (const std::exception& check_error) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "ONNX model check failed: ", check_error.what());
+  }
+  // Serialize the proto and feed the bytes to a fresh session; Load()'s Status is the result.
+  std::string model_bytes;
+  if (!proto.SerializeToString(&model_bytes) || model_bytes.empty()) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to serialize test model.");
+  }
+
+  SessionOptions options;
+  InferenceSession session{options, GetEnvironment()};
+  std::istringstream model_stream(model_bytes);
+  return session.Load(model_stream);
+}
+
 namespace {
 const char* basic_code = R"(
         <
@@ -303,6 +336,370 @@ TEST(FunctionTest, CallInConditional) {
   Check(code, "x", {1.0, 2.0, 3.0}, "y", {6.0, 12.0, 18.0});
 }
 
+TEST(FunctionTest, RejectsSelfRecursiveLocalFunction) {
+  const char* model_source = R"(
+        <
+        ir_version: 8,
+        opset_import: [ "" : 16, "local" : 1 ]
+        >
+        agraph (float[N] x) => (float[N] y)
+        {
+            y = local.self_recursive (x)
+        }
+
+        <
+        opset_import: [ "" : 16, "local" : 1 ],
+        domain: "local"
+        >
+        self_recursive (lx) => (ly) {
+            ly = local.self_recursive (lx)
+        }
+        )";
+  // The only node of the function body calls the function itself, so loading must be refused.
+  const Status load_status = LoadModel(model_source);
+  ASSERT_FALSE(load_status.IsOK());
+  EXPECT_THAT(load_status.ErrorMessage(), testing::HasSubstr("must not be recursive"));
+}
+
+TEST(FunctionTest, RejectsMutuallyRecursiveLocalFunctions) {
+  const char* model_source = R"(
+        <
+        ir_version: 8,
+        opset_import: [ "" : 16, "local" : 1 ]
+        >
+        agraph (float[N] x) => (float[N] y)
+        {
+            y = local.first (x)
+        }
+
+        <
+        opset_import: [ "" : 16, "local" : 1 ],
+        domain: "local"
+        >
+        first (lx) => (ly) {
+            ly = local.second (lx)
+        }
+
+        <
+        opset_import: [ "" : 16, "local" : 1 ],
+        domain: "local"
+        >
+        second (lx) => (ly) {
+            ly = local.first (lx)
+        }
+        )";
+  // `first` calls `second` and `second` calls `first`: a two-function cycle.
+  const Status load_status = LoadModel(model_source);
+  ASSERT_FALSE(load_status.IsOK());
+  EXPECT_THAT(load_status.ErrorMessage(), testing::HasSubstr("must not be recursive"));
+}
+
+TEST(FunctionTest, RejectsRecursionThroughSubgraph) {
+  // The self-call is not a direct node of the function body: it sits in the then_branch of an If.
+  const char* model_source = R"(
+        <
+        ir_version: 8,
+        opset_import: [ "" : 16, "local" : 1 ]
+        >
+        agraph (float[N] x) => (float[N] y)
+        {
+            y = local.recursive_if (x)
+        }
+
+        <
+        opset_import: [ "" : 16, "local" : 1 ],
+        domain: "local"
+        >
+        recursive_if (lx) => (ly) {
+            temp = Identity (lx)
+            cond = Constant <value = bool {1}> ()
+            ly = If (cond) <
+                then_branch = then_graph () => (float[N] then_out)
+                {
+                    then_out = local.recursive_if (temp)
+                },
+                else_branch = else_graph () => (float[N] else_out)
+                {
+                    else_out = Identity (temp)
+                }
+                >
+        }
+        )";
+
+  const Status load_status = LoadModel(model_source);
+  ASSERT_FALSE(load_status.IsOK());
+  EXPECT_THAT(load_status.ErrorMessage(), testing::HasSubstr("must not be recursive"));
+}
+
+// --- Synthetic adjacency-list tests for ValidateCallGraphAcyclic ---
+// These test the cycle detection algorithm directly without constructing ONNX models.
+
+TEST(FunctionTest, CallGraphAcyclic_EmptyGraph) {
+  onnxruntime::LocalFunctionCallGraph no_functions;  // no entries at all
+  ASSERT_STATUS_OK(onnxruntime::ValidateCallGraphAcyclic(no_functions));
+}
+
+TEST(FunctionTest, CallGraphAcyclic_SingleNodeNoCalls) {
+  // One function whose callee list is empty: there is no edge, hence no cycle.
+  std::string fn_a("A");
+  onnxruntime::LocalFunctionCallGraph graph;
+  graph[fn_a] = {};
+  ASSERT_STATUS_OK(onnxruntime::ValidateCallGraphAcyclic(graph));
+}
+
+TEST(FunctionTest, CallGraphAcyclic_SelfCycle) {
+  std::string fn_a("A");
+  onnxruntime::LocalFunctionCallGraph graph;
+  graph[fn_a] = {fn_a};  // A lists itself as a callee
+  const auto result = onnxruntime::ValidateCallGraphAcyclic(graph);
+  ASSERT_FALSE(result.IsOK());
+  EXPECT_THAT(result.ErrorMessage(), testing::HasSubstr("must not be recursive"));
+  EXPECT_THAT(result.ErrorMessage(), testing::HasSubstr("A -> A"));
+}
+
+TEST(FunctionTest, CallGraphAcyclic_MutualCycle) {
+  std::string fn_a("A"), fn_b("B");
+  onnxruntime::LocalFunctionCallGraph graph;
+  graph[fn_a] = {fn_b};  // A -> B
+  graph[fn_b] = {fn_a};  // B -> A closes the loop
+  const auto result = onnxruntime::ValidateCallGraphAcyclic(graph);
+  ASSERT_FALSE(result.IsOK());
+  EXPECT_THAT(result.ErrorMessage(), testing::HasSubstr("must not be recursive"));
+}
+
+TEST(FunctionTest, CallGraphAcyclic_LongerCycle) {
+  // A -> B -> C -> A
+  std::string a = "A", b = "B", c = "C";
+  onnxruntime::LocalFunctionCallGraph call_graph;
+  call_graph[a] = {b};
+  call_graph[b] = {c};
+  call_graph[c] = {a};
+  const auto status = onnxruntime::ValidateCallGraphAcyclic(call_graph);
+  ASSERT_FALSE(status.IsOK());
+  EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("must not be recursive"));
+  // Match the whole path: a bare "C" is already in "Cycle detected". The DFS root is not fixed, so accept any rotation.
+  EXPECT_THAT(status.ErrorMessage(), testing::AnyOf(testing::HasSubstr("A -> B -> C -> A"),
+                                                    testing::HasSubstr("B -> C -> A -> B"),
+                                                    testing::HasSubstr("C -> A -> B -> C")));
+}
+
+TEST(FunctionTest, CallGraphAcyclic_DiamondNoCycle) {
+  // Two distinct paths (via B and via C) lead from A to D; converging paths are not a cycle.
+  std::string fn_a("A"), fn_b("B"), fn_c("C"), fn_d("D");
+  onnxruntime::LocalFunctionCallGraph graph;
+  graph[fn_a] = {fn_b, fn_c};
+  graph[fn_b] = {fn_d};
+  graph[fn_c] = {fn_d};
+  graph[fn_d] = {};
+  ASSERT_STATUS_OK(onnxruntime::ValidateCallGraphAcyclic(graph));
+}
+
+TEST(FunctionTest, CallGraphAcyclic_DeepChainNoCycle) {
+  // Straight chain A -> B -> C -> D that ends in a function without callees.
+  std::string fn_a("A"), fn_b("B"), fn_c("C"), fn_d("D");
+  onnxruntime::LocalFunctionCallGraph graph;
+  graph[fn_a] = {fn_b};
+  graph[fn_b] = {fn_c};
+  graph[fn_c] = {fn_d};
+  graph[fn_d] = {};
+  ASSERT_STATUS_OK(onnxruntime::ValidateCallGraphAcyclic(graph));
+}
+
+TEST(FunctionTest, CallGraphAcyclic_MultipleIndependentCycles) {
+  // Disconnected components {A, B} and {C, D}, each of them a two-function cycle.
+  std::string fn_a("A"), fn_b("B"), fn_c("C"), fn_d("D");
+  onnxruntime::LocalFunctionCallGraph graph;
+  graph[fn_a] = {fn_b};
+  graph[fn_b] = {fn_a};
+  graph[fn_c] = {fn_d};
+  graph[fn_d] = {fn_c};
+  const auto result = onnxruntime::ValidateCallGraphAcyclic(graph);
+  ASSERT_FALSE(result.IsOK());
+  EXPECT_THAT(result.ErrorMessage(), testing::HasSubstr("must not be recursive"));
+}
+
+TEST(FunctionTest, CallGraphAcyclic_SharedCallsDiamondNoCycle) {
+  // Regression guard: C is reachable from E through both A and B, and C itself calls D.
+  // Reaching C (and D) a second time through another path must not be reported as a cycle.
+  std::string fn_a("A"), fn_b("B"), fn_c("C"), fn_d("D"), fn_e("E");
+  onnxruntime::LocalFunctionCallGraph graph;
+  graph[fn_e] = {fn_a, fn_b};
+  graph[fn_a] = {fn_c};
+  graph[fn_b] = {fn_c};
+  graph[fn_c] = {fn_d};
+  graph[fn_d] = {};
+  ASSERT_STATUS_OK(onnxruntime::ValidateCallGraphAcyclic(graph));
+}
+
+// --- Model-level integration tests ---
+
+TEST(FunctionTest, RejectsLongerCycle) {
+  // Cycle spanning three local functions: func_a -> func_b -> func_c -> func_a.
+  const char* model_source = R"(
+        <
+        ir_version: 8,
+        opset_import: [ "" : 16, "local" : 1 ]
+        >
+        agraph (float[N] x) => (float[N] y)
+        {
+            y = local.func_a (x)
+        }
+
+        <
+        opset_import: [ "" : 16, "local" : 1 ],
+        domain: "local"
+        >
+        func_a (lx) => (ly) {
+            ly = local.func_b (lx)
+        }
+
+        <
+        opset_import: [ "" : 16, "local" : 1 ],
+        domain: "local"
+        >
+        func_b (lx) => (ly) {
+            ly = local.func_c (lx)
+        }
+
+        <
+        opset_import: [ "" : 16, "local" : 1 ],
+        domain: "local"
+        >
+        func_c (lx) => (ly) {
+            ly = local.func_a (lx)
+        }
+        )";
+
+  const Status load_status = LoadModel(model_source);
+  ASSERT_FALSE(load_status.IsOK());
+  EXPECT_THAT(load_status.ErrorMessage(), testing::HasSubstr("must not be recursive"));
+}
+
+TEST(FunctionTest, AcceptsAcyclicDiamond) {
+  // func_a calls func_b and func_c, and both of those call func_d: shared callee, no cycle.
+  const char* model_source = R"(
+        <
+        ir_version: 8,
+        opset_import: [ "" : 16, "local" : 1 ]
+        >
+        agraph (float[N] x) => (float[N] y)
+        {
+            y = local.func_a (x)
+        }
+
+        <
+        opset_import: [ "" : 16, "local" : 1 ],
+        domain: "local"
+        >
+        func_a (lx) => (ly) {
+            t1 = local.func_b (lx)
+            ly = local.func_c (t1)
+        }
+
+        <
+        opset_import: [ "" : 16, "local" : 1 ],
+        domain: "local"
+        >
+        func_b (lx) => (ly) {
+            ly = local.func_d (lx)
+        }
+
+        <
+        opset_import: [ "" : 16, "local" : 1 ],
+        domain: "local"
+        >
+        func_c (lx) => (ly) {
+            ly = local.func_d (lx)
+        }
+
+        <
+        opset_import: [ "" : 16 ],
+        domain: "local"
+        >
+        func_d (lx) => (ly) {
+            ly = Identity (lx)
+        }
+        )";
+  const Status load_status = LoadModel(model_source);
+  ASSERT_STATUS_OK(load_status);
+}
+
+TEST(FunctionTest, AcceptsTrivialSingleNodeFunction) {
+  // The local function body is a lone Identity node, so it calls no local function;
+  // such a trivial (but non-empty) body has to pass the acyclicity validation.
+  const char* model_source = R"(
+        <
+        ir_version: 8,
+        opset_import: [ "" : 16, "local" : 1 ]
+        >
+        agraph (float[N] x) => (float[N] y)
+        {
+            y = local.trivial_func (x)
+        }
+
+        <
+        opset_import: [ "" : 16 ],
+        domain: "local"
+        >
+        trivial_func (lx) => (ly) {
+            ly = Identity (lx)
+        }
+        )";
+  const Status load_status = LoadModel(model_source);
+  ASSERT_STATUS_OK(load_status);
+}
+
+TEST(FunctionTest, RejectsMultipleIndependentCycles) {
+  // One model holding two unrelated cycles: func_a <-> func_b and func_c <-> func_d.
+  const char* model_source = R"(
+        <
+        ir_version: 8,
+        opset_import: [ "" : 16, "local" : 1 ]
+        >
+        agraph (float[N] x) => (float[N] y)
+        {
+            t = local.func_a (x)
+            y = local.func_c (t)
+        }
+
+        <
+        opset_import: [ "" : 16, "local" : 1 ],
+        domain: "local"
+        >
+        func_a (lx) => (ly) {
+            ly = local.func_b (lx)
+        }
+
+        <
+        opset_import: [ "" : 16, "local" : 1 ],
+        domain: "local"
+        >
+        func_b (lx) => (ly) {
+            ly = local.func_a (lx)
+        }
+
+        <
+        opset_import: [ "" : 16, "local" : 1 ],
+        domain: "local"
+        >
+        func_c (lx) => (ly) {
+            ly = local.func_d (lx)
+        }
+
+        <
+        opset_import: [ "" : 16, "local" : 1 ],
+        domain: "local"
+        >
+        func_d (lx) => (ly) {
+            ly = local.func_c (lx)
+        }
+        )";
+
+  const Status load_status = LoadModel(model_source);
+  ASSERT_FALSE(load_status.IsOK());
+  EXPECT_THAT(load_status.ErrorMessage(), testing::HasSubstr("must not be recursive"));
+}
+
 // Test use of attibute references, especially where source/target attribute
 // names are not the same. In this example, the "start : int = @s" attribute-reference
 // binds the attribute named "start" of the Shape op to the attribute named "s"