diff --git a/dotnet/w365-computer-use/W365ComputerUseSample.sln b/dotnet/w365-computer-use/W365ComputerUseSample.sln new file mode 100644 index 00000000..67fe015d --- /dev/null +++ b/dotnet/w365-computer-use/W365ComputerUseSample.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.14.36623.8 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "W365ComputerUseSample", "sample-agent\W365ComputerUseSample.csproj", "{B72D1A3E-4F8C-9E56-A1B2-C3D4E5F60789}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {B72D1A3E-4F8C-9E56-A1B2-C3D4E5F60789}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {B72D1A3E-4F8C-9E56-A1B2-C3D4E5F60789}.Debug|Any CPU.Build.0 = Debug|Any CPU + {B72D1A3E-4F8C-9E56-A1B2-C3D4E5F60789}.Release|Any CPU.ActiveCfg = Release|Any CPU + {B72D1A3E-4F8C-9E56-A1B2-C3D4E5F60789}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {D4E5F6A7-B8C9-0D1E-2F3A-4B5C6D7E8F90} + EndGlobalSection +EndGlobal diff --git a/dotnet/w365-computer-use/sample-agent/.gitignore b/dotnet/w365-computer-use/sample-agent/.gitignore new file mode 100644 index 00000000..7c608371 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/.gitignore @@ -0,0 +1,8 @@ +appsettings.Development.json +appsettings.Production.json +Screenshots/ +a365.config.json +a365.generated.config.json +app.zip +publish/ +.vscode/.env diff --git a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs new file mode 100644 index 00000000..e8c3139f --- /dev/null +++ 
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

using System.Net.Http.Headers;
using W365ComputerUseSample.ComputerUse;
using W365ComputerUseSample.Telemetry;
using Microsoft.Agents.A365.Observability.Caching;
using Microsoft.Agents.A365.Runtime.Utils;
using Microsoft.Agents.A365.Tooling.Extensions.AgentFramework.Services;
using Microsoft.Agents.Builder;
using Microsoft.Agents.Builder.App;
using Microsoft.Agents.Builder.State;
using Microsoft.Agents.Core;
using Microsoft.Agents.Core.Models;
using Microsoft.Extensions.AI;
using ModelContextProtocol.Client;

namespace W365ComputerUseSample.Agent;

/// <summary>
/// Agent application that greets users, handles install/uninstall notifications and routes
/// each incoming message either to a plain function-tool run or to the Computer-Use (CUA)
/// loop, depending on what the orchestrator's intent classifier returns.
/// </summary>
public class MyAgent : AgentApplication
{
    private const string AgentWelcomeMessage = "Hello! I can help you perform tasks on a Windows 365 Cloud PC. Tell me what you'd like to do.";
    private const string AgentHireMessage = "Thank you for hiring me! I can control a Windows desktop to accomplish tasks for you.";
    private const string AgentFarewellMessage = "Thank you for your time, I enjoyed working with you.";

    /// <summary>URL fragment that marks an MCP server URL as the W365 Computer-Use server.</summary>
    private const string W365ServerPathMarker = "/mcp_W365ComputerUse";

    private readonly IExporterTokenCache<AgenticTokenStruct>? _agentTokenCache;
    private readonly ILogger<MyAgent> _logger;
    private readonly IMcpToolRegistrationService _toolService;
    private readonly ComputerUseOrchestrator _orchestrator;
    private readonly string[] _mcpServerUrls;

    /// <summary>
    /// Subset of <see cref="_mcpServerUrls"/> whose path names the W365 Computer-Use server.
    /// Loaded only when the intent classifier determines CUA is required — otherwise the
    /// tools/list call on this URL triggers ATG's hostname discovery and acquires a Cloud PC
    /// session (10-30s). Match is by URL substring; relies on ATG's path convention of
    /// keeping the server name in the path.
    /// </summary>
    private readonly string[] _w365McpServerUrls;

    /// <summary>
    /// All non-W365 MCP server URLs (mail, calendar, etc.). Loaded eagerly — these don't
    /// acquire a W365 session.
    /// </summary>
    private readonly string[] _otherMcpServerUrls;

    private readonly string? _agenticAuthHandlerName;
    private readonly string? _oboAuthHandlerName;

    /// <summary>
    /// Check if a bearer token is available in the environment for development/testing.
    /// </summary>
    /// <param name="bearerToken">Value of the <c>BEARER_TOKEN</c> environment variable, if any.</param>
    /// <returns><c>true</c> when <c>BEARER_TOKEN</c> is set to a non-empty value.</returns>
    public static bool TryGetBearerTokenForDevelopment(out string? bearerToken)
    {
        bearerToken = Environment.GetEnvironmentVariable("BEARER_TOKEN");
        return !string.IsNullOrEmpty(bearerToken);
    }

    /// <summary>
    /// Checks if graceful fallback is enabled when MCP tools fail to load.
    /// Only allowed in Development + SKIP_TOOLING_ON_ERRORS=true.
    /// </summary>
    private static bool ShouldSkipToolingOnErrors()
    {
        var skipToolingOnErrors = Environment.GetEnvironmentVariable("SKIP_TOOLING_ON_ERRORS");

        return IsDevelopment() &&
               string.Equals(skipToolingOnErrors, "true", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Reads MCP server and auth-handler configuration and registers the activity handlers.
    /// </summary>
    /// <param name="options">Agent application options forwarded to the base class.</param>
    /// <param name="configuration">Source of <c>McpServers</c>, <c>McpServer:Url</c> and the auth-handler names.</param>
    /// <param name="agentTokenCache">Token cache handed to the observability wrapper on each message.</param>
    /// <param name="toolService">Tooling-gateway service used outside the Development bearer-token path.</param>
    /// <param name="orchestrator">Intent classifier, MCP connection cache and CUA loop.</param>
    /// <param name="logger">Logger for this agent.</param>
    public MyAgent(
        AgentApplicationOptions options,
        IConfiguration configuration,
        IExporterTokenCache<AgenticTokenStruct> agentTokenCache,
        IMcpToolRegistrationService toolService,
        ComputerUseOrchestrator orchestrator,
        ILogger<MyAgent> logger) : base(options)
    {
        _agentTokenCache = agentTokenCache;
        _logger = logger;
        _toolService = toolService;
        _orchestrator = orchestrator;

        // Support multiple MCP server URLs; fall back to single McpServer:Url for backward compat
        _mcpServerUrls = configuration.GetSection("McpServers").Get<string[]>() ?? [];
        if (_mcpServerUrls.Length == 0)
        {
            var singleUrl = configuration["McpServer:Url"];
            if (!string.IsNullOrEmpty(singleUrl))
            {
                _mcpServerUrls = [singleUrl];
            }
        }

        // Split into W365 vs other servers by URL path — W365 load is deferred until the
        // intent classifier decides CUA is needed. Avoids paying the 10-30s session
        // acquisition cost on chit-chat / mail-only messages.
        _w365McpServerUrls = _mcpServerUrls.Where(IsW365ServerUrl).ToArray();
        _otherMcpServerUrls = _mcpServerUrls.Where(url => !IsW365ServerUrl(url)).ToArray();

        _agenticAuthHandlerName = configuration.GetValue<string>("AgentApplication:AgenticAuthHandlerName");
        _oboAuthHandlerName = configuration.GetValue<string>("AgentApplication:OboAuthHandlerName");

        // Greet when members are added
        OnConversationUpdate(ConversationUpdateEvents.MembersAdded, WelcomeMessageAsync);

        // Compute auth handler arrays once
        string[] agenticHandlers = !string.IsNullOrEmpty(_agenticAuthHandlerName) ? [_agenticAuthHandlerName] : [];
        string[] oboHandlers = !string.IsNullOrEmpty(_oboAuthHandlerName) ? [_oboAuthHandlerName] : [];

        // Handle install/uninstall
        OnActivity(ActivityTypes.InstallationUpdate, OnInstallationUpdateAsync, isAgenticOnly: true, autoSignInHandlers: agenticHandlers);
        OnActivity(ActivityTypes.InstallationUpdate, OnInstallationUpdateAsync, isAgenticOnly: false);

        // Handle messages — MUST BE AFTER any other message handlers
        OnActivity(ActivityTypes.Message, OnMessageAsync, isAgenticOnly: true, autoSignInHandlers: agenticHandlers);
        OnActivity(ActivityTypes.Message, OnMessageAsync, isAgenticOnly: false, autoSignInHandlers: oboHandlers);

        static bool IsW365ServerUrl(string url) =>
            url.Contains(W365ServerPathMarker, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Sends the welcome message once for every added member other than the agent itself.
    /// </summary>
    protected async Task WelcomeMessageAsync(ITurnContext turnContext, ITurnState turnState, CancellationToken cancellationToken)
    {
        await AgentMetrics.InvokeObservedAgentOperation(
            "WelcomeMessage",
            turnContext,
            async () =>
            {
                // Either collection/recipient may be absent on a malformed update; treat that as "nobody to greet".
                var recipientId = turnContext.Activity.Recipient?.Id;
                foreach (ChannelAccount member in turnContext.Activity.MembersAdded ?? [])
                {
                    if (member.Id != recipientId)
                    {
                        await turnContext.SendActivityAsync(MessageFactory.Text(AgentWelcomeMessage), cancellationToken);
                    }
                }
            });
    }

    /// <summary>
    /// Logs the installation update and replies with the hire message on add or the
    /// farewell message on remove. Other actions are logged only.
    /// </summary>
    protected async Task OnInstallationUpdateAsync(ITurnContext turnContext, ITurnState turnState, CancellationToken cancellationToken)
    {
        await AgentMetrics.InvokeObservedAgentOperation(
            "InstallationUpdate",
            turnContext,
            async () =>
            {
                var action = turnContext.Activity.Action;
                _logger.LogInformation(
                    "InstallationUpdate received — Action: '{Action}', DisplayName: '{Name}', UserId: '{Id}'",
                    action ?? "(none)",
                    turnContext.Activity.From?.Name ?? "(unknown)",
                    turnContext.Activity.From?.Id ?? "(unknown)");

                if (action == InstallationUpdateActionTypes.Add)
                {
                    await turnContext.SendActivityAsync(MessageFactory.Text(AgentHireMessage), cancellationToken);
                }
                else if (action == InstallationUpdateActionTypes.Remove)
                {
                    await turnContext.SendActivityAsync(MessageFactory.Text(AgentFarewellMessage), cancellationToken);
                }
            });
    }

    /// <summary>
    /// Handles a user message: classifies intent, then either runs the orchestrator with only
    /// non-W365 tools (fast path) or acknowledges, loads all tools and runs the CUA loop.
    /// The streaming response is always ended, even when the turn fails.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="turnContext"/> is null.</exception>
    protected async Task OnMessageAsync(ITurnContext turnContext, ITurnState turnState, CancellationToken cancellationToken)
    {
        ArgumentNullException.ThrowIfNull(turnContext);

        var fromAccount = turnContext.Activity.From;
        _logger.LogDebug(
            "Turn received from user — DisplayName: '{Name}', UserId: '{Id}', AadObjectId: '{AadObjectId}'",
            fromAccount?.Name ?? "(unknown)",
            fromAccount?.Id ?? "(unknown)",
            fromAccount?.AadObjectId ?? "(none)");

        // Select auth handler based on request type; observability and tooling share it.
        var authHandlerName = turnContext.IsAgenticRequest() ? _agenticAuthHandlerName : _oboAuthHandlerName;

        await A365OtelWrapper.InvokeObservedAgentOperation(
            "MessageProcessor",
            turnContext,
            turnState,
            _agentTokenCache,
            UserAuthorization,
            authHandlerName ?? string.Empty,
            _logger,
            async () =>
            {
                // Single typing indicator. A background refresh loop was removed because it
                // raced with the main reply path and triggered Kestrel request-body
                // "Reading is already in progress" → ObjectDisposedException crashes post-response.
                // Informative updates via onStatusUpdate keep the UI feedback flowing.
                await turnContext.SendActivityAsync(Activity.CreateTypingActivity(), cancellationToken).ConfigureAwait(false);

                try
                {
                    var userText = turnContext.Activity.Text?.Trim() ?? string.Empty;
                    var conversationId = turnContext.Activity.Conversation?.Id ?? Guid.NewGuid().ToString();

                    // Step 1: classify intent with a cheap tool-less LLM call. If the message
                    // doesn't need desktop control ("hi", "summarize my inbox", etc.) we skip
                    // W365 tool loading entirely so ATG never acquires a Cloud PC session.
                    var needsCua = await _orchestrator.ClassifyNeedsCuaAsync(userText, cancellationToken);

                    if (!needsCua)
                    {
                        // Non-CUA fast path: load only non-W365 tools, run orchestrator with the
                        // computer tool withheld. Supports function-tool paths (mail/calendar/etc.)
                        // without touching W365.
                        var (_, nonCuaAdditionalTools, _) = await GetToolsAsync(turnContext, authHandlerName, includeW365: false);
                        var directResponse = await _orchestrator.RunAsync(
                            conversationId,
                            userText,
                            w365Tools: [],
                            additionalTools: nonCuaAdditionalTools,
                            mcpClient: null,
                            graphAccessToken: null,
                            onStatusUpdate: status => turnContext.StreamingResponse.QueueInformativeUpdateAsync(status).ConfigureAwait(false),
                            onCuaStarting: null,
                            onFolderLinkReady: null,
                            includeCuaTool: false,
                            cancellationToken: cancellationToken);
                        turnContext.StreamingResponse.QueueTextChunk(directResponse);
                        return;
                    }

                    // CUA path: SendActivity the "Got it" acknowledgment FIRST, before the streaming
                    // response begins. If we send it later (e.g. from inside onCuaStarting), Teams/
                    // Emulator orders it visually AFTER the streaming activity's final text since
                    // the streaming activity was created earlier in the turn — the user sees the
                    // result before the acknowledgment. SendActivity here gets an earlier ID than
                    // the streaming activity that starts on the next line.
                    await turnContext.SendActivityAsync(MessageFactory.Text("Got it — working on it…"), cancellationToken).ConfigureAwait(false);

                    // Get MCP tools — direct connection in Dev, SDK in Production.
                    //
                    // No "Acquiring…" bubble: GetToolsAsync may take >2s even on cache reuse
                    // (OBO token + headers + S2S add up), and the agent has no reliable way to
                    // distinguish reuse from fresh checkout in advance. A misleading bubble on
                    // every reuse was worse than no bubble at all. The "Got it — working on it…"
                    // bubble already provides feedback that the agent is working.
                    //
                    // The MCP client is deliberately not disposed here — it's reused across
                    // messages and needed for EndSession on shutdown. It is disposed with the app.
                    var (w365Tools, additionalTools, mcpClient) = await GetToolsAsync(turnContext, authHandlerName, includeW365: true);

                    if (w365Tools == null || w365Tools.Count == 0)
                    {
                        // ATG wraps tools/list failures into a synthetic "Error" tool whose Description
                        // carries the real reason (e.g. "no pool with an available session was found").
                        // Extract it so the user sees the actionable message instead of the generic
                        // "Unable to connect" placeholder.
                        var errorMessage = ExtractW365ToolListError(additionalTools)
                            ?? "Unable to connect to the W365 Computer Use service. Please check your configuration.";
                        // Write the error into the streaming response so EndStreamAsync doesn't
                        // emit a confusing 'No text was streamed' alongside the real message.
                        turnContext.StreamingResponse.QueueTextChunk(errorMessage);
                        return;
                    }

                    var graphToken = await ResolveGraphTokenAsync(turnContext, authHandlerName);

                    // Run the CUA loop — session is managed per conversation
                    var response = await _orchestrator.RunAsync(
                        conversationId,
                        userText,
                        w365Tools,
                        additionalTools: additionalTools,
                        mcpClient: mcpClient,
                        graphAccessToken: graphToken,
                        onStatusUpdate: status => turnContext.StreamingResponse.QueueInformativeUpdateAsync(status).ConfigureAwait(false),
                        onCuaStarting: async (isNewSession) =>
                        {
                            if (isNewSession)
                            {
                                await turnContext.StreamingResponse.QueueInformativeUpdateAsync("Starting a session to a Windows 365 Cloud PC…");
                            }
                        },
                        onFolderLinkReady: async url => await turnContext.SendActivityAsync(
                            MessageFactory.Text($"📸 Screenshots for this request: [View folder]({url})"), cancellationToken),
                        cancellationToken: cancellationToken);

                    // Send the response
                    turnContext.StreamingResponse.QueueTextChunk(response);
                }
                finally
                {
                    try
                    {
                        await turnContext.StreamingResponse.EndStreamAsync(cancellationToken).ConfigureAwait(false);
                    }
                    catch (ObjectDisposedException)
                    {
                        /* stream already disposed */
                    }
                }
            });
    }

    /// <summary>
    /// Gets the Graph token used for OneDrive screenshot upload.
    /// In production it is acquired via the turn's auth handler (UserAuthorization).
    /// In development, set the GRAPH_TOKEN env var with a token that has Files.ReadWrite scope.
    /// </summary>
    /// <returns>The token, or null/empty when neither source provides one.</returns>
    private async Task<string?> ResolveGraphTokenAsync(ITurnContext turnContext, string? authHandlerName)
    {
        string? graphToken = null;
        if (!string.IsNullOrEmpty(authHandlerName))
        {
            graphToken = await UserAuthorization.GetTurnTokenAsync(turnContext, authHandlerName);
        }

        return string.IsNullOrEmpty(graphToken)
            ? Environment.GetEnvironmentVariable("GRAPH_TOKEN")
            : graphToken;
    }

    /// <summary>
    /// Get MCP tools, separated into W365 (CUA) and additional (function) tools.
    /// In Development mode with a bearer token, connects directly to the MCP server URL.
    /// In Production, uses the A365 SDK to discover servers via the Tooling Gateway.
    /// When <paramref name="includeW365"/> is false, the W365 server(s) are skipped —
    /// used on the non-CUA fast path so ATG never acquires a Cloud PC session for chit-chat.
    /// </summary>
    /// <returns>
    /// The W365 tools (null when <paramref name="includeW365"/> is false), the remaining tools and,
    /// on the Development full-load path only, the MCP client. All three are null when no token or
    /// agent identity is available, or when loading failed and SKIP_TOOLING_ON_ERRORS applies.
    /// </returns>
    /// <exception cref="InvalidOperationException">Development path with no MCP server URL configured (rethrown unless tooling errors are skipped).</exception>
    private async Task<(IList<AITool>? W365Tools, IList<AITool>? AdditionalTools, IMcpClient? Client)> GetToolsAsync(ITurnContext context, string? authHandlerName, bool includeW365)
    {
        // Acquire access token
        string? accessToken = null;
        string? agentId = null;

        if (!string.IsNullOrEmpty(authHandlerName))
        {
            accessToken = await UserAuthorization.GetTurnTokenAsync(context, authHandlerName);
            agentId = Utility.ResolveAgentIdentity(context, accessToken);
        }
        else if (TryGetBearerTokenForDevelopment(out var bearerToken))
        {
            _logger.LogInformation("Using bearer token from environment.");
            accessToken = bearerToken;
            agentId = Utility.ResolveAgentIdentity(context, accessToken!);
        }

        if (string.IsNullOrEmpty(accessToken) || string.IsNullOrEmpty(agentId))
        {
            _logger.LogWarning("No auth token or agent identity available. Cannot connect to MCP.");
            return (null, null, null);
        }

        try
        {
            IList<AITool>? allTools;
            IMcpClient? mcpClient = null;

            // Development with bearer token: use orchestrator's cached MCP connection
            if (TryGetBearerTokenForDevelopment(out _) && IsDevelopment())
            {
                if (_mcpServerUrls.Length == 0)
                {
                    throw new InvalidOperationException("McpServers (or McpServer:Url) is required in appsettings for Development mode.");
                }

                if (includeW365)
                {
                    // Full load: W365 + everything else. The orchestrator's cache covers both.
                    (allTools, mcpClient) = await _orchestrator.GetOrCreateMcpConnectionAsync(_mcpServerUrls, accessToken!);
                }
                else
                {
                    // Non-CUA fast path: skip W365 entirely. Non-W365 tools have their own cache
                    // in the orchestrator so we don't reconnect on every non-CUA message.
                    allTools = await _orchestrator.GetOrCreateNonW365McpConnectionAsync(_otherMcpServerUrls, accessToken!);
                }
            }
            else
            {
                // Production: use the A365 SDK's tooling gateway for server discovery.
                // NOTE: The SDK loads all registered servers' tools in one call, including W365.
                // The includeW365 flag can't suppress the W365 load in prod today — the SDK has
                // no per-server loading API. The CUA gate still saves compute on the non-CUA
                // branch (no CUA loop, no screenshots), but not the Cloud PC session.
                var handlerForMcp = !string.IsNullOrEmpty(authHandlerName)
                    ? authHandlerName
                    : _oboAuthHandlerName ?? _agenticAuthHandlerName ?? string.Empty;
                var tokenOverride = string.IsNullOrEmpty(authHandlerName) ? accessToken : null;

                allTools = await _toolService.GetMcpToolsAsync(agentId, UserAuthorization, handlerForMcp, context, tokenOverride).ConfigureAwait(false);
            }

            var w365Tools = includeW365 ? FilterW365Tools(allTools) : null;
            var additionalTools = FilterAdditionalTools(allTools);
            return (w365Tools, additionalTools, mcpClient);
        }
        catch (Exception ex)
        {
            if (ShouldSkipToolingOnErrors())
            {
                _logger.LogWarning(ex, "Failed to connect to MCP servers. Continuing without tools (SKIP_TOOLING_ON_ERRORS=true).");
                return (null, null, null);
            }

            _logger.LogError(ex, "Failed to connect to MCP servers.");
            throw;
        }
    }

    /// <summary>
    /// True when ASPNETCORE_ENVIRONMENT (or, failing that, DOTNET_ENVIRONMENT) is "Development";
    /// an unset environment counts as "Production".
    /// </summary>
    private static bool IsDevelopment()
    {
        var env = Environment.GetEnvironmentVariable("ASPNETCORE_ENVIRONMENT")
            ?? Environment.GetEnvironmentVariable("DOTNET_ENVIRONMENT")
            ?? "Production";
        return env.Equals("Development", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>Name used to classify a tool: the function name, else its string form, else empty.</summary>
    private static string GetToolName(AITool tool) =>
        (tool as AIFunction)?.Name ?? tool.ToString() ?? string.Empty;

    /// <summary>
    /// Returns the tools whose name is a W365 CUA tool name, logging how many were found.
    /// Returns null when <paramref name="allTools"/> is null.
    /// </summary>
    private IList<AITool>? FilterW365Tools(IList<AITool>? allTools)
    {
        var w365Tools = allTools?
            .Where(tool => ComputerUseOrchestrator.IsW365CuaTool(GetToolName(tool)))
            .ToList();

        if (w365Tools is { Count: > 0 })
        {
            _logger.LogInformation("Found {ToolCount} W365 Computer Use tools", w365Tools.Count);
        }
        else
        {
            _logger.LogWarning("No W365 tools found among {TotalCount} MCP tools", allTools?.Count ?? 0);
        }

        return w365Tools;
    }

    /// <summary>
    /// Returns the tools whose name is not a W365 CUA tool name, logging their names when any exist.
    /// Returns null when <paramref name="allTools"/> is null.
    /// </summary>
    private IList<AITool>? FilterAdditionalTools(IList<AITool>? allTools)
    {
        var additionalTools = allTools?
            .Where(tool => !ComputerUseOrchestrator.IsW365CuaTool(GetToolName(tool)))
            .ToList();

        if (additionalTools is { Count: > 0 })
        {
            _logger.LogInformation("Found {ToolCount} additional function tools: {Names}",
                additionalTools.Count,
                string.Join(", ", additionalTools.Select(t => (t as AIFunction)?.Name ?? "?")));
        }

        return additionalTools;
    }

    /// <summary>
    /// Looks for ATG's synthetic Error tool in the non-CUA tool list and extracts a
    /// user-facing error reason from its description. ATG formats the description as:
    /// "Tool list retrieval failed. Message='...'. ExceptionType='...'. ExceptionMessage='...'. CorrelationId=..., TimeStamp=...".
    /// We prefer the ExceptionMessage field because it carries the specific reason
    /// (e.g. "Failed to acquire a W365 session: no pool with an available session was found.").
    /// Returns null if no error tool is present or the description can't be parsed.
    /// </summary>
    private static string? ExtractW365ToolListError(IList<AITool>? additionalTools)
    {
        if (additionalTools is null || additionalTools.Count == 0)
        {
            return null;
        }

        // Only the first tool named "Error" is consulted, matching how the description is produced once per failure.
        var errorTool = additionalTools
            .OfType<AIFunction>()
            .FirstOrDefault(fn => string.Equals(fn.Name, "Error", StringComparison.OrdinalIgnoreCase));
        if (errorTool is null)
        {
            return null;
        }

        var description = errorTool.Description ?? string.Empty;
        return ExtractQuotedField(description, "ExceptionMessage=")
            ?? ExtractQuotedField(description, "Message=")
            ?? (string.IsNullOrWhiteSpace(description) ? null : description);
    }

    /// <summary>
    /// Extracts the single-quoted value that follows <paramref name="fieldPrefix"/> in
    /// <paramref name="source"/>. The value ends at the first "'." after the opening quote,
    /// or at the next single quote when no "'." exists.
    /// </summary>
    /// <returns>The value, or null when the field is missing, unterminated, or blank.</returns>
    private static string? ExtractQuotedField(string source, string fieldPrefix)
    {
        var startMarker = fieldPrefix + "'";
        var markerIndex = source.IndexOf(startMarker, StringComparison.Ordinal);
        if (markerIndex < 0)
        {
            return null;
        }

        var start = markerIndex + startMarker.Length;

        // Prefer the "'." terminator so apostrophes inside the message don't cut it short.
        var end = source.IndexOf("'.", start, StringComparison.Ordinal);
        if (end < 0)
        {
            end = source.IndexOf('\'', start);
        }

        if (end < start)
        {
            return null;
        }

        var value = source[start..end];
        return string.IsNullOrWhiteSpace(value) ? null : value;
    }
}
/// <summary>
/// Registers JWT bearer authentication for the agent endpoint using the supplied validation options.
/// Missing issuer and OpenID metadata settings on <paramref name="validationOptions"/> are filled
/// in with defaults (the options object is modified in place). For each request, the token's issuer
/// claim selects which cached OpenID metadata configuration manager validates it.
/// </summary>
/// <param name="services">Service collection to add authentication to.</param>
/// <param name="validationOptions">Token validation settings; must list at least one GUID audience.</param>
/// <exception cref="ArgumentException">No audience is configured, or an audience is not a GUID.</exception>
public static void AddAgentAspNetAuthentication(this IServiceCollection services, TokenValidationOptions validationOptions)
{
    AssertionHelpers.ThrowIfNull(validationOptions, nameof(validationOptions));

    if (validationOptions.Audiences == null || validationOptions.Audiences.Count == 0)
    {
        throw new ArgumentException($"{nameof(TokenValidationOptions)}:Audiences requires at least one ClientId");
    }

    if (validationOptions.Audiences.Any(audience => !Guid.TryParse(audience, out _)))
    {
        throw new ArgumentException($"{nameof(TokenValidationOptions)}:Audiences values must be a GUID");
    }

    if (validationOptions.ValidIssuers == null || validationOptions.ValidIssuers.Count == 0)
    {
        validationOptions.ValidIssuers =
        [
            "https://api.botframework.com",
            "https://sts.windows.net/d6d49420-f39b-4df7-a1dc-d59a935871db/",
            "https://login.microsoftonline.com/d6d49420-f39b-4df7-a1dc-d59a935871db/v2.0",
            "https://sts.windows.net/f8cdef31-a31e-4b4a-93e4-5f571e91255a/",
            "https://login.microsoftonline.com/f8cdef31-a31e-4b4a-93e4-5f571e91255a/v2.0",
            "https://sts.windows.net/69e9b82d-4842-4902-8d1e-abc5b98a55e8/",
            "https://login.microsoftonline.com/69e9b82d-4842-4902-8d1e-abc5b98a55e8/v2.0",
        ];

        if (!string.IsNullOrEmpty(validationOptions.TenantId) && Guid.TryParse(validationOptions.TenantId, out _))
        {
            validationOptions.ValidIssuers.Add(string.Format(CultureInfo.InvariantCulture, AuthenticationConstants.ValidTokenIssuerUrlTemplateV1, validationOptions.TenantId));
            validationOptions.ValidIssuers.Add(string.Format(CultureInfo.InvariantCulture, AuthenticationConstants.ValidTokenIssuerUrlTemplateV2, validationOptions.TenantId));
        }
    }

    if (string.IsNullOrEmpty(validationOptions.AzureBotServiceOpenIdMetadataUrl))
    {
        validationOptions.AzureBotServiceOpenIdMetadataUrl = validationOptions.IsGov ? AuthenticationConstants.GovAzureBotServiceOpenIdMetadataUrl : AuthenticationConstants.PublicAzureBotServiceOpenIdMetadataUrl;
    }

    if (string.IsNullOrEmpty(validationOptions.OpenIdMetadataUrl))
    {
        validationOptions.OpenIdMetadataUrl = validationOptions.IsGov ? AuthenticationConstants.GovOpenIdMetadataUrl : AuthenticationConstants.PublicOpenIdMetadataUrl;
    }

    var openIdMetadataRefresh = validationOptions.OpenIdMetadataRefresh ?? BaseConfigurationManager.DefaultAutomaticRefreshInterval;

    _ = services.AddAuthentication(options =>
    {
        options.DefaultAuthenticateScheme = JwtBearerDefaults.AuthenticationScheme;
        options.DefaultChallengeScheme = JwtBearerDefaults.AuthenticationScheme;
    })
    .AddJwtBearer(options =>
    {
        options.SaveToken = true;
        options.TokenValidationParameters = new TokenValidationParameters
        {
            ValidateIssuer = true,
            ValidateAudience = true,
            ValidateLifetime = true,
            ClockSkew = TimeSpan.FromMinutes(5),
            ValidIssuers = validationOptions.ValidIssuers,
            ValidAudiences = validationOptions.Audiences,
            ValidateIssuerSigningKey = true,
            RequireSignedTokens = true,
        };

        options.TokenValidationParameters.EnableAadSigningKeyIssuerValidation();

        options.Events = new JwtBearerEvents
        {
            OnMessageReceived = context =>
            {
                // NOTE(review): this assigns the configuration manager on the options' shared
                // TokenValidationParameters for every request; looks like concurrent requests
                // from different issuers could observe each other's choice — confirm.
                var validationParameters = context.Options.TokenValidationParameters;
                string authorizationHeader = context.Request.Headers.Authorization.ToString();

                var rawToken = GetBearerTokenOrNull(authorizationHeader);
                if (rawToken is null || !TryReadIssuer(rawToken, out var issuer))
                {
                    // No usable bearer token: keep whatever manager is already set (or the
                    // scheme default) and let the JwtBearer handler reject the request itself.
                    validationParameters.ConfigurationManager ??= options.ConfigurationManager as BaseConfigurationManager;
                    return Task.CompletedTask;
                }

                var useBotServiceMetadata = validationOptions.AzureBotServiceTokenHandling
                    && string.Equals(AuthenticationConstants.BotFrameworkTokenIssuer, issuer, StringComparison.Ordinal);
                var metadataUrl = useBotServiceMetadata
                    ? validationOptions.AzureBotServiceOpenIdMetadataUrl!
                    : validationOptions.OpenIdMetadataUrl!;

                // One configuration manager (and HttpClient) per metadata URL for the process lifetime.
                validationParameters.ConfigurationManager = _openIdMetadataCache.GetOrAdd(
                    metadataUrl,
                    url => new ConfigurationManager<OpenIdConnectConfiguration>(url, new OpenIdConnectConfigurationRetriever(), new HttpClient())
                    {
                        AutomaticRefreshInterval = openIdMetadataRefresh
                    });

                return Task.CompletedTask;
            },
            OnTokenValidated = context => Task.CompletedTask,
            OnForbidden = context => Task.CompletedTask,
            OnAuthenticationFailed = context => Task.CompletedTask
        };
    });
}

/// <summary>
/// Returns the credential part of a "Bearer &lt;token&gt;" header value, or null when the header is
/// empty, is not exactly two space-separated parts, or uses another scheme. The scheme name is
/// compared case-insensitively.
/// </summary>
private static string? GetBearerTokenOrNull(string authorizationHeader)
{
    if (string.IsNullOrEmpty(authorizationHeader))
    {
        return null;
    }

    string[] parts = authorizationHeader.Split(' ');
    return parts.Length == 2 && string.Equals(parts[0], "Bearer", StringComparison.OrdinalIgnoreCase)
        ? parts[1]
        : null;
}

/// <summary>
/// Parses <paramref name="rawToken"/> as a JWT without validating it and reads its issuer claim.
/// </summary>
/// <param name="rawToken">Compact-serialized token taken from the Authorization header.</param>
/// <param name="issuer">The issuer claim value, or null when the token has none or cannot be parsed.</param>
/// <returns><c>false</c> when the token is malformed; <c>true</c> otherwise (even without an issuer claim).</returns>
private static bool TryReadIssuer(string rawToken, out string? issuer)
{
    issuer = null;
    try
    {
        JwtSecurityToken token = new(rawToken);
        issuer = token.Claims.FirstOrDefault(claim => claim.Type == AuthenticationConstants.IssuerClaim)?.Value;
        return true;
    }
    catch (Exception ex) when (ex is ArgumentException or SecurityTokenException)
    {
        // A malformed token must not escape the event as an unhandled exception; report
        // "not readable" so the caller falls back and the handler fails authentication normally.
        return false;
    }
}
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

using System.Text;

namespace W365ComputerUseSample.ComputerUse;

/// <summary>
/// Sends CUA model requests to Azure OpenAI using an API key.
/// This is the default provider for external customers.
/// </summary>
public class AzureOpenAIModelProvider : ICuaModelProvider
{
    private const string DefaultApiVersion = "2025-04-01-preview";
    private const string DefaultModelName = "computer-use-preview";

    private readonly HttpClient _httpClient;
    private readonly string _url;
    private readonly string _apiKey;
    private readonly ILogger<AzureOpenAIModelProvider> _logger;

    /// <summary>
    /// Configured model name; falls back to the deployment name, then to "computer-use-preview".
    /// </summary>
    public string ModelName { get; }

    /// <summary>
    /// Reads the <c>AIServices:AzureOpenAI</c> settings and builds the Responses endpoint URL.
    /// </summary>
    /// <param name="httpClientFactory">Factory used to obtain the named "WebClient" client.</param>
    /// <param name="configuration">Source of Endpoint, ApiKey, ApiVersion, DeploymentName and ModelName.</param>
    /// <param name="logger">Logger for this provider.</param>
    /// <exception cref="InvalidOperationException">Endpoint or ApiKey is not configured.</exception>
    public AzureOpenAIModelProvider(IHttpClientFactory httpClientFactory, IConfiguration configuration, ILogger<AzureOpenAIModelProvider> logger)
    {
        _httpClient = httpClientFactory.CreateClient("WebClient");
        _logger = logger;

        var endpoint = configuration["AIServices:AzureOpenAI:Endpoint"]
            ?? throw new InvalidOperationException("AIServices:AzureOpenAI:Endpoint is required.");
        _apiKey = configuration["AIServices:AzureOpenAI:ApiKey"]
            ?? throw new InvalidOperationException("AIServices:AzureOpenAI:ApiKey is required.");
        var apiVersion = configuration["AIServices:AzureOpenAI:ApiVersion"] ?? DefaultApiVersion;

        // DeploymentName = deployment-based URL; ModelName = model-based URL (model sent in body)
        var deploymentName = configuration["AIServices:AzureOpenAI:DeploymentName"];
        ModelName = configuration["AIServices:AzureOpenAI:ModelName"]
            ?? deploymentName
            ?? DefaultModelName;

        var baseUrl = endpoint.TrimEnd('/');
        var query = $"api-version={Uri.EscapeDataString(apiVersion)}";

        // Escape configured values so an unusual deployment name or version cannot alter the
        // path or query. With no deployment name the model-based endpoint is used and the
        // model name travels in the request body instead of the URL.
        _url = string.IsNullOrEmpty(deploymentName)
            ? $"{baseUrl}/openai/responses?{query}"
            : $"{baseUrl}/openai/deployments/{Uri.EscapeDataString(deploymentName)}/responses?{query}";
    }

    /// <summary>
    /// Posts <paramref name="requestBody"/> as JSON to the configured endpoint and returns the response body.
    /// </summary>
    /// <param name="requestBody">Serialized JSON request payload.</param>
    /// <param name="cancellationToken">Cancels the request and the body read.</param>
    /// <returns>The raw response body of a successful call.</returns>
    /// <exception cref="HttpRequestException">
    /// The service returned a non-success status; the message contains the status and error body,
    /// and <see cref="HttpRequestException.StatusCode"/> carries the status code.
    /// </exception>
    public async Task<string> SendAsync(string requestBody, CancellationToken cancellationToken)
    {
        _logger.LogInformation("Azure OpenAI request URL: {Url}", _url);

        using var request = new HttpRequestMessage(HttpMethod.Post, _url);
        request.Headers.Add("api-key", _apiKey);
        request.Content = new StringContent(requestBody, Encoding.UTF8, "application/json");

        // Dispose the response so its connection and content stream are released on every path.
        using var response = await _httpClient.SendAsync(request, cancellationToken);
        var body = await response.Content.ReadAsStringAsync(cancellationToken);

        if (!response.IsSuccessStatusCode)
        {
            throw new HttpRequestException(
                $"Azure OpenAI returned {response.StatusCode}: {body}",
                inner: null,
                statusCode: response.StatusCode);
        }

        return body;
    }
}
Supports multiple concurrent sessions keyed by conversation ID. +/// +public class ComputerUseOrchestrator +{ + /// + /// Names of the CUA tools exposed by the W365 remote MCP server (as returned by its + /// tools/list). Used to identify which tools came from the W365 server vs other MCP servers + /// (mail, calendar, etc.) without per-server tracking. Includes ATG's local EndSession + /// mcptool. Update when W365 adds/renames tools. + /// + internal static readonly HashSet W365CuaToolNames = new(StringComparer.OrdinalIgnoreCase) + { + // Desktop interaction + "take_screenshot", "click", "type_text", "press_keys", "scroll", "move_mouse", + "drag_mouse", "wait_milliseconds", "get_cursor_position", "get_screen_size", + // Window management + "list_windows", "activate_window", "close_window", + // Accessibility / OCR + "get_accessibility_tree", "find_ui_element", "analyze_screen", + // Browser + "browser_navigate", "browser_click", "browser_type", "browser_get_html", + "browser_get_text", "browser_get_url", "browser_get_title", "browser_query_text", + "browser_list_tabs", "browser_switch_tab", "browser_close_tab", "browser_new_tab", + "browser_back", "browser_forward", "browser_reload", "browser_wait_for", + "browser_eval_js", "browser_screenshot", "focus_browser", + // Code / shell execution + "execute_python_code", "execute_shell_command", + // ATG-local tool + "mcp_W365ComputerUse_EndSession", + }; + + /// Returns true when identifies a W365 CUA tool. + internal static bool IsW365CuaTool(string? toolName) + => !string.IsNullOrEmpty(toolName) && W365CuaToolNames.Contains(toolName); + + private readonly ICuaModelProvider _modelProvider; + private readonly IHttpClientFactory _httpClientFactory; + private readonly HttpClient _httpClient; + private readonly ILogger _logger; + private readonly int _maxIterations; + private readonly string? _screenshotPath; + private readonly string? _oneDriveFolder; + private readonly string? 
_oneDriveUserId; + private readonly string _toolType; + private readonly List _tools; + + /// + /// Per-conversation session state. Each conversation (user chat) gets its own + /// W365 session, conversation history, and screenshot counter. + /// + private readonly ConcurrentDictionary _sessions = new(); + + /// + /// Primary MCP client (W365 server) — used for direct screenshot calls. + /// + private IMcpClient? _cachedMcpClient; + + /// + /// All MCP clients — one per connected server, for cleanup on shutdown. + /// + private readonly List _allMcpClients = []; + + /// + /// Shared tool list — merged tools from all connected servers. + /// + private IList? _cachedTools; + + /// + /// Tool list from non-W365 MCP servers (mail, calendar, etc.). Cached separately so that + /// non-CUA messages can be served without loading the W365 server's tools/list, which + /// triggers ATG's hostname-discovery handler and acquires a Cloud PC session (10-30s). + /// + private IList? _cachedNonW365Tools; + + /// + /// True when this orchestrator has already loaded a tools/list for the W365 MCP server in + /// the current process. A tools/list call is what triggers ATG's hostname-discovery handler + /// to acquire a Cloud PC session, so if we've cached W365 tools, ATG has a live session + /// on our behalf and the agent doesn't need to show an "acquiring session" status. + /// Does not account for server-side idle reaps (~30 min on ATG) — those surface as tool-call + /// failures handled by the orchestrator's RecoverSessionAsync path. + /// + public bool HasCachedW365Tools => _cachedTools != null && _cachedTools.Any(t => IsW365CuaTool((t as AIFunction)?.Name)); + + private const string SystemInstructions = """ + You are a helpful assistant that can also control a Windows desktop computer. + If the user's message is conversational or doesn't require computer use, respond with a helpful text message. + + ## Function tools (email, calendar, etc.) 
+ You may have access to function tools for tasks like sending email, managing calendar, etc. + Prefer function tools over computer use when a matching one is available — they are faster and more reliable. + After calling a function tool, respond with a text message describing what you did and the result. + Do NOT call OnTaskComplete after using function tools — just respond with text. + + ## When no tool can accomplish the request + If the user asks for something and no function tool matches AND computer use cannot accomplish it either, + respond with a text message explaining clearly that you are unable to perform that task and why + (e.g. "I don't have an email tool available in this environment"). + Do NOT call OnTaskComplete in this case — only call OnTaskComplete when you have actually completed a computer-use task. + + ## Computer use (desktop control) + Only use computer actions when no function tool can accomplish the task. + When a task requires computer use, perform the actions and examine screenshots to verify they worked. + If you see browser setup or sign-in dialogs, dismiss them (Escape, X, or Skip). + Once you have completed a computer use task, call the OnTaskComplete function. + Do NOT continue looping after the task is done. + If the user sends a casual greeting or question that does not require computer use, reply with a helpful text message. + + ## Ending the Cloud PC session + Call the EndSession function ONLY when the user explicitly asks to end, close, + disconnect, release, or quit the session, or otherwise says they are done with + all work on the Cloud PC. Trigger phrases include: "end session", "close session", + "disconnect", "release the VM", "I'm done", "quit", "shut it down", "log off". + + Do NOT call EndSession in any of these situations: + - The user is starting a new task (e.g. "go to bbc.com", "open Word", "navigate to ..."). + - The user is switching topics or apps within the same session. 
+ - You just completed a previous task — call OnTaskComplete instead, which keeps the session open for the next request. + - The user sends a casual greeting, question, or anything that's not an explicit request to end the session. + + Switching tasks inside one session is normal and expected. The session should + remain open across many user requests until the user explicitly asks to end it. + """; + + public ComputerUseOrchestrator( + ICuaModelProvider modelProvider, + IHttpClientFactory httpClientFactory, + IConfiguration configuration, + ILogger logger) + { + _modelProvider = modelProvider; + _httpClientFactory = httpClientFactory; + _httpClient = httpClientFactory.CreateClient("WebClient"); + _logger = logger; + _maxIterations = configuration.GetValue("ComputerUse:MaxIterations", 30); + _screenshotPath = configuration["Screenshots:LocalPath"]; + _oneDriveFolder = configuration["Screenshots:OneDriveFolder"]; + _oneDriveUserId = configuration["Screenshots:OneDriveUserId"]; + + _toolType = configuration["ComputerUse:ToolType"] ?? ""; + if (string.IsNullOrEmpty(_toolType)) + { + // Auto-derive from model name: gpt-* models use "computer", others use "computer_use_preview" + var modelName = _modelProvider.ModelName; + _toolType = modelName.StartsWith("gpt-", StringComparison.OrdinalIgnoreCase) ? 
"computer" : "computer_use_preview"; + } + var displayWidth = configuration.GetValue("ComputerUse:DisplayWidth", 1024); + var displayHeight = configuration.GetValue("ComputerUse:DisplayHeight", 768); + + // Build the computer tool definition based on the tool type: + // "computer_use_preview" — computer-use-preview model: display_width, display_height, environment + // "computer" — GPT-5.4+ models (Azure OpenAI): bare type, no params + object computerTool = _toolType switch + { + "computer" => new ComputerToolV2(), + _ => new ComputerUseTool { DisplayWidth = displayWidth, DisplayHeight = displayHeight } + }; + + _logger.LogInformation("CUA tool type: {ToolType}, display: {Width}x{Height}", _toolType, displayWidth, displayHeight); + + _tools = + [ + computerTool, + new FunctionToolDefinition + { + Name = "OnTaskComplete", + Description = "Call this function when the given task has been completed successfully." + }, + new FunctionToolDefinition + { + Name = "EndSession", + Description = "Call this function when the user wants to end, quit, disconnect, or release their computer session." + } + ]; + } + +
    /// <summary>
    /// Lightweight intent classifier: decides whether a user message needs computer-use (CUA).
    /// Runs a single tool-less model call and reads the first word of the first <c>message</c>
    /// item in the reply. Only an explicit <c>NO</c> (any casing, surrounding punctuation ignored)
    /// yields <c>false</c>; every other reply, a missing or empty output, or a failure of the model
    /// call yields <c>true</c> so we fall back to the full CUA loop — safer to pay the W365
    /// session cost than miss a legitimate computer-use request.
    /// </summary>
    /// <param name="userMessage">The raw user message to classify.</param>
    /// <param name="cancellationToken">Token that cancels the model call.</param>
    /// <returns><c>true</c> when the message should take the CUA path; <c>false</c> only on an explicit NO.</returns>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="cancellationToken"/> was cancelled. Cancellation is propagated instead of being
    /// reported as "needs CUA", so a cancelled turn does not continue down the CUA path.
    /// </exception>
    public async Task<bool> ClassifyNeedsCuaAsync(string userMessage, CancellationToken cancellationToken = default)
    {
        const string ClassifierInstructions = """
            You are a router. Decide whether the user's message requires controlling or managing a
            Windows desktop: clicking, typing into apps, taking screenshots, opening programs,
            interacting with the GUI, OR ending/releasing a Cloud PC session.
            Answer with a single word:
            YES — if it needs desktop control or session management
            NO — if it is chit-chat, a question answerable from knowledge, or a request that can
            be fulfilled with mail/calendar/Teams/other function tools only
            When uncertain, prefer YES.
            """;

        try
        {
            var body = JsonSerializer.Serialize(new ComputerUseRequest
            {
                Model = _modelProvider.ModelName,
                Instructions = ClassifierInstructions,
                Input = [CreateUserMessage(userMessage)],
                Tools = [],
                Truncation = "auto"
            }, new JsonSerializerOptions { DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull });

            var responseJson = await _modelProvider.SendAsync(body, cancellationToken);

            // NOTE(review): response type assumed to be ComputerUseResponse (the counterpart of
            // ComputerUseRequest in the Models namespace) — confirm against the model classes.
            var response = JsonSerializer.Deserialize<ComputerUseResponse>(responseJson);
            if (response?.Output == null)
            {
                return true;
            }

            foreach (var item in response.Output)
            {
                if (!item.TryGetProperty("type", out var tProp) || tProp.GetString() != "message")
                {
                    continue;
                }

                var replyText = ExtractText(item).Trim();
                _logger.LogInformation("CUA intent classifier reply for message {Preview}: {Reply}", Truncate(userMessage, 80), Truncate(replyText, 60));

                // The router is instructed to emit a single word but may wrap it in fluff.
                // Compare the whole leading word: a prefix test would read "NOT SURE" or
                // "NOTE: ..." as NO and silently drop a legitimate computer-use request.
                return !IsExplicitNo(replyText);
            }

            return true;
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // The caller cancelled this turn; don't translate that into "needs CUA".
            throw;
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "CUA intent classifier threw — defaulting to needsCua=true.");
            return true;
        }

        // True when the first run of letters in the reply is exactly "NO" (ordinal, case-insensitive).
        static bool IsExplicitNo(string reply)
        {
            var start = 0;
            while (start < reply.Length && !char.IsLetter(reply[start]))
            {
                start++;
            }

            var end = start;
            while (end < reply.Length && char.IsLetter(reply[end]))
            {
                end++;
            }

            return string.Equals(reply[start..end], "NO", StringComparison.OrdinalIgnoreCase);
        }
    }
+ + /// + /// Run the CUA loop for a specific conversation.
When is + /// false, the computer tool is withheld from the model's tool list so it + /// cannot emit computer_call actions — used on the non-CUA fast path where the + /// router decided the message doesn't need desktop control, so no W365 session is acquired. + /// + public async Task RunAsync( + string conversationId, + string userMessage, + IList w365Tools, + IList? additionalTools = null, + IMcpClient? mcpClient = null, + string? graphAccessToken = null, + Action? onStatusUpdate = null, + Func? onCuaStarting = null, + Func? onFolderLinkReady = null, + bool includeCuaTool = true, + CancellationToken cancellationToken = default) + { + _logger.LogInformation("Processing message for conversation {ConversationId}: {Message}", conversationId, Truncate(userMessage, 100)); + + var session = _sessions.GetOrAdd(conversationId, _ => new ConversationSession()); + + if (session.SessionStarted) + { + _logger.LogInformation("Reusing session for conversation {ConversationId}, W365SessionId={SessionId}", conversationId, session.W365SessionId); + } + + // Two-level screenshot folder layout: {yyyy-MM-dd}/{HHmmss}_{prompt-slug}. + // Set it here on every CUA-bound turn so each user prompt that triggers the CUA loop + // gets its own subfolder. Reset FolderShared so the new folder gets a fresh share + // link surfaced via onFolderLinkReady. Non-CUA turns keep the existing folder (or + // null) — they don't take screenshots, so the value is irrelevant. 
+ if (includeCuaTool) + { + var promptSlug = SanitizeForPath(userMessage, maxLen: 30); + session.ScreenshotSubfolder = $"{DateTime.UtcNow:yyyy-MM-dd}/{DateTime.UtcNow:HHmmss}_{promptSlug}"; + session.FolderShared = false; + _logger.LogInformation("CUA turn folder for conversation {ConversationId}: {Folder}", conversationId, session.ScreenshotSubfolder); + } + + // For "computer" tool type (gpt-5.4+), include a screenshot with the FIRST user message if session already active + if (_toolType == "computer" && session.ConversationHistory.Count == 0 && session.SessionStarted) + { + var initialScreenshot = await CaptureScreenshotAsync(w365Tools, mcpClient, session.W365SessionId, cancellationToken); + var initialName = $"{conversationId[..8]}_{++session.ScreenshotCounter:D3}_initial"; + SaveScreenshotToDisk(initialScreenshot!, initialName, session.ScreenshotSubfolder); + var folderUrlReuse = await UploadScreenshotToOneDriveAsync(initialScreenshot!, $"{initialName}.png", graphAccessToken, session.ScreenshotSubfolder, session); + if (folderUrlReuse != null && onFolderLinkReady != null) + await onFolderLinkReady(folderUrlReuse); + session.ConversationHistory.Add(ToJsonElement(new + { + type = "message", + role = "user", + content = new object[] + { + new { type = "input_text", text = userMessage }, + new { type = "input_image", image_url = $"data:image/png;base64,{initialScreenshot}" } + } + })); + } + else + { + session.ConversationHistory.Add(CreateUserMessage(userMessage)); + } + + // Build the model's tools list — computer + OnTaskComplete + any additional function tools. + // When includeCuaTool is false (non-CUA fast path), skip all CUA-specific tools entirely + // (computer, OnTaskComplete, EndSession). Those require an active W365 session, and the + // classifier has already decided we don't need one. Only the caller-provided additional + // tools (mail/calendar/etc.) remain visible to the model. + var modelTools = includeCuaTool + ? 
new List(_tools) + : new List(); + if (additionalTools?.Count > 0) + { + // ATG injects a synthetic "Error" sentinel tool when any MCP server's tools/list + // fails (e.g. W365 session acquisition error). Its parameters schema is `{}` with + // no properties — Azure OpenAI rejects that with `invalid_function_parameters`. + // MyAgent reads the Error description for user-facing messaging before getting + // here, so it's safe to drop from the LLM call. + var llmTools = additionalTools.OfType() + .Where(t => !string.Equals(t.Name, "Error", StringComparison.OrdinalIgnoreCase)) + .ToList(); + + foreach (var tool in llmTools) + { + modelTools.Add(new FunctionToolDefinition + { + Name = tool.Name, + Description = tool.Description ?? string.Empty, + Parameters = tool.JsonSchema + }); + } + + _logger.LogInformation("Added {Count} additional function tools to model", llmTools.Count); + foreach (var tool in llmTools) + { + var schemaStr = tool.JsonSchema.GetRawText(); + _logger.LogInformation("Function tool: {Name}, Description: {Desc}, Schema: {Schema}", + tool.Name, Truncate(tool.Description ?? "", 80), Truncate(schemaStr, 200)); + } + } + + var cuaAcknowledged = false; + for (var i = 0; i < _maxIterations; i++) + { + cancellationToken.ThrowIfCancellationRequested(); + + // When running without the computer tool, strip past CUA-only turns from the history. + // Azure OpenAI 400s when an item references a tool that isn't declared in this turn: + // - `computer_call` / `computer_call_output` need `computer` or `computer_use_preview` + // - `function_call` / `function_call_output` for OnTaskComplete or EndSession need + // those CUA-only function tools declared (we strip them in non-CUA modelTools). + // Two-pass: first identify the call_ids of CUA-only function_calls so we can also + // drop their paired function_call_outputs (which carry only call_id, not the name). + // session.ConversationHistory itself is left intact so a later CUA turn still sees + // the full record. 
+ var conversation = session.ConversationHistory; + if (!includeCuaTool) + { + var cuaOnlyCallIds = new HashSet(StringComparer.Ordinal); + foreach (var item in session.ConversationHistory) + { + if (!item.TryGetProperty("type", out var typeProp)) continue; + if (typeProp.GetString() != "function_call") continue; + if (!item.TryGetProperty("name", out var nameProp)) continue; + var name = nameProp.GetString(); + if (name != "OnTaskComplete" && name != "EndSession") continue; + if (item.TryGetProperty("call_id", out var idProp)) + { + var id = idProp.GetString(); + if (!string.IsNullOrEmpty(id)) cuaOnlyCallIds.Add(id); + } + } + + conversation = session.ConversationHistory + .Where(item => + { + if (!item.TryGetProperty("type", out var typeProp)) + { + return true; + } + var type = typeProp.GetString(); + if (type == "computer_call" || type == "computer_call_output") + { + return false; + } + if (type == "function_call" || type == "function_call_output") + { + if (item.TryGetProperty("call_id", out var idProp) + && cuaOnlyCallIds.Contains(idProp.GetString() ?? 
string.Empty)) + { + return false; + } + } + return true; + }) + .ToList(); + } + + var response = await CallModelAsync(conversation, modelTools, cancellationToken); + if (response?.Output == null || response.Output.Count == 0) + break; + + var hasActions = false; + + foreach (var item in response.Output) + { + var type = item.GetProperty("type").GetString(); + if (type == "reasoning") continue; + + session.ConversationHistory.Add(item); + + switch (type) + { + case "message": + return ExtractText(item); + + case "computer_call": + hasActions = true; + // Lazy session start: only start when CUA is actually needed + if (!cuaAcknowledged) + { + if (!session.SessionStarted) + { + _logger.LogInformation("CUA needed for conversation {ConversationId} — starting session", conversationId); + if (onCuaStarting != null) + await onCuaStarting(true); + onStatusUpdate?.Invoke("Starting W365 computing session..."); + session.SessionStarted = true; + _logger.LogInformation("Session marked started for conversation {ConversationId}", conversationId); + } + else if (onCuaStarting != null) + { + await onCuaStarting(false); + } + + cuaAcknowledged = true; + } + + _logger.LogInformation("CUA iteration {Iteration}: {Action}", i + 1, Truncate(item.GetRawText(), 200)); + try + { + session.ConversationHistory.Add(await HandleComputerCallAsync(item, w365Tools, mcpClient, session, graphAccessToken, onStatusUpdate, onFolderLinkReady, cancellationToken)); + } + catch (InvalidOperationException toolEx) + { + _logger.LogError(toolEx, "Tool call in CUA iteration failed for conversation {ConversationId}", conversationId); + // Pop the unpaired computer_call we added above so the next turn's conversation + // history isn't malformed (Azure OpenAI 400s on "No tool output found for …"). 
+ if (session.ConversationHistory.Count > 0) + { + session.ConversationHistory.RemoveAt(session.ConversationHistory.Count - 1); + } + return toolEx.Message; + } + break; + + case "function_call": + hasActions = true; + var funcName = item.GetProperty("name").GetString(); + _logger.LogInformation("CUA iteration {Iteration}: function_call {Name}", i + 1, funcName); + if (funcName == "OnTaskComplete") + { + session.ConversationHistory.Add(CreateFunctionOutput(item.GetProperty("call_id").GetString()!)); + return "Task completed successfully."; + } + if (funcName == "EndSession") + { + session.ConversationHistory.Add(CreateFunctionOutput(item.GetProperty("call_id").GetString()!)); + _logger.LogInformation("EndSession requested by model for conversation {ConversationId}", conversationId); + onStatusUpdate?.Invoke("Ending session..."); + + // Always delegate to ATG regardless of session.SessionStarted: in V2 the + // session can be acquired by ATG's hostname-discovery when the sample agent + // calls tools/list at startup — before any computer_call flips SessionStarted. + // Gating on SessionStarted would leak the pool slot. ATG's handler is + // idempotent and returns "No active W365 session found." when there's nothing + // to release, so this is safe on fresh conversations too. + await EndSessionAsync(w365Tools, _logger, session.W365SessionId, cancellationToken); + session.SessionStarted = false; + session.W365SessionId = null; + session.ScreenshotSubfolder = null; + _sessions.TryRemove(conversationId, out _); + return "Session ended. The VM has been released back to the pool."; + } + + // Invoke additional MCP function tool + if (additionalTools != null) + { + var callResult = await InvokeFunctionCallAsync(item, additionalTools, cancellationToken); + session.ConversationHistory.Add(callResult); + } + + break; + } + } + + if (!hasActions) break; + } + + return "The task could not be completed within the allowed number of steps."; + } + + /// + /// End the W365 session. 
Called by the agent on shutdown or explicit end. + /// + public static async Task EndSessionAsync(IList tools, ILogger logger, string? sessionId, CancellationToken ct) + { + try + { + // The ATG-local mcptool resolves the active W365 session via the v2: context key + // — no sessionId arg needed (session routing is transparent). + var args = new Dictionary(); + await InvokeToolAsync(tools, "mcp_W365ComputerUse_EndSession", args, ct); + logger.LogInformation("W365 session ended"); + } + catch (ObjectDisposedException) + { + logger.LogInformation("MCP client already disposed — W365 session will be released by server timeout"); + } + catch (HttpRequestException httpEx) when (httpEx.StatusCode == System.Net.HttpStatusCode.NotFound) + { + logger.LogInformation("MCP transport session expired (404) — W365 session will be released by server timeout"); + } + catch (Exception ex) + { + logger.LogWarning(ex, "Failed to end W365 session"); + } + } + + /// + /// End all active sessions on shutdown. + /// + public async Task EndSessionOnShutdownAsync() + { + if (_cachedTools == null) + { + _logger.LogInformation("No tools cached — nothing to clean up on shutdown"); + return; + } + + foreach (var (convId, session) in _sessions) + { + _logger.LogInformation("Ending session for conversation {ConversationId}, W365SessionId={SessionId}", convId, session.W365SessionId); + await EndSessionAsync(_cachedTools, _logger, session.W365SessionId, CancellationToken.None); + } + + // Even when _sessions is empty, ATG may hold a V2 session acquired by tools/list at + // startup. Invoke EndSession once unconditionally — the handler is idempotent and + // returns a no-op response when there's nothing to release. 
+ if (_sessions.IsEmpty) + { + _logger.LogInformation("No per-conversation sessions tracked; invoking EndSession to clean up any startup-acquired session"); + await EndSessionAsync(_cachedTools, _logger, sessionId: null, CancellationToken.None); + } + + _sessions.Clear(); + _cachedTools = null; + + foreach (var client in _allMcpClients) + { + try { await client.DisposeAsync(); } + catch (Exception ex) { _logger.LogWarning(ex, "Failed to dispose MCP client"); } + } + + _allMcpClients.Clear(); + _cachedMcpClient = null; + } + + /// + /// Get or create MCP clients and merged tool list. Connects to each server URL once on first call, + /// then returns the cached result on subsequent calls. The SSE connections stay alive across + /// messages (MyAgent is transient, but this orchestrator is singleton). + /// The primary MCP client (for W365 screenshot calls) is the one whose tools match the W365 CUA set (). + /// + public async Task<(IList Tools, IMcpClient? Client)> GetOrCreateMcpConnectionAsync( + IList mcpUrls, string accessToken) + { + // Only reuse the cache if it actually contains W365 CUA tools. If the previous attempt landed + // on ATG's synthetic "Error" tool (W365 session acquisition failed upstream), a subsequent + // message should re-hit ATG rather than silently reuse the failure. Otherwise the sample + // agent becomes permanently wedged on a single bad connect. + if (_cachedTools != null && _cachedTools.Any(t => IsW365CuaTool((t as AIFunction)?.Name))) + return (_cachedTools, _cachedMcpClient); + + // Stale caches from a failed prior attempt — clear before reconnecting. + if (_cachedTools != null) + { + _cachedTools = null; + _cachedMcpClient = null; + } + + var allTools = await LoadToolsFromUrlsAsync(mcpUrls, accessToken); + + // Only cache when we got at least one W365 CUA tool. Otherwise leave _cachedTools null so + // the next message retries the MCP connection and gives ATG another shot at session acquisition. 
+ var cachingIsSafe = allTools.Any(t => IsW365CuaTool((t as AIFunction)?.Name)); + if (cachingIsSafe) + { + _cachedTools = allTools; + } + else + { + _logger.LogWarning("Not caching MCP tool list — no V2 CUA tools found (total {Count}). Next message will reconnect.", allTools.Count); + } + + _logger.LogInformation("Total tools from {ServerCount} MCP server(s): {ToolCount}", mcpUrls.Count, allTools.Count); + return (allTools, _cachedMcpClient); + } + + /// + /// Connects to non-W365 MCP servers (mail, calendar, etc.) and returns their merged tool + /// list. Unlike , this path caches unconditionally + /// because non-W365 servers don't have the "Error-tool-after-failed-session-acquisition" + /// problem — if mail/calendar can't load, we just proceed without those tools. + /// + public async Task> GetOrCreateNonW365McpConnectionAsync( + IList mcpUrls, string accessToken) + { + if (_cachedNonW365Tools != null) + { + return _cachedNonW365Tools; + } + + if (mcpUrls.Count == 0) + { + _cachedNonW365Tools = []; + return _cachedNonW365Tools; + } + + var tools = await LoadToolsFromUrlsAsync(mcpUrls, accessToken); + _cachedNonW365Tools = tools; + _logger.LogInformation("Loaded {Count} non-W365 MCP tools from {ServerCount} server(s).", tools.Count, mcpUrls.Count); + return tools; + } + + /// + /// Opens SSE connections to each MCP server URL and merges their tools/list responses. + /// Side effects: adds each connected client to for shutdown + /// cleanup, and sets to the first client whose tools match + /// the W365 CUA set (used for direct screenshot calls in the CUA loop). + /// + private async Task> LoadToolsFromUrlsAsync(IList mcpUrls, string accessToken) + { + var allTools = new List(); + + foreach (var url in mcpUrls) + { + try + { + // Each MCP server needs its own HttpClient — the auto-detect transport + // manages internal state that conflicts when shared across connections. 
+ var httpClient = _httpClientFactory.CreateClient("McpConnection"); + httpClient.DefaultRequestHeaders.Authorization = + new System.Net.Http.Headers.AuthenticationHeaderValue("Bearer", accessToken); + + var transport = new SseClientTransport(new SseClientTransportOptions + { + Endpoint = new Uri(url), + TransportMode = HttpTransportMode.AutoDetect, + }, httpClient); + + var client = await McpClientFactory.CreateAsync(transport); + var tools = (await client.ListToolsAsync()).Cast().ToList(); + + _allMcpClients.Add(client); + allTools.AddRange(tools); + + // Use the W365 server's client for direct screenshot calls. + var hasW365Tools = tools.Any(t => IsW365CuaTool((t as AIFunction)?.Name)); + if (hasW365Tools) + _cachedMcpClient = client; + + _logger.LogInformation("Connected to MCP server at {Url}, loaded {Count} tools: {Names}", + url, tools.Count, string.Join(", ", tools.Select(t => (t as AIFunction)?.Name ?? "?"))); + } + catch (Exception ex) + { + _logger.LogError(ex, "Failed to connect to MCP server at {Url}. 
Skipping.", url); + } + } + + // Fallback: use first client if no W365 server found + _cachedMcpClient ??= _allMcpClients.FirstOrDefault(); + + return allTools; + } + + private async Task CallModelAsync(List conversation, List tools, CancellationToken ct) + { + var body = JsonSerializer.Serialize(new ComputerUseRequest + { + Model = _modelProvider.ModelName, + Instructions = SystemInstructions, + Input = conversation, + Tools = tools, + Truncation = "auto" + }, new JsonSerializerOptions { DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull }); + + _logger.LogDebug("Model request (first 2000 chars): {Body}", body[..Math.Min(2000, body.Length)]); + + var responseJson = await _modelProvider.SendAsync(body, ct); + _logger.LogDebug("Model response (first 2000 chars): {Response}", responseJson[..Math.Min(2000, responseJson.Length)]); + return JsonSerializer.Deserialize(responseJson); + } + + /// + /// Translate a computer_call into an MCP tool call, capture screenshot, return computer_call_output. + /// + private async Task HandleComputerCallAsync( + JsonElement call, IList tools, IMcpClient? mcpClient, ConversationSession session, string? graphAccessToken, Action? onStatus, Func? onFolderLinkReady, CancellationToken ct) + { + var callId = call.GetProperty("call_id").GetString()!; + var sessionId = session.W365SessionId; + + // GPT-5.4 uses "actions" (non-empty array), older models use "action" (singular). 
+ if (call.TryGetProperty("actions", out var actionsArray) + && actionsArray.ValueKind == JsonValueKind.Array + && actionsArray.GetArrayLength() > 0) + { + foreach (var action in actionsArray.EnumerateArray()) + { + var actionType = action.GetProperty("type").GetString()!; + onStatus?.Invoke($"Performing: {actionType}..."); + + if (actionType != "screenshot") + { + var (toolName, args) = MapActionToMcpTool(actionType, action, sessionId); + var (result, sessionLost) = await InvokeToolCheckSessionAsync(tools, toolName, args, ct); + if (sessionLost) + { + onStatus?.Invoke("Session lost — recovering..."); + await RecoverSessionAsync(session, tools, _logger, ct); + // Re-invoke with the same args; ATG re-acquires the session transparently + // on this retry via the hostname-discovery handler. + await InvokeToolThrowOnErrorAsync(tools, toolName, args, ct); + } + else if (TryExtractToolError(result?.ToString(), out var errorText)) + { + // Surface tool errors to the bot reply rather than silently continuing with + // a no-op result. The model can otherwise loop or end with "No text was streamed". + throw new InvalidOperationException($"Error calling tool '{toolName}': {errorText}"); + } + } + } + } + else if (call.TryGetProperty("action", out var singleAction)) + { + var actionType = singleAction.GetProperty("type").GetString()!; + onStatus?.Invoke($"Performing: {actionType}..."); + + if (actionType != "screenshot") + { + var (toolName, args) = MapActionToMcpTool(actionType, singleAction, sessionId); + var (result, sessionLost) = await InvokeToolCheckSessionAsync(tools, toolName, args, ct); + if (sessionLost) + { + onStatus?.Invoke("Session lost — recovering..."); + await RecoverSessionAsync(session, tools, _logger, ct); + // Re-invoke with the same args; ATG re-acquires the session transparently + // on this retry via the hostname-discovery handler. 
+ await InvokeToolThrowOnErrorAsync(tools, toolName, args, ct); + } + else if (TryExtractToolError(result?.ToString(), out var errorText)) + { + throw new InvalidOperationException($"Error calling tool '{toolName}': {errorText}"); + } + } + } + + // Always capture screenshot after action + var screenshot = await CaptureScreenshotAsync(tools, mcpClient, sessionId, ct); + + var stepName = $"{++session.ScreenshotCounter:D3}_step"; + SaveScreenshotToDisk(screenshot!, stepName, session.ScreenshotSubfolder); + var folderUrl = await UploadScreenshotToOneDriveAsync(screenshot!, $"{stepName}.png", graphAccessToken, session.ScreenshotSubfolder, session); + if (folderUrl != null && onFolderLinkReady != null) + await onFolderLinkReady(folderUrl); + + var safetyChecks = call.TryGetProperty("pending_safety_checks", out var sc) + ? sc : JsonSerializer.Deserialize("[]"); + + // "computer" tool type (gpt-5.4+) doesn't support acknowledged_safety_checks + if (_toolType == "computer") + { + return ToJsonElement(new + { + type = "computer_call_output", + call_id = callId, + output = new { type = "computer_screenshot", image_url = $"data:image/png;base64,{screenshot}" } + }); + } + + return ToJsonElement(new + { + type = "computer_call_output", + call_id = callId, + acknowledged_safety_checks = safetyChecks, + output = new { type = "computer_screenshot", image_url = $"data:image/png;base64,{screenshot}" } + }); + } + +
    /// <summary>
    /// Map OpenAI computer_call action types to W365 V2 MCP tool names and arguments.
    /// V2 tool names/schemas come from the in-VM MCP server (as returned by its tools/list).
    /// No sessionId arg is passed — V2 session routing is handled transparently by ATG's
    /// hostname-discovery handler via the x-ms-computerId header.
    /// </summary>
    /// <param name="actionType">The action's <c>type</c> value; matched case-insensitively.</param>
    /// <param name="action">The action object emitted by the model; the properties read depend on the action type.</param>
    /// <param name="sessionId">Not used by the mapping; kept so existing call sites stay unchanged.</param>
    /// <returns>The MCP tool name and the argument dictionary to invoke it with.</returns>
    /// <exception cref="NotSupportedException"><paramref name="actionType"/> has no mapping.</exception>
    /// <exception cref="InvalidOperationException">A <c>drag</c> action carries an empty <c>path</c>, or a property has an unexpected JSON kind.</exception>
    /// <exception cref="KeyNotFoundException">A property required by the action type is missing.</exception>
    private static (string ToolName, Dictionary<string, object?> Args) MapActionToMcpTool(string actionType, JsonElement action, string? sessionId)
    {
        // CUA model emits button names in lowercase ("left"/"right"); V2 click accepts PascalCase enum values.
        static string NormalizeButton(string? button) => string.IsNullOrEmpty(button)
            ? "Left"
            : char.ToUpperInvariant(button[0]) + button.Substring(1).ToLowerInvariant();

        // A drag is reduced to its first and last path points. The path is validated up front so
        // an empty array fails with a descriptive InvalidOperationException instead of an
        // IndexOutOfRangeException from the indexer.
        static (string ToolName, Dictionary<string, object?> Args) MapDrag(JsonElement dragAction)
        {
            var path = dragAction.GetProperty("path");
            var pointCount = path.GetArrayLength();
            if (pointCount == 0)
            {
                throw new InvalidOperationException("Drag action requires a non-empty path.");
            }

            var first = path[0];
            var last = path[pointCount - 1];
            return ("drag_mouse", new Dictionary<string, object?>
            {
                ["startX"] = first.GetProperty("x").GetInt32(),
                ["startY"] = first.GetProperty("y").GetInt32(),
                ["endX"] = last.GetProperty("x").GetInt32(),
                ["endY"] = last.GetProperty("y").GetInt32(),
                ["button"] = "Left"
            });
        }

        return actionType.ToLowerInvariant() switch
        {
            "click" => ("click", new Dictionary<string, object?>
            {
                ["x"] = action.GetProperty("x").GetInt32(),
                ["y"] = action.GetProperty("y").GetInt32(),
                ["button"] = NormalizeButton(action.TryGetProperty("button", out var b) ? b.GetString() : null),
                ["clickCount"] = 1
            }),
            "double_click" => ("click", new Dictionary<string, object?>
            {
                ["x"] = action.GetProperty("x").GetInt32(),
                ["y"] = action.GetProperty("y").GetInt32(),
                ["button"] = "Left",
                ["clickCount"] = 2
            }),
            "type" => ("type_text", new Dictionary<string, object?>
            {
                ["text"] = action.GetProperty("text").GetString()
            }),
            "key" or "keys" or "keypress" => ("press_keys", new Dictionary<string, object?>
            {
                // Provisional (2026-04-22): the model emits uppercase key names like "CTRL"/"ESC",
                // which W365's press_keys tool appeared to reject, so key names are lower-cased
                // here. TODO: confirm the expected key-name format with the W365 tool owners and
                // either keep this or normalize differently.
                ["keys"] = ExtractKeys(action).Select(k => k.ToLowerInvariant()).ToArray()
            }),
            "scroll" => ("scroll", new Dictionary<string, object?>
            {
                ["x"] = action.GetProperty("x").GetInt32(),
                ["y"] = action.GetProperty("y").GetInt32(),
                ["deltaX"] = action.TryGetProperty("scroll_x", out var sx) ? sx.GetInt32() : 0,
                ["deltaY"] = action.TryGetProperty("scroll_y", out var sy) ? sy.GetInt32() : 0
            }),
            "move" => ("move_mouse", new Dictionary<string, object?>
            {
                ["x"] = action.GetProperty("x").GetInt32(),
                ["y"] = action.GetProperty("y").GetInt32()
            }),
            "drag" => MapDrag(action),
            "wait" => ("wait_milliseconds", new Dictionary<string, object?>
            {
                // Falls back to 500 ms when the action carries no explicit duration.
                ["ms"] = action.TryGetProperty("ms", out var ms) ? ms.GetInt32() : 500
            }),
            "open_url" => ("browser_navigate", new Dictionary<string, object?>
            {
                ["url"] = action.GetProperty("url").GetString()
            }),
            _ => throw new NotSupportedException($"Unsupported action: {actionType}")
        };
    }
+ + private async Task CaptureScreenshotAsync(IList tools, IMcpClient? mcpClient, string? sessionId, CancellationToken ct) + { + // V2 take_screenshot takes optional crop args; empty dictionary = full screen. + // No sessionId — V2 session routing is handled by ATG's hostname-discovery handler. + var screenshotArgs = new Dictionary(); + + // Use direct MCP client when available — AIFunction wrappers drop image content blocks + if (mcpClient != null) + { + var result = await mcpClient.CallToolAsync("take_screenshot", screenshotArgs, cancellationToken: ct); + + // Log full raw content on entry so we can diagnose new/unexpected shapes. + string? rawResultJson = null; + try + { + rawResultJson = JsonSerializer.Serialize(result); + _logger.LogDebug("take_screenshot returned {Count} content blocks.
Raw JSON (truncated): {Raw}", + result.Content.Count, rawResultJson[..Math.Min(2000, rawResultJson.Length)]); + } + catch (Exception logEx) + { + _logger.LogWarning(logEx, "Failed to serialize take_screenshot response for logging."); + } + + // Detect MCP error responses and surface the real reason (e.g. "no pool with an available + // session was found") instead of falling through to the image-extractor and reporting the + // misleading "no extractable image data" message. + if (!string.IsNullOrEmpty(rawResultJson) && TryExtractToolError(rawResultJson, out var toolErrorText)) + { + throw new InvalidOperationException($"Error calling tool 'take_screenshot': {toolErrorText}"); + } + + foreach (var item in result.Content) + { + _logger.LogDebug("Screenshot content block: Type={Type}, DataLen={DataLen}, TextLen={TextLen}, MimeType={Mime}", + item.Type, item.Data?.Length ?? 0, item.Text?.Length ?? 0, item.MimeType); + + if (item.Type == "image" && !string.IsNullOrEmpty(item.Data)) + return item.Data; + if (!string.IsNullOrEmpty(item.Data)) + return item.Data; + if (item.Type == "text" && !string.IsNullOrEmpty(item.Text)) + { + var nested = ExtractBase64FromText(item.Text); + if (!string.IsNullOrEmpty(nested)) return nested; + } + } + + // Fallback: serialize to JSON and hunt for any base64-looking PNG payload in string fields. + // Covers MCP "resource" blocks (blob), embedded data URLs, or other unexpected shapes. + try + { + var rawJson = JsonSerializer.Serialize(result.Content); + var extracted = ExtractPngBase64FromJson(rawJson); + if (!string.IsNullOrEmpty(extracted)) + { + _logger.LogInformation("Extracted PNG base64 from take_screenshot via JSON scan ({Length} chars).", extracted.Length); + return extracted; + } + } + catch (Exception scanEx) + { + _logger.LogWarning(scanEx, "JSON-scan fallback for take_screenshot threw."); + } + + // Per-block warning with as much detail as possible before we give up. 
+ foreach (var item in result.Content) + _logger.LogWarning("Unhandled screenshot block: Type={Type}, MimeType={Mime}, DataLen={DataLen}, TextLen={TextLen}, TextPreview={Preview}", + item.Type, item.MimeType, item.Data?.Length ?? 0, item.Text?.Length ?? 0, + item.Text?[..Math.Min(200, item.Text.Length)]); + + throw new InvalidOperationException($"Screenshot MCP response had {result.Content.Count} content blocks but no extractable image data. See preceding log lines for the raw shape."); + } + + // Fallback: AIFunction wrapper (may lose image content) + var aiResult = await InvokeToolAsync(tools, "take_screenshot", screenshotArgs, ct); + var str = aiResult?.ToString() ?? ""; + + _logger.LogInformation("Screenshot fallback: result type={Type}, length={Length}, preview={Preview}", + aiResult?.GetType().Name ?? "null", str.Length, str[..Math.Min(200, str.Length)]); + + try + { + using var doc = JsonDocument.Parse(str); + var root = doc.RootElement; + if (root.TryGetProperty("screenshotData", out var sd)) return sd.GetString() ?? ""; + if (root.TryGetProperty("image", out var img)) return img.GetString() ?? ""; + if (root.TryGetProperty("data", out var d)) return d.GetString() ?? 
""; + + // Try nested content array (SDK gateway format) + if (root.TryGetProperty("content", out var content) && content.ValueKind == JsonValueKind.Array) + { + foreach (var block in content.EnumerateArray()) + { + if (block.TryGetProperty("data", out var blockData)) + { + var data = blockData.GetString(); + if (!string.IsNullOrEmpty(data)) return data; + } + if (block.TryGetProperty("text", out var blockText)) + { + var extracted = ExtractBase64FromText(blockText.GetString()); + if (!string.IsNullOrEmpty(extracted)) return extracted; + } + } + } + } + catch (JsonException) { } + + // Last resort: if it looks like raw base64 (long string, no JSON), use it directly + if (str.Length > 1000 && !str.StartsWith("{") && !str.StartsWith("[")) + return str; + + throw new InvalidOperationException($"Failed to extract screenshot. Response length: {str.Length}"); + } + + private static string? ExtractBase64FromText(string? text) + { + if (string.IsNullOrEmpty(text)) return null; + try + { + using var doc = JsonDocument.Parse(text); + var root = doc.RootElement; + if (root.TryGetProperty("screenshotData", out var sd)) return sd.GetString(); + if (root.TryGetProperty("image", out var img)) return img.GetString(); + if (root.TryGetProperty("data", out var d)) return d.GetString(); + } + catch (JsonException) { } + return null; + } + + /// + /// Last-resort extractor that walks arbitrary JSON for the first string value that looks + /// like a PNG base64 payload. Checks for the PNG magic prefix (iVBORw0KGgo) or a + /// data:image/png;base64, data URL inside any string field. Used when take_screenshot + /// returns a content block shape we don't explicitly handle (e.g. MCP resource.blob). + /// + private static string? ExtractPngBase64FromJson(string json) + { + if (string.IsNullOrEmpty(json)) return null; + try + { + using var doc = JsonDocument.Parse(json); + return Walk(doc.RootElement); + } + catch (JsonException) { return null; } + + static string? 
Walk(JsonElement el) + { + switch (el.ValueKind) + { + case JsonValueKind.Object: + foreach (var prop in el.EnumerateObject()) + { + var found = Walk(prop.Value); + if (!string.IsNullOrEmpty(found)) return found; + } + break; + case JsonValueKind.Array: + foreach (var item in el.EnumerateArray()) + { + var found = Walk(item); + if (!string.IsNullOrEmpty(found)) return found; + } + break; + case JsonValueKind.String: + var s = el.GetString(); + if (string.IsNullOrEmpty(s)) break; + // Data URL form: strip the prefix, return the base64 body. + var dataPrefix = "data:image/png;base64,"; + var dataIdx = s.IndexOf(dataPrefix, StringComparison.OrdinalIgnoreCase); + if (dataIdx >= 0) return s.Substring(dataIdx + dataPrefix.Length); + // Raw PNG base64 — starts with the PNG magic encoded as "iVBORw0KGgo". + if (s.Length >= 16 && s.StartsWith("iVBORw0KGgo", StringComparison.Ordinal)) return s; + break; + } + return null; + } + } + + internal static async Task InvokeToolAsync( + IList tools, string name, Dictionary args, CancellationToken ct) + { + var tool = tools.OfType().FirstOrDefault(t => t.Name.Equals(name, StringComparison.OrdinalIgnoreCase)) + ?? throw new InvalidOperationException($"Tool '{name}' not found."); + return await tool.InvokeAsync(new AIFunctionArguments(args), ct); + } + + /// + /// Invokes a tool and throws if the MCP result reports + /// isError: true. The exception message format is "Error calling tool '{name}': {detail}" + /// so the CUA loop can bubble a readable reason up to the bot reply instead of silently proceeding + /// with a bad state. 
+ /// + internal static async Task InvokeToolThrowOnErrorAsync( + IList tools, string name, Dictionary args, CancellationToken ct) + { + var result = await InvokeToolAsync(tools, name, args, ct); + if (TryExtractToolError(result?.ToString(), out var errorText)) + { + throw new InvalidOperationException($"Error calling tool '{name}': {errorText}"); + } + + return result; + } + + /// + /// Parses an MCP CallToolResult-shaped JSON payload and extracts the error text when + /// isError is true. Returns true if an error was found, false otherwise. + /// + private static bool TryExtractToolError(string? response, out string message) + { + message = string.Empty; + if (string.IsNullOrEmpty(response)) return false; + try + { + using var doc = JsonDocument.Parse(response); + if (!doc.RootElement.TryGetProperty("isError", out var isErr) || isErr.ValueKind != JsonValueKind.True) + { + return false; + } + + if (doc.RootElement.TryGetProperty("content", out var content) && content.ValueKind == JsonValueKind.Array) + { + foreach (var block in content.EnumerateArray()) + { + if (block.TryGetProperty("text", out var text)) + { + message = text.GetString() ?? "(unknown error)"; + return true; + } + } + } + + message = "(unknown error)"; + return true; + } + catch (JsonException) + { + return false; + } + } + + /// + /// Invoke a tool and detect session-not-found errors. Returns (result, isSessionLost). + /// + private static async Task<(object? Result, bool IsSessionLost)> InvokeToolCheckSessionAsync( + IList tools, string name, Dictionary args, CancellationToken ct) + { + var result = await InvokeToolAsync(tools, name, args, ct); + var resultStr = result?.ToString() ?? ""; + if (IsSessionNotFoundError(resultStr)) + return (result, true); + return (result, false); + } + + /// + /// Check if a tool response indicates the session is no longer valid. 
+ /// + private static bool IsSessionNotFoundError(string response) + { + if (string.IsNullOrEmpty(response)) return false; + var lower = response.ToLowerInvariant(); + return lower.Contains("no active session found") || + lower.Contains("session not found") || + lower.Contains("session expired") || + lower.Contains("session has been terminated"); + } + + /// + /// Recover from a lost session: release the stale session (best-effort) and reset the + /// session-state flags so the next MCP tool call triggers a fresh checkout via ATG's + /// hostname-discovery handler. There is no explicit "start" step in V2 — the session + /// is acquired transparently when the orchestrator's next computer_call goes through. + /// + private async Task RecoverSessionAsync( + ConversationSession session, IList tools, ILogger logger, CancellationToken ct) + { + logger.LogWarning("Session lost. Recovering — releasing stale session; ATG will re-acquire on next MCP call."); + + try + { + await EndSessionAsync(tools, logger, session.W365SessionId, ct); + } + catch (Exception ex) + { + logger.LogWarning(ex, "Best-effort EndSession during recovery failed"); + } + + session.W365SessionId = null; + session.SessionStarted = false; + session.ScreenshotSubfolder = null; + logger.LogInformation("Session state cleared; awaiting transparent re-acquisition."); + } + + private static string[] ExtractKeys(JsonElement action) + { + if (action.TryGetProperty("keys", out var k)) + { + if (k.ValueKind == JsonValueKind.Array) + return k.EnumerateArray().Select(e => e.GetString() ?? "").ToArray(); + if (k.ValueKind == JsonValueKind.String) + return [k.GetString() ?? ""]; + } + if (action.TryGetProperty("key", out var single) && single.ValueKind == JsonValueKind.String) + return [single.GetString() ?? 
""]; + return []; + } + + private static string ExtractText(JsonElement msg) + { + if (msg.TryGetProperty("content", out var c) && c.ValueKind == JsonValueKind.Array) + foreach (var item in c.EnumerateArray()) + if (item.TryGetProperty("text", out var t)) + return t.GetString() ?? ""; + return ""; + } + + private static JsonElement CreateUserMessage(string text) => ToJsonElement(new + { + type = "message", role = "user", + content = new[] { new { type = "input_text", text } } + }); + + private static JsonElement CreateFunctionOutput(string callId, string output = "success") => ToJsonElement(new + { + type = "function_call_output", call_id = callId, output + }); + + /// + /// Invoke an MCP function tool from a model function_call and return the function_call_output. + /// + private async Task InvokeFunctionCallAsync(JsonElement functionCall, IList tools, CancellationToken ct) + { + var callId = functionCall.GetProperty("call_id").GetString()!; + var name = functionCall.GetProperty("name").GetString()!; + var argsStr = functionCall.GetProperty("arguments").GetString() ?? "{}"; + + _logger.LogInformation("Function call {Name} invoked. call_id={CallId}, args={Args}", + name, callId, Truncate(argsStr, 1000)); + + try + { + var args = JsonSerializer.Deserialize>(argsStr) ?? []; + var result = await InvokeToolAsync(tools, name, args, ct); + var resultStr = result?.ToString() ?? "success"; + _logger.LogInformation("Function call {Name} returned ({Length} chars): {Result}", + name, resultStr.Length, Truncate(resultStr, 2000)); + return CreateFunctionOutput(callId, resultStr); + } + catch (Exception ex) + { + _logger.LogError(ex, "Function call {Name} threw. call_id={CallId}", name, callId); + return CreateFunctionOutput(callId, $"Error: {ex.Message}"); + } + } + + private static JsonElement ToJsonElement(object obj) => + JsonSerializer.Deserialize(JsonSerializer.Serialize(obj)); + + private static string Truncate(string v, int max) => v.Length <= max ? 
v : v[..max] + "..."; + + /// + /// Convert a user-supplied string into a filesystem-safe slug for a folder name. + /// Letters and digits are kept; everything else collapses into single underscores. + /// Trailing underscores are trimmed, the result is lower-cased, and trimmed to + /// characters. Empty/whitespace inputs yield "untitled". + /// + private static string SanitizeForPath(string? input, int maxLen) + { + if (string.IsNullOrWhiteSpace(input)) return "untitled"; + var sb = new System.Text.StringBuilder(maxLen); + foreach (var c in input) + { + if (sb.Length >= maxLen) break; + if (char.IsLetterOrDigit(c)) sb.Append(char.ToLowerInvariant(c)); + else if (sb.Length > 0 && sb[sb.Length - 1] != '_') sb.Append('_'); + } + while (sb.Length > 0 && sb[sb.Length - 1] == '_') sb.Length--; + return sb.Length == 0 ? "untitled" : sb.ToString(); + } + + private void SaveScreenshotToDisk(string base64Data, string name, string? subfolder = null) + { + if (string.IsNullOrEmpty(base64Data) || string.IsNullOrEmpty(_screenshotPath)) return; + try + { + // Match the OneDrive folder layout — per-session subfolder under ./Screenshots so + // counters from concurrent or sequential conversations don't clobber each other. + var dir = string.IsNullOrEmpty(subfolder) + ? _screenshotPath + : Path.Combine(_screenshotPath, subfolder); + Directory.CreateDirectory(dir); + var path = Path.Combine(dir, $"{name}.png"); + File.WriteAllBytes(path, Convert.FromBase64String(base64Data)); + _logger.LogInformation("Screenshot saved: {Path}", path); + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Failed to save screenshot"); + } + } + + /// + /// Upload a screenshot to the user's OneDrive via Microsoft Graph. + /// Requires a Graph access token with Files.ReadWrite scope. + /// Files are uploaded to /CUA-Sessions/{date}/ folder. + /// + private async Task UploadScreenshotToOneDriveAsync(string base64Data, string fileName, string? graphAccessToken, string? 
subfolder, ConversationSession session) + { + if (string.IsNullOrEmpty(graphAccessToken)) + { + _logger.LogDebug("OneDrive upload skipped: no Graph token"); + return null; + } + if (string.IsNullOrEmpty(base64Data)) + { + _logger.LogDebug("OneDrive upload skipped: no screenshot data"); + return null; + } + if (string.IsNullOrEmpty(_oneDriveFolder)) + { + _logger.LogDebug("OneDrive upload skipped: OneDriveFolder not configured"); + return null; + } + + try + { + // Use /me/drive for token owner, or /users/{id}/drive for a specific user + var driveBase = string.IsNullOrEmpty(_oneDriveUserId) + ? "https://graph.microsoft.com/v1.0/me/drive" + : $"https://graph.microsoft.com/v1.0/users/{_oneDriveUserId}/drive"; + var folderPath = string.IsNullOrEmpty(subfolder) + ? _oneDriveFolder.TrimStart('/') + : $"{_oneDriveFolder.TrimStart('/')}/{subfolder}"; + var url = $"{driveBase}/root:/{folderPath}/{fileName}:/content"; + + using var request = new HttpRequestMessage(HttpMethod.Put, url); + request.Headers.Authorization = new AuthenticationHeaderValue("Bearer", graphAccessToken); + request.Content = new ByteArrayContent(Convert.FromBase64String(base64Data)); + request.Content.Headers.ContentType = new System.Net.Http.Headers.MediaTypeHeaderValue("image/png"); + + var response = await _httpClient.SendAsync(request); + if (response.IsSuccessStatusCode) + { + _logger.LogInformation("Screenshot uploaded to OneDrive: {Folder}/{FileName}", folderPath, fileName); + + // On first upload, create an org-scoped sharing link for the folder + if (!session.FolderShared) + { + var shareUrl = await ShareConversationFolderAsync(folderPath, graphAccessToken); + if (shareUrl != null) + { + session.FolderShared = true; + return shareUrl; + } + } + } + else + { + var content = await response.Content.ReadAsStringAsync(); + _logger.LogWarning("OneDrive upload failed: {Status} {Content}", response.StatusCode, content); + } + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Failed to upload 
screenshot to OneDrive"); + } + + return null; + } + + /// + /// Create an organization-scoped sharing link for the conversation's screenshot folder. + /// Returns the web URL that anyone in the org can use to view the folder. + /// + private async Task ShareConversationFolderAsync(string folderPath, string graphAccessToken) + { + try + { + var driveBase = string.IsNullOrEmpty(_oneDriveUserId) + ? "https://graph.microsoft.com/v1.0/me/drive" + : $"https://graph.microsoft.com/v1.0/users/{_oneDriveUserId}/drive"; + + using var getRequest = new HttpRequestMessage(HttpMethod.Get, $"{driveBase}/root:/{folderPath}"); + getRequest.Headers.Authorization = new AuthenticationHeaderValue("Bearer", graphAccessToken); + var getResponse = await _httpClient.SendAsync(getRequest); + + if (!getResponse.IsSuccessStatusCode) + { + _logger.LogWarning("Failed to get folder item for sharing: {Status}", getResponse.StatusCode); + return null; + } + + var folderJson = await getResponse.Content.ReadAsStringAsync(); + using var doc = JsonDocument.Parse(folderJson); + var folderId = doc.RootElement.GetProperty("id").GetString(); + var webUrl = doc.RootElement.TryGetProperty("webUrl", out var wu) ? 
wu.GetString() : null; + + using var linkRequest = new HttpRequestMessage(HttpMethod.Post, $"{driveBase}/items/{folderId}/createLink"); + linkRequest.Headers.Authorization = new AuthenticationHeaderValue("Bearer", graphAccessToken); + linkRequest.Content = new StringContent( + JsonSerializer.Serialize(new { type = "view", scope = "organization" }), + System.Text.Encoding.UTF8, "application/json"); + + var linkResponse = await _httpClient.SendAsync(linkRequest); + if (linkResponse.IsSuccessStatusCode) + { + var linkJson = await linkResponse.Content.ReadAsStringAsync(); + using var linkDoc = JsonDocument.Parse(linkJson); + var shareUrl = linkDoc.RootElement.GetProperty("link").GetProperty("webUrl").GetString(); + _logger.LogInformation("Folder shared with org: {Url}", shareUrl); + return shareUrl; + } + else + { + var errorContent = await linkResponse.Content.ReadAsStringAsync(); + _logger.LogWarning("Failed to create sharing link: {Status} {Content}", linkResponse.StatusCode, errorContent); + return webUrl; + } + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Failed to share conversation folder"); + return null; + } + } + + /// + /// Per-conversation session state. Holds the W365 session ID, conversation history, + /// and screenshot counter for a single user conversation. + /// + private sealed class ConversationSession + { + public bool SessionStarted { get; set; } + public string? W365SessionId { get; set; } + public List ConversationHistory { get; } = []; + public int ScreenshotCounter { get; set; } + public string? 
/// <summary>
/// Sends CUA model requests via a local or custom model endpoint.
/// Authenticates with a certificate-based MSAL confidential client and caches the access
/// token until shortly before it expires.
/// </summary>
public class CustomEndpointProvider : ICuaModelProvider
{
    /// <summary>The cached token is considered stale this long before its real expiry.</summary>
    private static readonly TimeSpan TokenRefreshMargin = TimeSpan.FromMinutes(5);

    private readonly HttpClient _httpClient;
    private readonly string _endpoint;
    private readonly string _customerId;
    private readonly string? _modelTenantId;
    private readonly string? _clientPrincipalId;
    private readonly string? _partnerSource;
    private readonly IConfidentialClientApplication? _msalApp;
    private readonly string _scope;
    private string? _cachedToken;

    // Kept as DateTimeOffset so the comparison against "now" is offset-aware.
    private DateTimeOffset _tokenExpiry = DateTimeOffset.MinValue;

    /// <summary>Model name to place in request bodies (configurable, with a built-in default).</summary>
    public string ModelName { get; }

    /// <summary>
    /// Reads the <c>AIServices:CustomEndpoint:*</c> settings and, when the configured certificate
    /// can be found, builds the MSAL confidential client.
    /// </summary>
    /// <exception cref="InvalidOperationException">Endpoint, CustomerId or Scope is not configured.</exception>
    public CustomEndpointProvider(IHttpClientFactory httpClientFactory, IConfiguration configuration, ILogger<CustomEndpointProvider> logger)
    {
        _httpClient = httpClientFactory.CreateClient("WebClient");
        _endpoint = configuration["AIServices:CustomEndpoint:Endpoint"]
            ?? throw new InvalidOperationException("AIServices:CustomEndpoint:Endpoint is required.");
        _customerId = configuration["AIServices:CustomEndpoint:CustomerId"]
            ?? throw new InvalidOperationException("AIServices:CustomEndpoint:CustomerId is required.");
        _scope = configuration["AIServices:CustomEndpoint:Scope"]
            ?? throw new InvalidOperationException("AIServices:CustomEndpoint:Scope is required.");
        ModelName = configuration["AIServices:CustomEndpoint:Model"] ?? "computer-use-preview-2025-03-11";
        _modelTenantId = configuration["AIServices:CustomEndpoint:ModelTenantId"];
        _clientPrincipalId = configuration["AIServices:CustomEndpoint:ClientPrincipalId"];
        _partnerSource = configuration["AIServices:CustomEndpoint:PartnerSource"];

        // Initialize MSAL with certificate
        var certSubject = configuration["AIServices:CustomEndpoint:CertificateSubject"] ?? "";
        var clientId = configuration["AIServices:CustomEndpoint:ClientId"] ?? "";
        var tenantId = configuration["AIServices:CustomEndpoint:TenantId"] ?? "";

        var cert = LoadCertificate(certSubject);
        if (cert != null)
        {
            _msalApp = ConfidentialClientApplicationBuilder
                .Create(clientId)
                .WithAuthority($"https://login.microsoftonline.com/{tenantId}")
                .WithCertificate(cert)
                .Build();
            logger.LogInformation("CustomEndpoint MSAL initialized with certificate '{Subject}'", certSubject);
        }
        else
        {
            // Construction still succeeds; the failure surfaces on the first token request.
            logger.LogWarning("CustomEndpoint certificate '{Subject}' not found. Auth will fail at runtime.", certSubject);
        }
    }

    /// <summary>Posts the serialized request to the custom endpoint and returns the raw response body.</summary>
    /// <param name="requestBody">JSON request body, sent as-is.</param>
    /// <param name="cancellationToken">Cancels token acquisition, the HTTP call and the body read.</param>
    /// <exception cref="HttpRequestException">The endpoint answered with a non-success status code.</exception>
    /// <exception cref="InvalidOperationException">MSAL was not initialized (certificate missing).</exception>
    public async Task<string> SendAsync(string requestBody, CancellationToken cancellationToken)
    {
        var url = $"{_endpoint.TrimEnd('/')}/v0/resourceproxy/tenantId.{_customerId}/azureopenai/responses";
        var token = await GetTokenAsync(cancellationToken);

        using var req = new HttpRequestMessage(HttpMethod.Post, url);
        req.Headers.Authorization = new AuthenticationHeaderValue("Bearer", token);
        req.Headers.TryAddWithoutValidation("x-ms-client-principal-id", _clientPrincipalId);
        req.Headers.TryAddWithoutValidation("x-ms-client-tenant-id", _modelTenantId);
        req.Headers.TryAddWithoutValidation("X-ms-Source",
            JsonSerializer.Serialize(new { consumptionSource = "Api", partnerSource = _partnerSource ?? "BICEvaluationService" }));
        req.Content = new StringContent(requestBody, Encoding.UTF8, "application/json");

        // Dispose the response once its body has been read so the connection is released promptly.
        using var resp = await _httpClient.SendAsync(req, cancellationToken);
        if (!resp.IsSuccessStatusCode)
        {
            var err = await resp.Content.ReadAsStringAsync(cancellationToken);
            throw new HttpRequestException($"CustomEndpoint returned {resp.StatusCode}: {err}");
        }

        return await resp.Content.ReadAsStringAsync(cancellationToken);
    }

    /// <summary>
    /// Returns the cached access token while it is still valid beyond the refresh margin;
    /// otherwise acquires a new one through the MSAL client-credentials flow.
    /// </summary>
    /// <param name="cancellationToken">Cancels the token acquisition.</param>
    /// <exception cref="InvalidOperationException">MSAL was not initialized (certificate missing).</exception>
    private async Task<string> GetTokenAsync(CancellationToken cancellationToken = default)
    {
        if (!string.IsNullOrEmpty(_cachedToken) && DateTimeOffset.UtcNow + TokenRefreshMargin < _tokenExpiry)
            return _cachedToken;

        if (_msalApp == null)
            throw new InvalidOperationException("MSAL not initialized. Check CustomEndpoint certificate configuration.");

        var result = await _msalApp
            .AcquireTokenForClient(new[] { _scope })
            .WithSendX5C(true)
            .ExecuteAsync(cancellationToken);

        _cachedToken = result.AccessToken;
        // Store the offset-aware value; taking ExpiresOn.DateTime would drop the offset and
        // could skew the comparison against UTC "now".
        _tokenExpiry = result.ExpiresOn;
        return _cachedToken;
    }

    /// <summary>
    /// Finds the first certificate whose subject name matches, searching the CurrentUser "My"
    /// store and then the LocalMachine "My" store. Certificate validity is not required.
    /// </summary>
    /// <returns>The certificate, or <c>null</c> when the subject is empty or nothing matches.</returns>
    private static X509Certificate2? LoadCertificate(string subject)
    {
        if (string.IsNullOrEmpty(subject)) return null;
        foreach (var location in new[] { StoreLocation.CurrentUser, StoreLocation.LocalMachine })
        {
            using var store = new X509Store(StoreName.My, location);
            store.Open(OpenFlags.ReadOnly);
            var certs = store.Certificates.Find(X509FindType.FindBySubjectName, subject, false);
            if (certs.Count > 0) return certs[0];
        }
        return null;
    }
}
/// <summary>
/// Abstraction for sending requests to a CUA-capable model (OpenAI Responses API).
/// Implementations handle authentication and endpoint differences.
/// </summary>
public interface ICuaModelProvider
{
    /// <summary>The model name to include in the request body.</summary>
    string ModelName { get; }

    /// <summary>Send a serialized request body and return the raw JSON response.</summary>
    /// <param name="requestBody">The already-serialized request body to send.</param>
    /// <param name="cancellationToken">Token used to cancel the request.</param>
    // NOTE(review): the return type reads as a non-generic Task in this view, although the
    // summary promises the raw JSON response — confirm the declaration is Task<string>.
    Task SendAsync(string requestBody, CancellationToken cancellationToken);
}
/// <summary>
/// Defines the computer_use_preview tool for the OpenAI Responses API.
/// Used by computer-use-preview models. Each property is serialized under the snake_case
/// name given by its <c>JsonPropertyName</c> attribute.
/// </summary>
public class ComputerUseTool
{
    /// <summary>Tool type discriminator; defaults to "computer_use_preview".</summary>
    [JsonPropertyName("type")]
    public string Type { get; set; } = "computer_use_preview";

    /// <summary>Display width sent as <c>display_width</c>; defaults to 1024.</summary>
    [JsonPropertyName("display_width")]
    public int DisplayWidth { get; set; } = 1024;

    /// <summary>Display height sent as <c>display_height</c>; defaults to 768.</summary>
    [JsonPropertyName("display_height")]
    public int DisplayHeight { get; set; } = 768;

    /// <summary>Target environment sent as <c>environment</c>; defaults to "windows".</summary>
    [JsonPropertyName("environment")]
    public string Environment { get; set; } = "windows";
}
+ +using W365ComputerUseSample; +using W365ComputerUseSample.Agent; +using W365ComputerUseSample.ComputerUse; +using W365ComputerUseSample.Telemetry; +using Microsoft.Agents.A365.Observability; +using Microsoft.Agents.A365.Observability.Extensions.AgentFramework; +using Microsoft.Agents.A365.Observability.Runtime; +using Microsoft.Agents.A365.Tooling.Extensions.AgentFramework.Services; +using Microsoft.Agents.A365.Tooling.Services; +using Microsoft.Agents.Builder; +using Microsoft.Agents.Core; +using Microsoft.Agents.Hosting.AspNetCore; +using Microsoft.Agents.Storage; +using System.Reflection; + +var builder = WebApplication.CreateBuilder(args); + +// Setup ASP service defaults, including OpenTelemetry, Service Discovery, Resilience, and Health Checks +builder.ConfigureOpenTelemetry(); + +builder.Configuration.AddUserSecrets(Assembly.GetExecutingAssembly()); +builder.Services.AddControllers(); +builder.Services.AddHttpClient("WebClient", client => client.Timeout = TimeSpan.FromSeconds(600)); +builder.Services.AddHttpContextAccessor(); +builder.Logging.AddConsole(); + +// ********** Configure A365 Services ********** +// Configure observability. +builder.Services.AddAgenticTracingExporter(clusterCategory: "production"); + +// Add A365 tracing with Agent Framework integration +builder.AddA365Tracing(config => +{ + config.WithAgentFramework(); +}); + +// Add A365 Tooling Server integration +builder.Services.AddSingleton(); +builder.Services.AddSingleton(); +// ********** END Configure A365 Services ********** + +// Register the model provider based on configuration +var aiProvider = builder.Configuration["AIServices:Provider"] ?? 
"AzureOpenAI"; +if (aiProvider.Equals("CustomEndpoint", StringComparison.OrdinalIgnoreCase)) +{ + builder.Services.AddSingleton(); +} +else +{ + builder.Services.AddSingleton(); +} + +// Register the Computer Use orchestrator +builder.Services.AddSingleton(); + +// Add AspNet token validation +builder.Services.AddAgentAspNetAuthentication(builder.Configuration); + +// Register IStorage. For development, MemoryStorage is suitable. +builder.Services.AddSingleton(); + +// Add AgentApplicationOptions from config. +builder.AddAgentApplicationOptions(); + +// Add the bot (which is transient) +builder.AddAgent(); + +var app = builder.Build(); + +if (app.Environment.IsDevelopment()) +{ + app.UseDeveloperExceptionPage(); +} + +app.UseRouting(); +app.UseAuthentication(); +app.UseAuthorization(); + +// Map the /api/messages endpoint to the AgentApplication +app.MapPost("/api/messages", async (HttpRequest request, HttpResponse response, IAgentHttpAdapter adapter, IAgent agent, CancellationToken cancellationToken) => +{ + // Allow multiple reads of the request body — tracing/observability middleware may + // re-read it after the adapter, which otherwise triggers + // "Reading is not allowed after reader was completed" on the Kestrel pipe reader. + request.EnableBuffering(); + + await AgentMetrics.InvokeObservedHttpOperation("agent.process_message", async () => + { + await adapter.ProcessAsync(request, response, agent, cancellationToken); + }).ConfigureAwait(false); +}); + +// Health check endpoint for CI/CD pipelines and monitoring +app.MapGet("/api/health", () => Results.Ok(new { status = "healthy", timestamp = DateTime.UtcNow })); + +if (app.Environment.IsDevelopment() || app.Environment.EnvironmentName == "Playground") +{ + app.MapGet("/", () => "W365 Computer Use Sample Agent"); + app.UseDeveloperExceptionPage(); + app.MapControllers().AllowAnonymous(); + + // Hard coded for brevity and ease of testing. + // In production, this should be set in configuration. 
+ app.Urls.Add("http://localhost:3978"); +} +else +{ + app.MapControllers(); +} + +// End active W365 session on shutdown to release the VM back to the pool +app.Lifetime.ApplicationStopping.Register(() => +{ + var orchestrator = app.Services.GetRequiredService(); + orchestrator.EndSessionOnShutdownAsync().GetAwaiter().GetResult(); +}); + +app.Run(); diff --git a/dotnet/w365-computer-use/sample-agent/Properties/launchSettings.json b/dotnet/w365-computer-use/sample-agent/Properties/launchSettings.json new file mode 100644 index 00000000..427cd153 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/Properties/launchSettings.json @@ -0,0 +1,11 @@ +{ + "profiles": { + "W365ComputerUseSample": { + "commandName": "Project", + "environmentVariables": { + "ASPNETCORE_ENVIRONMENT": "Development" + }, + "applicationUrl": "http://localhost:3978" + } + } +} diff --git a/dotnet/w365-computer-use/sample-agent/README.md b/dotnet/w365-computer-use/sample-agent/README.md new file mode 100644 index 00000000..3583dd76 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/README.md @@ -0,0 +1,199 @@ +# W365 Computer Use Sample + +## Overview + +This sample demonstrates how to build an agent that controls a Windows 365 Cloud PC using the OpenAI Responses API and the W365 Computer Use MCP server. + +The agent receives a natural language task from the user, provisions a W365 desktop session via MCP tools, then runs a CUA (Computer Use Agent) loop: the model sees screenshots, decides actions (click, type, scroll), and the MCP server executes them on the VM. + +It supports two model types: +- **`computer-use-preview`** - The original CUA model on Azure OpenAI +- **`gpt-5.4` / `gpt-5.4-mini`** - Newer GPT models with built-in computer use capability + +## Architecture + +``` +User Message + | +MyAgent (Agent Framework) + | connects to MCP server +W365 MCP Tools (QuickStartSession, CaptureScreenshot, Click, Type, etc.) 
+ | provisions and controls +Windows 365 Cloud PC + | screenshots fed back to +CUA Model (Azure OpenAI) + | emits computer_call actions +ComputerUseOrchestrator (translates actions to MCP tool calls) + | loop until task complete +Response to User +``` + +**Key components:** + +| File | Purpose | +|------|---------| +| `Agent/MyAgent.cs` | Message handler - acquires tokens, connects to MCP, runs orchestrator | +| `ComputerUse/ComputerUseOrchestrator.cs` | CUA loop - sends screenshots to model, maps actions to MCP tools | +| `ComputerUse/ICuaModelProvider.cs` | Abstraction for the CUA model API | +| `ComputerUse/AzureOpenAIModelProvider.cs` | Azure OpenAI Responses API provider | + +## Prerequisites + +- [.NET 8.0 SDK](https://dotnet.microsoft.com/download/dotnet/8.0) or later +- Azure OpenAI resource with a CUA-capable model deployment: + - `computer-use-preview` or `gpt-5.4` / `gpt-5.4-mini` + - [Request access to gpt-5.4](https://aka.ms/OAI/gpt54access) if needed +- Access to the W365 Computer Use MCP server (via [Agent 365 MCP Platform](https://learn.microsoft.com/en-us/microsoft-agent-365/developer/)) +- A bearer token with `McpServers.W365ComputerUse.All` scope + +## Setup + +### 1. Clone the repository + +```bash +git clone https://github.com/microsoft/Agent365-Samples.git +cd Agent365-Samples/dotnet/w365-computer-use/sample-agent +``` + +### 2. Restore dependencies + +```bash +dotnet restore +``` + +### 3. 
Create your local configuration + +Create `appsettings.Development.json` (this file is gitignored): + +**For `computer-use-preview` model:** +```json +{ + "AIServices": { + "Provider": "AzureOpenAI", + "AzureOpenAI": { + "DeploymentName": "computer-use-preview", + "Endpoint": "https://your-resource.openai.azure.com", + "ApiKey": "your-api-key" + } + }, + "McpServer": { + "Url": "http://localhost:52857/mcp/environments/Default-{your-tenant-id}/servers/mcp_W365ComputerUse" + } +} +``` + +**For `gpt-5.4-mini` model:** +```json +{ + "AIServices": { + "Provider": "AzureOpenAI", + "AzureOpenAI": { + "ModelName": "gpt-5.4-mini", + "Endpoint": "https://your-resource.openai.azure.com", + "ApiKey": "your-api-key" + } + }, + "McpServer": { + "Url": "http://localhost:52857/mcp/environments/Default-{your-tenant-id}/servers/mcp_W365ComputerUse" + } +} +``` + +### 4. Obtain a bearer token + +Get a token with the `McpServers.W365ComputerUse.All` scope for your tenant. See the [Agent 365 MCP Platform docs](https://learn.microsoft.com/en-us/microsoft-agent-365/developer/) for details. + +### 5. Start the MCP Platform server + +Ensure the MCP Platform is running locally on port 52857, or update the `McpServer:Url` in your config. + +### 6. Run the agent + +From the `sample-agent` directory you entered in step 1: + +```powershell +$env:ASPNETCORE_ENVIRONMENT = "Development" +$env:BEARER_TOKEN = "<your-mcp-platform-token>" +$env:GRAPH_TOKEN = "<your-graph-token>" +dotnet run +``` + +### 7. Test with Agents Playground + +1. Open [Microsoft 365 Agents Playground](https://dev.agents.cloud.microsoft/) +2. Connect to `http://localhost:3978/api/messages` +3. Send a message like: *"Open Notepad and type Hello World"* +4. 
Screenshots are saved to `./Screenshots/` automatically + +## Configuration Reference + +| Setting | Description | Default | +|---------|-------------|---------| +| `AIServices:Provider` | Model provider | `AzureOpenAI` | +| `AIServices:AzureOpenAI:Endpoint` | Azure OpenAI resource URL | - | +| `AIServices:AzureOpenAI:ApiKey` | API key | - | +| `AIServices:AzureOpenAI:DeploymentName` | Deployment name (for deployment-based URLs) | `computer-use-preview` | +| `AIServices:AzureOpenAI:ModelName` | Model name (for model-based URLs, e.g., `gpt-5.4-mini`) | - | +| `McpServer:Url` | MCP server URL (dev only; omit for production) | - | +| `ComputerUse:MaxIterations` | Max CUA loop iterations | `30` | +| `ComputerUse:DisplayWidth` | Display width for computer_use_preview tool | `1024` | +| `ComputerUse:DisplayHeight` | Display height for computer_use_preview tool | `768` | +| `Screenshots:LocalPath` | Local path to save screenshots | `./Screenshots` | +| `Screenshots:OneDriveFolder` | OneDrive folder for screenshot upload | `CUA-Sessions` | +| `Screenshots:OneDriveUserId` | UPN/email to upload screenshots to a specific user's OneDrive (instead of token owner) | - | +| `BEARER_TOKEN` (env var) | MCP Platform token with `McpServers.W365ComputerUse.All` scope (dev only) | - | +| `GRAPH_TOKEN` (env var) | Graph API token with `Files.ReadWrite` scope for OneDrive upload (dev only) | - | + +## Supported Models + +| Model | Tool Type | Config | Notes | +|-------|-----------|--------|-------| +| `computer-use-preview` | `computer_use_preview` | `DeploymentName: "computer-use-preview"` | Uses `display_width`, `display_height`, `environment` params | +| `gpt-5.4` / `gpt-5.4-mini` | `computer` | `ModelName: "gpt-5.4-mini"` | Bare `{"type": "computer"}`. Initial screenshot sent with first message | + +The tool type is auto-derived from the model name (`gpt-*` -> `computer`, otherwise -> `computer_use_preview`). + +## How It Works + +1. 
**User sends a message** -> `MyAgent.OnMessageAsync` +2. **MCP connection** established (direct SSE in dev, A365 SDK gateway in prod) +3. **Session acquisition** runs transparently on the first W365 tool call — ATG picks an eligible Cloud PC pool, checks out a session, and probes readiness. The session is reused across messages. +4. **CUA loop** in `ComputerUseOrchestrator.RunAsync`: + - User message + conversation history sent to the model + - Model returns `computer_call` actions (click, type, scroll, etc.) + - Actions translated to MCP tool calls (`click`, `type_text`, `press_keys`, etc. — discovered dynamically from the W365 remote server) + - Screenshot captured after each action and fed back to the model + - Loop continues until model calls `OnTaskComplete` or max iterations reached +5. **Response** sent back to user +6. **Session persists** across messages for follow-up tasks +7. **EndSession** called on app shutdown (Ctrl+C) via `mcp_W365ComputerUse_EndSession` to release the VM + +## Session Management + +- A session is acquired **once**, on the first W365 tool call, and reused across all subsequent messages +- Conversation history accumulates across messages, giving the model context for follow-up tasks +- On app shutdown (`Ctrl+C`), the agent calls `EndSession` to release the VM back to the pool +- If the app crashes, sessions auto-expire after ~30 minutes on the W365 backend + +## Production Deployment + +1. Register an Azure Bot and configure the agent +2. Set `AIServices` config with your Azure OpenAI credentials +3. Remove `McpServer:Url` - the A365 SDK will discover the MCP server via the Tooling Gateway +4. 
Deploy and install the agent in Teams / M365 + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| `McpServer:Url is required` | Create `appsettings.Development.json` with the MCP server URL | +| `BEARER_TOKEN` not set | Set `$env:BEARER_TOKEN` before running | +| Model returns 400 | Check that the tool type matches your model (see Supported Models table) | +| Screenshot extraction fails | Ensure MCP server returns image content blocks | +| Session orphaned after crash | Sessions auto-expire after ~30 min on the W365 backend | +| Multiple sessions started | Ensure only one agent instance is running per MCP server | + +## Links + +- [Microsoft Agent 365 Developer Documentation](https://learn.microsoft.com/en-us/microsoft-agent-365/developer/) +- [Microsoft 365 Agents SDK](https://learn.microsoft.com/microsoft-365/agents-sdk/) +- [Azure OpenAI Computer Use Guide](https://learn.microsoft.com/en-us/azure/foundry-classic/openai/how-to/computer-use) \ No newline at end of file diff --git a/dotnet/w365-computer-use/sample-agent/ServiceExtensions.cs b/dotnet/w365-computer-use/sample-agent/ServiceExtensions.cs new file mode 100644 index 00000000..4e72efc3 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/ServiceExtensions.cs @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
using Microsoft.Agents.A365.Observability;
using Microsoft.Agents.A365.Observability.Extensions.AgentFramework;
using Microsoft.Agents.A365.Observability.Runtime;
using OpenTelemetry.Metrics;
using OpenTelemetry.Resources;
using OpenTelemetry.Trace;
using W365ComputerUseSample.Telemetry;

namespace W365ComputerUseSample;

/// <summary>
/// Host-builder extensions that wire OpenTelemetry tracing and metrics into the sample.
/// </summary>
public static class ServiceExtensions
{
    /// <summary>
    /// Registers OpenTelemetry on <paramref name="builder"/>: names the service resource,
    /// subscribes to the agent's activity source and meter, and enables the ASP.NET Core,
    /// HttpClient and runtime instrumentations.
    /// </summary>
    /// <param name="builder">The web application builder whose service collection is configured.</param>
    public static void ConfigureOpenTelemetry(this WebApplicationBuilder builder)
    {
        var telemetry = builder.Services.AddOpenTelemetry();

        telemetry.ConfigureResource(DescribeService);
        telemetry.WithTracing(RegisterTracing);
        telemetry.WithMetrics(RegisterMetrics);
    }

    // Names the service that emitted telemetry is attributed to.
    private static void DescribeService(ResourceBuilder resource)
    {
        resource.AddService("W365ComputerUseSample");
    }

    // Traces: the agent's own ActivitySource plus inbound/outbound HTTP spans.
    private static void RegisterTracing(TracerProviderBuilder tracing)
    {
        tracing.AddSource(AgentMetrics.SourceName);
        tracing.AddAspNetCoreInstrumentation();
        tracing.AddHttpClientInstrumentation();

        // Console exporter removed — dumps a full Activity block per HTTP request and
        // swamped the console during bring-up. Re-add locally if you need trace output.
    }

    // Metrics: the agent's own Meter plus ASP.NET Core, HttpClient and runtime counters.
    private static void RegisterMetrics(MeterProviderBuilder metrics)
    {
        metrics.AddMeter(AgentMetrics.SourceName);
        metrics.AddAspNetCoreInstrumentation();
        metrics.AddHttpClientInstrumentation();
        metrics.AddRuntimeInstrumentation();
    }
}

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
using Microsoft.Agents.A365.Observability.Caching;
using Microsoft.Agents.A365.Observability.Runtime.Common;
using Microsoft.Agents.A365.Runtime.Utils;
using Microsoft.Agents.Builder;
using Microsoft.Agents.Builder.App.UserAuth;
using Microsoft.Agents.Builder.State;
using W365ComputerUseSample.Telemetry;

namespace W365ComputerUseSample;

/// <summary>
/// Runs an agent turn inside the sample's telemetry scope: an <see cref="AgentMetrics"/>
/// activity, tenant/agent baggage, and a best-effort observability token registration.
/// </summary>
public static class A365OtelWrapper
{
    /// <summary>
    /// Executes <paramref name="func"/> wrapped in <see cref="AgentMetrics.InvokeObservedAgentOperation"/>,
    /// with tenant and agent ids attached as baggage for the duration of the call.
    /// </summary>
    /// <param name="operationName">Name given to the activity that wraps the operation.</param>
    /// <param name="turnContext">Current turn; source of the activity, tenant id and agent identity.</param>
    /// <param name="turnState">Turn state (not read by this method).</param>
    /// <param name="agentTokenCache">Optional cache used to register for observability; skipped when null.</param>
    /// <param name="authSystem">Authorization system used to resolve the agent identity on non-agentic requests.</param>
    /// <param name="authHandlerName">Name of the auth handler used to fetch the turn token.</param>
    /// <param name="logger">Optional logger for registration failures.</param>
    /// <param name="func">The operation to run. Exceptions it throws propagate to the caller.</param>
    // NOTE(review): the type arguments on IExporterTokenCache and Func were stripped from the
    // extracted patch; they are reconstructed from how the values are used below — confirm.
    public static async Task InvokeObservedAgentOperation(
        string operationName,
        ITurnContext turnContext,
        ITurnState turnState,
        IExporterTokenCache<AgenticTokenStruct>? agentTokenCache,
        UserAuthorization authSystem,
        string authHandlerName,
        ILogger? logger,
        Func<Task> func)
    {
        await AgentMetrics.InvokeObservedAgentOperation(
            operationName,
            turnContext,
            async () =>
            {
                (string agentId, string tenantId) = await ResolveTenantAndAgentId(turnContext, authSystem, authHandlerName);

                using var baggageScope = new BaggageBuilder()
                    .TenantId(tenantId)
                    .AgentId(agentId)
                    .Build();

                try
                {
                    agentTokenCache?.RegisterObservability(agentId, tenantId, new AgenticTokenStruct
                    {
                        UserAuthorization = authSystem,
                        TurnContext = turnContext,
                        AuthHandlerName = authHandlerName
                    }, EnvironmentUtils.GetObservabilityAuthenticationScope());
                }
                catch (Exception ex)
                {
                    // Registration is best-effort: a failure here must not stop the turn.
                    logger?.LogWarning(ex, "There was an error registering for observability: {Message}", ex.Message);
                }

                await func().ConfigureAwait(false);
            }).ConfigureAwait(false);
    }

    /// <summary>
    /// Resolves the agent id and tenant id for the turn, substituting the all-zero GUID
    /// for whichever value cannot be determined.
    /// </summary>
    private static async Task<(string agentId, string tenantId)> ResolveTenantAndAgentId(ITurnContext turnContext, UserAuthorization authSystem, string authHandlerName)
    {
        string? agentId = null;
        if (turnContext.Activity.IsAgenticRequest())
        {
            agentId = turnContext.Activity.GetAgenticInstanceId();
        }
        else if (authSystem != null && !string.IsNullOrEmpty(authHandlerName))
        {
            agentId = Utility.ResolveAgentIdentity(turnContext, await authSystem.GetTurnTokenAsync(turnContext, authHandlerName));
        }

        // The original seeded agentId with "" so its `??` fallback could never fire;
        // treat empty the same as missing so the baggage always carries a usable id.
        if (string.IsNullOrEmpty(agentId))
        {
            agentId = Guid.Empty.ToString();
        }

        string tenantId = turnContext.Activity?.Conversation?.TenantId
            ?? turnContext.Activity?.Recipient?.TenantId
            ?? Guid.Empty.ToString();

        return (agentId, tenantId);
    }
}

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

using Microsoft.Agents.Builder;
using Microsoft.Agents.Core;
using System.Diagnostics;
using System.Diagnostics.Metrics;

namespace W365ComputerUseSample.Telemetry;

/// <summary>
/// Activity source, meter instruments, and helpers that wrap agent work in a traced,
/// timed operation.
/// </summary>
public static class AgentMetrics
{
    /// <summary>Name shared by the activity source and the meter.</summary>
    public static readonly string SourceName = "A365.W365ComputerUse";

    public static readonly ActivitySource ActivitySource = new(SourceName);

    private static readonly Meter Meter = new("A365.W365ComputerUse", "1.0.0");

    // NOTE(review): the instrument type arguments were stripped from the extracted patch.
    // <long> is reconstructed (the duration is recorded from a long) — confirm against the repo.
    public static readonly Counter<long> MessageProcessedCounter = Meter.CreateCounter<long>(
        "agent.messages.processed",
        "messages",
        "Number of messages processed by the agent");

    public static readonly Histogram<long> MessageProcessingDuration = Meter.CreateHistogram<long>(
        "agent.message.processing.duration",
        "ms",
        "Duration of message processing in milliseconds");

    public static readonly Counter<long> CuaActionsExecuted = Meter.CreateCounter<long>(
        "agent.cua.actions.executed",
        "actions",
        "Number of CUA computer actions executed");

    /// <summary>
    /// Starts an activity named <paramref name="handlerName"/> and tags it with the turn's
    /// activity type, agentic flag, caller id, conversation id and channel id.
    /// </summary>
    /// <returns>
    /// The started activity. <see cref="System.Diagnostics.ActivitySource.StartActivity(string, ActivityKind)"/>
    /// yields null when nothing is listening, so callers should keep using <c>?.</c> on the result.
    /// </returns>
    public static Activity InitializeMessageHandlingActivity(string handlerName, ITurnContext context)
    {
        var activity = ActivitySource.StartActivity(handlerName);
        activity?.SetTag("Activity.Type", context.Activity.Type.ToString());
        activity?.SetTag("Agent.IsAgentic", context.IsAgenticRequest());
        activity?.SetTag("Caller.Id", context.Activity.From?.Id);
        activity?.SetTag("Conversation.Id", context.Activity.Conversation?.Id);
        activity?.SetTag("Channel.Id", context.Activity.ChannelId?.ToString());

        return activity!;
    }

    /// <summary>
    /// Records the processing duration, sets the final status on <paramref name="activity"/>,
    /// then stops and disposes it.
    /// </summary>
    /// <param name="activity">Activity returned by <see cref="InitializeMessageHandlingActivity"/>; may be null.</param>
    /// <param name="context">Turn whose conversation and channel ids tag the measurement.</param>
    /// <param name="duration">Elapsed time in milliseconds.</param>
    /// <param name="success">Whether the operation completed without throwing.</param>
    public static void FinalizeMessageHandlingActivity(Activity activity, ITurnContext context, long duration, bool success)
    {
        MessageProcessingDuration.Record(duration,
            new KeyValuePair<string, object?>("Conversation.Id", context.Activity.Conversation?.Id ?? "unknown"),
            new KeyValuePair<string, object?>("Channel.Id", context.Activity.ChannelId?.ToString() ?? "unknown"));

        if (success)
        {
            activity?.SetStatus(ActivityStatusCode.Ok);
        }
        else if (activity is not null && activity.Status != ActivityStatusCode.Error)
        {
            // Only set a bare Error when no earlier handler attached one with a description;
            // SetStatus would otherwise replace that description with null.
            activity.SetStatus(ActivityStatusCode.Error);
        }

        activity?.Stop();
        activity?.Dispose();
    }

    /// <summary>
    /// Runs a synchronous <paramref name="func"/> inside an activity named
    /// <paramref name="operationName"/>, marking the activity Ok or Error.
    /// </summary>
    /// <remarks>
    /// Do not pass an async lambda to this overload: it would bind as async-void and the
    /// activity would end before the work does. Async lambdas resolve to the
    /// <see cref="InvokeObservedHttpOperation(string, Func{Task})"/> overload instead.
    /// </remarks>
    /// <returns>An already-completed task.</returns>
    public static Task InvokeObservedHttpOperation(string operationName, Action func)
    {
        using var activity = ActivitySource.StartActivity(operationName);
        try
        {
            func();
            activity?.SetStatus(ActivityStatusCode.Ok);
        }
        catch (Exception ex)
        {
            RecordException(activity, ex);
            throw;
        }

        return Task.CompletedTask;
    }

    /// <summary>
    /// Runs an asynchronous <paramref name="func"/> inside an activity named
    /// <paramref name="operationName"/>. The activity stays open until the returned task
    /// finishes, and a faulted task is recorded on it before the exception propagates.
    /// </summary>
    /// <returns>A task that completes when <paramref name="func"/>'s task completes.</returns>
    public static async Task InvokeObservedHttpOperation(string operationName, Func<Task> func)
    {
        using var activity = ActivitySource.StartActivity(operationName);
        try
        {
            await func().ConfigureAwait(false);
            activity?.SetStatus(ActivityStatusCode.Ok);
        }
        catch (Exception ex)
        {
            RecordException(activity, ex);
            throw;
        }
    }

    /// <summary>
    /// Counts the message, then runs <paramref name="func"/> inside a message-handling
    /// activity, recording its full asynchronous duration and its real outcome.
    /// </summary>
    /// <param name="operationName">Name of the activity wrapping the operation.</param>
    /// <param name="context">Current turn, used for activity tags and metric dimensions.</param>
    /// <param name="func">The operation to run. Exceptions it throws are recorded and rethrown.</param>
    public static async Task InvokeObservedAgentOperation(string operationName, ITurnContext context, Func<Task> func)
    {
        MessageProcessedCounter.Add(1);
        var activity = InitializeMessageHandlingActivity(operationName, context);
        var stopwatch = Stopwatch.StartNew();
        var succeeded = false;
        try
        {
            // Await here (rather than returning the task) so the catch/finally below
            // observe the asynchronous outcome and the whole duration.
            await func().ConfigureAwait(false);
            succeeded = true;
        }
        catch (Exception ex)
        {
            RecordException(activity, ex);
            throw;
        }
        finally
        {
            stopwatch.Stop();
            FinalizeMessageHandlingActivity(activity, context, stopwatch.ElapsedMilliseconds, succeeded);
        }
    }

    // Marks the activity as failed and attaches an "exception" event describing the error.
    private static void RecordException(Activity? activity, Exception ex)
    {
        activity?.SetStatus(ActivityStatusCode.Error, ex.Message);
        activity?.AddEvent(new ActivityEvent("exception", DateTimeOffset.UtcNow, new ActivityTagsCollection
        {
            ["exception.type"] = ex.GetType().FullName,
            ["exception.message"] = ex.Message,
            ["exception.stacktrace"] = ex.StackTrace
        }));
    }
}
a/dotnet/w365-computer-use/sample-agent/ToolingManifest.json b/dotnet/w365-computer-use/sample-agent/ToolingManifest.json new file mode 100644 index 00000000..fbe0236b --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/ToolingManifest.json @@ -0,0 +1,14 @@ +{ + "mcpServers": [ + { "mcpServerName": "mcp_W365ComputerUse", "url": "mcp_W365ComputerUse" }, + { "mcpServerName": "mcp_MailTools", "url": "mcp_MailTools" }, + { "mcpServerName": "mcp_MeServer", "url": "mcp_MeServer" }, + { "mcpServerName": "mcp_CalendarTools", "url": "mcp_CalendarTools" }, + { "mcpServerName": "mcp_TeamsServer", "url": "mcp_TeamsServer" }, + { "mcpServerName": "mcp_ODSPRemoteServer", "url": "mcp_ODSPRemoteServer" }, + { "mcpServerName": "mcp_SharepointListsTools", "url": "mcp_SharepointListsTools" }, + { "mcpServerName": "mcp_AdminTools", "url": "mcp_AdminTools" }, + { "mcpServerName": "mcp_WordServer", "url": "mcp_WordServer" }, + { "mcpServerName": "mcp_m365copilot", "url": "mcp_m365copilot" } + ] +} diff --git a/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj b/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj new file mode 100644 index 00000000..e0455440 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj @@ -0,0 +1,37 @@ + + + + net8.0 + enable + a3c1d2e4-f567-8901-abcd-ef0123456789 + enable + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dotnet/w365-computer-use/sample-agent/appsettings.json b/dotnet/w365-computer-use/sample-agent/appsettings.json new file mode 100644 index 00000000..8db29183 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/appsettings.json @@ -0,0 +1,94 @@ +{ + "AgentApplication": { + "StartTypingTimer": false, + "RemoveRecipientMention": false, + "NormalizeMentions": false, + "AgenticAuthHandlerName": "agentic", + "UserAuthorization": { + "AutoSignin": false, + "Handlers": { + "agentic": { + "Type": "AgenticUserAuthorization", + "Settings": { + "Scopes": [ + 
"https://graph.microsoft.com/.default" + ] + } + } + } + } + }, + + "TokenValidation": { + "Enabled": false, + "Audiences": [ + "{{ClientId}}" + ] + }, + + "Logging": { + "LogLevel": { + "Default": "Information", + "Microsoft.AspNetCore": "Warning", + "Microsoft.Agents": "Warning", + "Microsoft.Hosting.Lifetime": "Information", + "System.Net.Http": "Warning" + } + }, + "AllowedHosts": "*", + "Connections": { + "ServiceConnection": { + "Settings": { + "AuthType": "UserManagedIdentity", + "AuthorityEndpoint": "https://login.microsoftonline.com/{{BOT_TENANT_ID}}", + "ClientId": "{{BOT_ID}}", + "Scopes": [ + "5a807f24-c9de-44ee-a3a7-329e88a00ffc/.default" + ] + } + } + }, + "ConnectionsMap": [ + { + "ServiceUrl": "*", + "Connection": "ServiceConnection" + } + ], + + "AIServices": { + "Provider": "AzureOpenAI", + + "AzureOpenAI": { + "DeploymentName": "<>", + "ModelName": "", + "Endpoint": "<>", + "ApiKey": "<>", + "ApiVersion": "2025-04-01-preview" + }, + + "CustomEndpoint": { + "Endpoint": "<>", + "CertificateSubject": "<>", + "ClientId": "<>", + "TenantId": "<>", + "Scope": "<>", + "Model": "computer-use-preview-2025-03-11", + "ModelTenantId": "<>", + "ClientPrincipalId": "<>", + "PartnerSource": "<>", + "CustomerId": "<>" + } + }, + + "ComputerUse": { + "MaxIterations": 30, + "DisplayWidth": 1024, + "DisplayHeight": 768 + }, + + "Screenshots": { + "LocalPath": "./Screenshots", + "OneDriveFolder": "CUA-Sessions", + "OneDriveUserId": "" + } +} diff --git a/dotnet/w365-computer-use/sample-agent/nuget.config b/dotnet/w365-computer-use/sample-agent/nuget.config new file mode 100644 index 00000000..765346e5 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/nuget.config @@ -0,0 +1,7 @@ + + + + + + +