From 8725549b8a7b383c37acf6018e6cc7cf2ba72247 Mon Sep 17 00:00:00 2001 From: Bertrand Desmarest Date: Tue, 5 May 2026 09:09:49 -0700 Subject: [PATCH] Add W365 Computer Use sample agent Adds dotnet/w365-computer-use/ as a new sample alongside agent-framework, autonomous, and semantic-kernel. The sample demonstrates an Agent 365 agent that controls a Windows 365 Cloud PC via the W365 Computer Use MCP server using Azure OpenAI's computer-use model. Highlights: - Intent classifier skips the W365 session-acquisition handshake on messages that do not need computer use (CUA), such as chit-chat and mail/calendar function-tool requests. - Full CUA loop translates model computer_call actions to MCP tool invocations, captures screenshots, and feeds them back to the model until it emits OnTaskComplete or EndSession. - Per-prompt OneDrive screenshot folders ({yyyy-MM-dd}/{HHmmss}_{slug}) with org-scoped sharing links surfaced to the user. - Filters ATG's synthetic "Error" sentinel tool out of the LLM tool list to avoid Azure OpenAI invalid_function_parameters rejections, while still reading its description for user-facing error messaging. - Supports both computer-use-preview and gpt-5.4 / gpt-5.4-mini. 
--- .../W365ComputerUseSample.sln | 25 + .../w365-computer-use/sample-agent/.gitignore | 9 + .../sample-agent/Agent/MyAgent.cs | 357 ++++++ .../sample-agent/AspNetExtensions.cs | 180 +++ .../ComputerUse/AzureOpenAIModelProvider.cs | 63 + .../ComputerUse/ComputerUseOrchestrator.cs | 1135 +++++++++++++++++ .../ComputerUse/ICuaModelProvider.cs | 17 + .../ComputerUse/Models/ComputerUseModels.cs | 96 ++ .../w365-computer-use/sample-agent/Program.cs | 106 ++ .../Properties/launchSettings.json | 11 + .../w365-computer-use/sample-agent/README.md | 306 +++++ .../sample-agent/ServiceExtensions.cs | 39 + .../sample-agent/ToolingManifest.json | 11 + .../sample-agent/W365ComputerUseSample.csproj | 37 + .../sample-agent/a365.config.example.json | 31 + .../sample-agent/appsettings.json | 79 ++ .../sample-agent/nuget.config | 7 + .../sample-agent/telemetry/A365OtelWrapper.cs | 74 ++ .../sample-agent/telemetry/AgentMetrics.cs | 114 ++ 19 files changed, 2697 insertions(+) create mode 100644 dotnet/w365-computer-use/W365ComputerUseSample.sln create mode 100644 dotnet/w365-computer-use/sample-agent/.gitignore create mode 100644 dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs create mode 100644 dotnet/w365-computer-use/sample-agent/AspNetExtensions.cs create mode 100644 dotnet/w365-computer-use/sample-agent/ComputerUse/AzureOpenAIModelProvider.cs create mode 100644 dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs create mode 100644 dotnet/w365-computer-use/sample-agent/ComputerUse/ICuaModelProvider.cs create mode 100644 dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs create mode 100644 dotnet/w365-computer-use/sample-agent/Program.cs create mode 100644 dotnet/w365-computer-use/sample-agent/Properties/launchSettings.json create mode 100644 dotnet/w365-computer-use/sample-agent/README.md create mode 100644 dotnet/w365-computer-use/sample-agent/ServiceExtensions.cs create mode 100644 
dotnet/w365-computer-use/sample-agent/ToolingManifest.json create mode 100644 dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj create mode 100644 dotnet/w365-computer-use/sample-agent/a365.config.example.json create mode 100644 dotnet/w365-computer-use/sample-agent/appsettings.json create mode 100644 dotnet/w365-computer-use/sample-agent/nuget.config create mode 100644 dotnet/w365-computer-use/sample-agent/telemetry/A365OtelWrapper.cs create mode 100644 dotnet/w365-computer-use/sample-agent/telemetry/AgentMetrics.cs diff --git a/dotnet/w365-computer-use/W365ComputerUseSample.sln b/dotnet/w365-computer-use/W365ComputerUseSample.sln new file mode 100644 index 00000000..67fe015d --- /dev/null +++ b/dotnet/w365-computer-use/W365ComputerUseSample.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.14.36623.8 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "W365ComputerUseSample", "sample-agent\W365ComputerUseSample.csproj", "{B72D1A3E-4F8C-9E56-A1B2-C3D4E5F60789}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {B72D1A3E-4F8C-9E56-A1B2-C3D4E5F60789}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {B72D1A3E-4F8C-9E56-A1B2-C3D4E5F60789}.Debug|Any CPU.Build.0 = Debug|Any CPU + {B72D1A3E-4F8C-9E56-A1B2-C3D4E5F60789}.Release|Any CPU.ActiveCfg = Release|Any CPU + {B72D1A3E-4F8C-9E56-A1B2-C3D4E5F60789}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {D4E5F6A7-B8C9-0D1E-2F3A-4B5C6D7E8F90} + EndGlobalSection +EndGlobal diff --git 
a/dotnet/w365-computer-use/sample-agent/.gitignore b/dotnet/w365-computer-use/sample-agent/.gitignore new file mode 100644 index 00000000..cf4bf767 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/.gitignore @@ -0,0 +1,9 @@ +appsettings.Development.json +appsettings.Production.json +Screenshots/ +a365.config.json +a365.generated.config.json +app.zip +manifest/ +publish/ +.vscode/.env diff --git a/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs new file mode 100644 index 00000000..513bff6f --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/Agent/MyAgent.cs @@ -0,0 +1,357 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using W365ComputerUseSample.ComputerUse; +using W365ComputerUseSample.Telemetry; +using Microsoft.Agents.A365.Observability.Caching; +using Microsoft.Agents.A365.Runtime.Utils; +using Microsoft.Agents.A365.Tooling.Extensions.AgentFramework.Services; +using Microsoft.Agents.Builder; +using Microsoft.Agents.Builder.App; +using Microsoft.Agents.Builder.State; +using Microsoft.Agents.Core; +using Microsoft.Agents.Core.Models; +using Microsoft.Extensions.AI; + +namespace W365ComputerUseSample.Agent; + +public class MyAgent : AgentApplication +{ + private const string AgentWelcomeMessage = "Hello! I can help you perform tasks on a Windows 365 Cloud PC. Tell me what you'd like to do."; + private const string AgentHireMessage = "Thank you for hiring me! I can control a Windows desktop to accomplish tasks for you."; + private const string AgentFarewellMessage = "Thank you for your time, I enjoyed working with you."; + + private readonly IExporterTokenCache? _agentTokenCache; + private readonly ILogger _logger; + private readonly IMcpToolRegistrationService _toolService; + private readonly ComputerUseOrchestrator _orchestrator; + + private readonly string? AgenticAuthHandlerName; + private readonly string? 
OboAuthHandlerName; + + public MyAgent( + AgentApplicationOptions options, + IConfiguration configuration, + IExporterTokenCache agentTokenCache, + IMcpToolRegistrationService toolService, + ComputerUseOrchestrator orchestrator, + ILogger logger) : base(options) + { + _agentTokenCache = agentTokenCache; + _logger = logger; + _toolService = toolService; + _orchestrator = orchestrator; + + AgenticAuthHandlerName = configuration.GetValue("AgentApplication:AgenticAuthHandlerName"); + OboAuthHandlerName = configuration.GetValue("AgentApplication:OboAuthHandlerName"); + + // Greet when members are added + OnConversationUpdate(ConversationUpdateEvents.MembersAdded, WelcomeMessageAsync); + + // Compute auth handler arrays once + var agenticHandlers = !string.IsNullOrEmpty(AgenticAuthHandlerName) ? [AgenticAuthHandlerName] : Array.Empty(); + var oboHandlers = !string.IsNullOrEmpty(OboAuthHandlerName) ? [OboAuthHandlerName] : Array.Empty(); + + // Handle install/uninstall + OnActivity(ActivityTypes.InstallationUpdate, OnInstallationUpdateAsync, isAgenticOnly: true, autoSignInHandlers: agenticHandlers); + OnActivity(ActivityTypes.InstallationUpdate, OnInstallationUpdateAsync, isAgenticOnly: false); + + // Handle messages — MUST BE AFTER any other message handlers + OnActivity(ActivityTypes.Message, OnMessageAsync, isAgenticOnly: true, autoSignInHandlers: agenticHandlers); + OnActivity(ActivityTypes.Message, OnMessageAsync, isAgenticOnly: false, autoSignInHandlers: oboHandlers); + } + + protected async Task WelcomeMessageAsync(ITurnContext turnContext, ITurnState turnState, CancellationToken cancellationToken) + { + await AgentMetrics.InvokeObservedAgentOperation( + "WelcomeMessage", + turnContext, + async () => + { + foreach (ChannelAccount member in turnContext.Activity.MembersAdded) + { + if (member.Id != turnContext.Activity.Recipient.Id) + { + await turnContext.SendActivityAsync(AgentWelcomeMessage); + } + } + }); + } + + protected async Task 
OnInstallationUpdateAsync(ITurnContext turnContext, ITurnState turnState, CancellationToken cancellationToken) + { + await AgentMetrics.InvokeObservedAgentOperation( + "InstallationUpdate", + turnContext, + async () => + { + _logger.LogInformation( + "InstallationUpdate received — Action: '{Action}', DisplayName: '{Name}', UserId: '{Id}'", + turnContext.Activity.Action ?? "(none)", + turnContext.Activity.From?.Name ?? "(unknown)", + turnContext.Activity.From?.Id ?? "(unknown)"); + + if (turnContext.Activity.Action == InstallationUpdateActionTypes.Add) + { + await turnContext.SendActivityAsync(MessageFactory.Text(AgentHireMessage), cancellationToken); + } + else if (turnContext.Activity.Action == InstallationUpdateActionTypes.Remove) + { + await turnContext.SendActivityAsync(MessageFactory.Text(AgentFarewellMessage), cancellationToken); + } + }); + } + + protected async Task OnMessageAsync(ITurnContext turnContext, ITurnState turnState, CancellationToken cancellationToken) + { + if (turnContext is null) + { + throw new ArgumentNullException(nameof(turnContext)); + } + + var fromAccount = turnContext.Activity.From; + _logger.LogDebug( + "Turn received from user — DisplayName: '{Name}', UserId: '{Id}', AadObjectId: '{AadObjectId}'", + fromAccount?.Name ?? "(unknown)", + fromAccount?.Id ?? "(unknown)", + fromAccount?.AadObjectId ?? "(none)"); + + // Select auth handler based on request type + string? ObservabilityAuthHandlerName; + string? ToolAuthHandlerName; + if (turnContext.IsAgenticRequest()) + { + ObservabilityAuthHandlerName = ToolAuthHandlerName = AgenticAuthHandlerName; + } + else + { + ObservabilityAuthHandlerName = ToolAuthHandlerName = OboAuthHandlerName; + } + + await A365OtelWrapper.InvokeObservedAgentOperation( + "MessageProcessor", + turnContext, + turnState, + _agentTokenCache, + UserAuthorization, + ObservabilityAuthHandlerName ?? string.Empty, + _logger, + async () => + { + // Single typing indicator. 
A background refresh loop was removed because it + // raced with the main reply path and triggered Kestrel request-body + // "Reading is already in progress" → ObjectDisposedException crashes post-response. + // Informative updates via onStatusUpdate keep the UI feedback flowing. + await turnContext.SendActivityAsync(Activity.CreateTypingActivity(), cancellationToken).ConfigureAwait(false); + + try + { + var userText = turnContext.Activity.Text?.Trim() ?? string.Empty; + var conversationId = turnContext.Activity.Conversation?.Id ?? Guid.NewGuid().ToString(); + + // Step 1: classify intent with a cheap tool-less LLM call. If the message + // doesn't need desktop control ("hi", "summarize my inbox", etc.) we skip + // W365 tool loading entirely so ATG never acquires a Cloud PC session. + var needsCua = await _orchestrator.ClassifyNeedsCuaAsync(userText, cancellationToken); + + if (!needsCua) + { + // Non-CUA fast path: load tools, run orchestrator with the computer tool + // withheld. Supports function-tool paths (mail/calendar/etc.) without + // engaging the CUA loop. + var (_, nonCuaAdditionalTools) = await GetToolsAsync(turnContext, ToolAuthHandlerName); + var directResponse = await _orchestrator.RunAsync( + conversationId, + userText, + w365Tools: [], + additionalTools: nonCuaAdditionalTools, + graphAccessToken: null, + onStatusUpdate: status => turnContext.StreamingResponse.QueueInformativeUpdateAsync(status), + onCuaStarting: null, + onFolderLinkReady: null, + includeCuaTool: false, + cancellationToken: cancellationToken); + turnContext.StreamingResponse.QueueTextChunk(directResponse); + return; + } + + // CUA path: SendActivity the "Got it" acknowledgment FIRST, before the streaming + // response begins. If we send it later (e.g. from inside onCuaStarting), Teams/ + // Emulator orders it visually AFTER the streaming activity's final text since + // the streaming activity was created earlier in the turn — the user sees the + // result before the acknowledgment. 
+ await turnContext.SendActivityAsync(MessageFactory.Text("Got it — working on it…"), cancellationToken).ConfigureAwait(false); + + var (w365Tools, additionalTools) = await GetToolsAsync(turnContext, ToolAuthHandlerName); + + if (w365Tools == null || w365Tools.Count == 0) + { + // ATG wraps tools/list failures into a synthetic "Error" tool whose Description + // carries the real reason (e.g. "no pool with an available session was found"). + // Extract it so the user sees the actionable message instead of the generic + // "Unable to connect" placeholder. + var errorMessage = ExtractW365ToolListError(additionalTools) + ?? "Unable to connect to the W365 Computer Use service. Please check your configuration."; + // Write the error into the streaming response so EndStreamAsync doesn't + // emit a confusing 'No text was streamed' alongside the real message. + turnContext.StreamingResponse.QueueTextChunk(errorMessage); + return; + } + + // Get Graph token for OneDrive screenshot upload via the agentic auth handler. + string? 
graphToken = null; + if (!string.IsNullOrEmpty(ToolAuthHandlerName)) + { + graphToken = await UserAuthorization.GetTurnTokenAsync(turnContext, ToolAuthHandlerName); + } + + // Run the CUA loop — session is managed per conversation + var response = await _orchestrator.RunAsync( + conversationId, + userText, + w365Tools, + additionalTools: additionalTools, + graphAccessToken: graphToken, + onStatusUpdate: status => turnContext.StreamingResponse.QueueInformativeUpdateAsync(status), + onCuaStarting: async (isNewSession) => + { + if (isNewSession) + { + await turnContext.StreamingResponse.QueueInformativeUpdateAsync("Starting a session to a Windows 365 Cloud PC…"); + } + }, + onFolderLinkReady: async url => await turnContext.SendActivityAsync( + MessageFactory.Text($"📸 Screenshots for this request: [View folder]({url})"), cancellationToken), + cancellationToken: cancellationToken); + + // Send the response + turnContext.StreamingResponse.QueueTextChunk(response); + } + finally + { + try { await turnContext.StreamingResponse.EndStreamAsync(cancellationToken).ConfigureAwait(false); } + catch (ObjectDisposedException) { /* stream already disposed */ } + } + }); + } + + /// + /// Get MCP tools from the A365 SDK's tooling gateway, separated into W365 (CUA) and + /// additional (function) tools by name. The SDK loads all tools registered to the + /// agent's blueprint in a single call. + /// + private async Task<(IList? W365Tools, IList? AdditionalTools)> GetToolsAsync(ITurnContext context, string? authHandlerName) + { + string? accessToken = null; + string? agentId = null; + + if (!string.IsNullOrEmpty(authHandlerName)) + { + accessToken = await UserAuthorization.GetTurnTokenAsync(context, authHandlerName); + agentId = Utility.ResolveAgentIdentity(context, accessToken); + } + + if (string.IsNullOrEmpty(accessToken) || string.IsNullOrEmpty(agentId)) + { + _logger.LogWarning("No auth token or agent identity available. 
Cannot connect to MCP."); + return (null, null); + } + + try + { + var handlerForMcp = !string.IsNullOrEmpty(authHandlerName) + ? authHandlerName + : OboAuthHandlerName ?? AgenticAuthHandlerName ?? string.Empty; + + var allTools = await _toolService.GetMcpToolsAsync(agentId, UserAuthorization, handlerForMcp, context).ConfigureAwait(false); + + return (FilterW365Tools(allTools), FilterAdditionalTools(allTools)); + } + catch (Exception ex) + { + _logger.LogError(ex, "Failed to connect to MCP servers."); + throw; + } + } + + private IList? FilterW365Tools(IList? allTools) + { + var w365Tools = allTools?.Where(t => + { + var name = (t as AIFunction)?.Name ?? t.ToString() ?? string.Empty; + return ComputerUseOrchestrator.IsW365CuaTool(name); + }).ToList(); + + if (w365Tools != null && w365Tools.Count > 0) + { + _logger.LogInformation("Found {ToolCount} W365 Computer Use tools", w365Tools.Count); + } + else + { + _logger.LogWarning("No W365 tools found among {TotalCount} MCP tools", allTools?.Count ?? 0); + } + + return w365Tools; + } + + private IList? FilterAdditionalTools(IList? allTools) + { + var additionalTools = allTools?.Where(t => + { + var name = (t as AIFunction)?.Name ?? t.ToString() ?? string.Empty; + return !ComputerUseOrchestrator.IsW365CuaTool(name); + }).ToList(); + + if (additionalTools != null && additionalTools.Count > 0) + { + _logger.LogInformation("Found {ToolCount} additional function tools: {Names}", + additionalTools.Count, + string.Join(", ", additionalTools.Select(t => (t as AIFunction)?.Name ?? "?"))); + } + + return additionalTools; + } + + /// + /// Looks for ATG's synthetic Error tool in the non-CUA tool list and extracts a + /// user-facing error reason from its description. ATG formats the description as: + /// "Tool list retrieval failed. Message='...'. ExceptionType='...'. ExceptionMessage='...'. CorrelationId=..., TimeStamp=...". + /// We prefer the ExceptionMessage field because it carries the specific reason + /// (e.g. 
"Failed to acquire a W365 session: no pool with an available session was found."). + /// Returns null if no error tool is present or the description can't be parsed. + /// + private static string? ExtractW365ToolListError(IList? additionalTools) + { + if (additionalTools == null || additionalTools.Count == 0) + { + return null; + } + + foreach (var tool in additionalTools) + { + if (tool is not AIFunction fn) continue; + if (!string.Equals(fn.Name, "Error", StringComparison.OrdinalIgnoreCase)) continue; + + var description = fn.Description ?? string.Empty; + var extracted = ExtractQuotedField(description, "ExceptionMessage=") + ?? ExtractQuotedField(description, "Message=") + ?? (string.IsNullOrWhiteSpace(description) ? null : description); + return extracted; + } + + return null; + } + + private static string? ExtractQuotedField(string source, string fieldPrefix) + { + var startMarker = fieldPrefix + "'"; + var start = source.IndexOf(startMarker, StringComparison.Ordinal); + if (start < 0) return null; + start += startMarker.Length; + var end = source.IndexOf("'.", start, StringComparison.Ordinal); + if (end < 0) end = source.IndexOf('\'', start); + if (end < start) return null; + var value = source.Substring(start, end - start); + return string.IsNullOrWhiteSpace(value) ? null : value; + } +} diff --git a/dotnet/w365-computer-use/sample-agent/AspNetExtensions.cs b/dotnet/w365-computer-use/sample-agent/AspNetExtensions.cs new file mode 100644 index 00000000..67917658 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/AspNetExtensions.cs @@ -0,0 +1,180 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +using Microsoft.Agents.Authentication; +using Microsoft.Agents.Core; +using Microsoft.AspNetCore.Authentication.JwtBearer; +using Microsoft.IdentityModel.Protocols; +using Microsoft.IdentityModel.Protocols.OpenIdConnect; +using Microsoft.IdentityModel.Tokens; +using Microsoft.IdentityModel.Validators; +using System.Collections.Concurrent; +using System.Globalization; +using System.IdentityModel.Tokens.Jwt; + +namespace W365ComputerUseSample; + +public static class AspNetExtensions +{ + private static readonly ConcurrentDictionary> _openIdMetadataCache = new(); + + public static void AddAgentAspNetAuthentication(this IServiceCollection services, IConfiguration configuration, string tokenValidationSectionName = "TokenValidation") + { + IConfigurationSection tokenValidationSection = configuration.GetSection(tokenValidationSectionName); + + if (!tokenValidationSection.Exists() || !tokenValidationSection.GetValue("Enabled", true)) + { + System.Diagnostics.Trace.WriteLine("AddAgentAspNetAuthentication: Auth disabled"); + return; + } + + services.AddAgentAspNetAuthentication(tokenValidationSection.Get()!); + } + + public static void AddAgentAspNetAuthentication(this IServiceCollection services, TokenValidationOptions validationOptions) + { + AssertionHelpers.ThrowIfNull(validationOptions, nameof(validationOptions)); + + if (validationOptions.Audiences == null || validationOptions.Audiences.Count == 0) + { + throw new ArgumentException($"{nameof(TokenValidationOptions)}:Audiences requires at least one ClientId"); + } + + foreach (var audience in validationOptions.Audiences) + { + if (!Guid.TryParse(audience, out _)) + { + throw new ArgumentException($"{nameof(TokenValidationOptions)}:Audiences values must be a GUID"); + } + } + + if (validationOptions.ValidIssuers == null || validationOptions.ValidIssuers.Count == 0) + { + validationOptions.ValidIssuers = + [ + "https://api.botframework.com", + "https://sts.windows.net/d6d49420-f39b-4df7-a1dc-d59a935871db/", + 
"https://login.microsoftonline.com/d6d49420-f39b-4df7-a1dc-d59a935871db/v2.0", + "https://sts.windows.net/f8cdef31-a31e-4b4a-93e4-5f571e91255a/", + "https://login.microsoftonline.com/f8cdef31-a31e-4b4a-93e4-5f571e91255a/v2.0", + "https://sts.windows.net/69e9b82d-4842-4902-8d1e-abc5b98a55e8/", + "https://login.microsoftonline.com/69e9b82d-4842-4902-8d1e-abc5b98a55e8/v2.0", + ]; + + if (!string.IsNullOrEmpty(validationOptions.TenantId) && Guid.TryParse(validationOptions.TenantId, out _)) + { + validationOptions.ValidIssuers.Add(string.Format(CultureInfo.InvariantCulture, AuthenticationConstants.ValidTokenIssuerUrlTemplateV1, validationOptions.TenantId)); + validationOptions.ValidIssuers.Add(string.Format(CultureInfo.InvariantCulture, AuthenticationConstants.ValidTokenIssuerUrlTemplateV2, validationOptions.TenantId)); + } + } + + if (string.IsNullOrEmpty(validationOptions.AzureBotServiceOpenIdMetadataUrl)) + { + validationOptions.AzureBotServiceOpenIdMetadataUrl = validationOptions.IsGov ? AuthenticationConstants.GovAzureBotServiceOpenIdMetadataUrl : AuthenticationConstants.PublicAzureBotServiceOpenIdMetadataUrl; + } + + if (string.IsNullOrEmpty(validationOptions.OpenIdMetadataUrl)) + { + validationOptions.OpenIdMetadataUrl = validationOptions.IsGov ? AuthenticationConstants.GovOpenIdMetadataUrl : AuthenticationConstants.PublicOpenIdMetadataUrl; + } + + var openIdMetadataRefresh = validationOptions.OpenIdMetadataRefresh ?? 
BaseConfigurationManager.DefaultAutomaticRefreshInterval; + + _ = services.AddAuthentication(options => + { + options.DefaultAuthenticateScheme = JwtBearerDefaults.AuthenticationScheme; + options.DefaultChallengeScheme = JwtBearerDefaults.AuthenticationScheme; + }) + .AddJwtBearer(options => + { + options.SaveToken = true; + options.TokenValidationParameters = new TokenValidationParameters + { + ValidateIssuer = true, + ValidateAudience = true, + ValidateLifetime = true, + ClockSkew = TimeSpan.FromMinutes(5), + ValidIssuers = validationOptions.ValidIssuers, + ValidAudiences = validationOptions.Audiences, + ValidateIssuerSigningKey = true, + RequireSignedTokens = true, + }; + + options.TokenValidationParameters.EnableAadSigningKeyIssuerValidation(); + + options.Events = new JwtBearerEvents + { + OnMessageReceived = async context => + { + string authorizationHeader = context.Request.Headers.Authorization.ToString(); + + if (string.IsNullOrEmpty(authorizationHeader)) + { + context.Options.TokenValidationParameters.ConfigurationManager ??= options.ConfigurationManager as BaseConfigurationManager; + await Task.CompletedTask.ConfigureAwait(false); + return; + } + + string[] parts = authorizationHeader.Split(' '); + if (parts.Length != 2 || !string.Equals(parts[0], "Bearer", StringComparison.OrdinalIgnoreCase)) + { + context.Options.TokenValidationParameters.ConfigurationManager ??= options.ConfigurationManager as BaseConfigurationManager; + await Task.CompletedTask.ConfigureAwait(false); + return; + } + + string issuer; + try + { + JwtSecurityToken token = new(parts[1]); + issuer = token.Claims.FirstOrDefault(claim => claim.Type == AuthenticationConstants.IssuerClaim)?.Value!; + } + catch + { + // Malformed/opaque token — let normal JWT validation reject it with 401/403. 
+ context.Options.TokenValidationParameters.ConfigurationManager ??= options.ConfigurationManager as BaseConfigurationManager; + return; + } + + if (validationOptions.AzureBotServiceTokenHandling && AuthenticationConstants.BotFrameworkTokenIssuer.Equals(issuer)) + { + context.Options.TokenValidationParameters.ConfigurationManager = _openIdMetadataCache.GetOrAdd(validationOptions.AzureBotServiceOpenIdMetadataUrl, key => + { + return new ConfigurationManager(validationOptions.AzureBotServiceOpenIdMetadataUrl, new OpenIdConnectConfigurationRetriever()) + { + AutomaticRefreshInterval = openIdMetadataRefresh + }; + }); + } + else + { + context.Options.TokenValidationParameters.ConfigurationManager = _openIdMetadataCache.GetOrAdd(validationOptions.OpenIdMetadataUrl, key => + { + return new ConfigurationManager(validationOptions.OpenIdMetadataUrl, new OpenIdConnectConfigurationRetriever()) + { + AutomaticRefreshInterval = openIdMetadataRefresh + }; + }); + } + + await Task.CompletedTask.ConfigureAwait(false); + }, + OnTokenValidated = context => Task.CompletedTask, + OnForbidden = context => Task.CompletedTask, + OnAuthenticationFailed = context => Task.CompletedTask + }; + }); + } + + public class TokenValidationOptions + { + public IList? Audiences { get; set; } + public string? TenantId { get; set; } + public IList? ValidIssuers { get; set; } + public bool IsGov { get; set; } = false; + public string? AzureBotServiceOpenIdMetadataUrl { get; set; } + public string? OpenIdMetadataUrl { get; set; } + public bool AzureBotServiceTokenHandling { get; set; } = true; + public TimeSpan? 
OpenIdMetadataRefresh { get; set; } + } +} diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/AzureOpenAIModelProvider.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/AzureOpenAIModelProvider.cs new file mode 100644 index 00000000..5ca126dd --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/AzureOpenAIModelProvider.cs @@ -0,0 +1,63 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Text; + +namespace W365ComputerUseSample.ComputerUse; + +/// +/// Sends CUA model requests to Azure OpenAI using an API key. +/// +public class AzureOpenAIModelProvider : ICuaModelProvider +{ + private readonly HttpClient _httpClient; + private readonly string _url; + private readonly string _apiKey; + private readonly ILogger _logger; + + public string ModelName { get; } + + public AzureOpenAIModelProvider(IHttpClientFactory httpClientFactory, IConfiguration configuration, ILogger logger) + { + _httpClient = httpClientFactory.CreateClient("WebClient"); + _logger = logger; + var endpoint = configuration["AIServices:AzureOpenAI:Endpoint"] + ?? throw new InvalidOperationException("AIServices:AzureOpenAI:Endpoint is required."); + _apiKey = configuration["AIServices:AzureOpenAI:ApiKey"] + ?? throw new InvalidOperationException("AIServices:AzureOpenAI:ApiKey is required."); + var apiVersion = configuration["AIServices:AzureOpenAI:ApiVersion"] ?? "2025-04-01-preview"; + + // DeploymentName = deployment-based URL; ModelName = model-based URL (model sent in body) + var deploymentName = configuration["AIServices:AzureOpenAI:DeploymentName"]; + ModelName = configuration["AIServices:AzureOpenAI:ModelName"] + ?? deploymentName + ?? 
"computer-use-preview"; + + if (!string.IsNullOrEmpty(deploymentName)) + { + _url = $"{endpoint.TrimEnd('/')}/openai/deployments/{deploymentName}/responses?api-version={apiVersion}"; + } + else + { + // Model-based endpoint — model name goes in the request body, not the URL + _url = $"{endpoint.TrimEnd('/')}/openai/responses?api-version={apiVersion}"; + } + } + + public async Task SendAsync(string requestBody, CancellationToken cancellationToken) + { + _logger.LogInformation("Azure OpenAI request URL: {Url}", _url); + using var req = new HttpRequestMessage(HttpMethod.Post, _url); + req.Headers.Add("api-key", _apiKey); + req.Content = new StringContent(requestBody, Encoding.UTF8, "application/json"); + + var resp = await _httpClient.SendAsync(req, cancellationToken); + if (!resp.IsSuccessStatusCode) + { + var err = await resp.Content.ReadAsStringAsync(cancellationToken); + throw new HttpRequestException($"Azure OpenAI returned {resp.StatusCode}: {err}"); + } + + return await resp.Content.ReadAsStringAsync(cancellationToken); + } +} diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs new file mode 100644 index 00000000..32b265e6 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/ComputerUseOrchestrator.cs @@ -0,0 +1,1135 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Collections.Concurrent; +using System.Net.Http; +using System.Net.Http.Headers; +using System.Text.Json; +using Microsoft.Extensions.AI; +using W365ComputerUseSample.ComputerUse.Models; + +namespace W365ComputerUseSample.ComputerUse; + +/// +/// Thin protocol adapter between OpenAI's computer-use-preview model and W365 MCP tools. +/// The model emits computer_call actions; this class translates them to MCP tool calls +/// and feeds back screenshots. Supports multiple concurrent sessions keyed by conversation ID. 
+/// +public class ComputerUseOrchestrator +{ + /// + /// Names of the CUA tools exposed by the W365 remote MCP server (as returned by its + /// tools/list). Used to identify which tools came from the W365 server vs other MCP servers + /// (mail, calendar, etc.) without per-server tracking. Includes ATG's local EndSession + /// mcptool. Update when W365 adds/renames tools. + /// + internal static readonly HashSet W365CuaToolNames = new(StringComparer.OrdinalIgnoreCase) + { + // Desktop interaction + "take_screenshot", "click", "type_text", "press_keys", "scroll", "move_mouse", + "drag_mouse", "wait_milliseconds", "get_cursor_position", "get_screen_size", + // Window management + "list_windows", "activate_window", "close_window", + // Accessibility / OCR + "get_accessibility_tree", "find_ui_element", "analyze_screen", + // Browser + "browser_navigate", "browser_click", "browser_type", "browser_get_html", + "browser_get_text", "browser_get_url", "browser_get_title", "browser_query_text", + "browser_list_tabs", "browser_switch_tab", "browser_close_tab", "browser_new_tab", + "browser_back", "browser_forward", "browser_reload", "browser_wait_for", + "browser_eval_js", "browser_screenshot", "focus_browser", + // Code / shell execution + "execute_python_code", "execute_shell_command", + // ATG-local tool + "mcp_W365ComputerUse_EndSession", + }; + + /// Returns true when identifies a W365 CUA tool. + internal static bool IsW365CuaTool(string? toolName) + => !string.IsNullOrEmpty(toolName) && W365CuaToolNames.Contains(toolName); + + private readonly ICuaModelProvider _modelProvider; + private readonly IHttpClientFactory _httpClientFactory; + private readonly HttpClient _httpClient; + private readonly ILogger _logger; + private readonly int _maxIterations; + private readonly string? _screenshotPath; + private readonly string? _oneDriveFolder; + private readonly string? 
_oneDriveUserId; + private readonly string _toolType; + private readonly List _tools; + + /// + /// Per-conversation session state. Each conversation (user chat) gets its own + /// W365 session, conversation history, and screenshot counter. + /// + private readonly ConcurrentDictionary _sessions = new(); + + private const string SystemInstructions = """ + You are a helpful assistant that can also control a Windows desktop computer. + If the user's message is conversational or doesn't require computer use, respond with a helpful text message. + + ## Function tools (email, calendar, etc.) + You may have access to function tools for tasks like sending email, managing calendar, etc. + Prefer function tools over computer use when a matching one is available — they are faster and more reliable. + After calling a function tool, respond with a text message describing what you did and the result. + Do NOT call OnTaskComplete after using function tools — just respond with text. + + ## When no tool can accomplish the request + If the user asks for something and no function tool matches AND computer use cannot accomplish it either, + respond with a text message explaining clearly that you are unable to perform that task and why + (e.g. "I don't have an email tool available in this environment"). + Do NOT call OnTaskComplete in this case — only call OnTaskComplete when you have actually completed a computer-use task. + + ## Computer use (desktop control) + Only use computer actions when no function tool can accomplish the task. + When a task requires computer use, perform the actions and examine screenshots to verify they worked. + If you see browser setup or sign-in dialogs, dismiss them (Escape, X, or Skip). + Once you have completed a computer use task, call the OnTaskComplete function. + Do NOT continue looping after the task is done. + If the user sends a casual greeting or question that does not require computer use, reply with a helpful text message. 
+ + ## Ending the Cloud PC session + Call the EndSession function ONLY when the user explicitly asks to end, close, + disconnect, release, or quit the session, or otherwise says they are done with + all work on the Cloud PC. Trigger phrases include: "end session", "close session", + "disconnect", "release the VM", "I'm done", "quit", "shut it down", "log off". + + Do NOT call EndSession in any of these situations: + - The user is starting a new task (e.g. "go to bbc.com", "open Word", "navigate to ..."). + - The user is switching topics or apps within the same session. + - You just completed a previous task — call OnTaskComplete instead, which keeps the session open for the next request. + - The user sends a casual greeting, question, or anything that's not an explicit request to end the session. + + Switching tasks inside one session is normal and expected. The session should + remain open across many user requests until the user explicitly asks to end it. + """; + + public ComputerUseOrchestrator( + ICuaModelProvider modelProvider, + IHttpClientFactory httpClientFactory, + IConfiguration configuration, + ILogger logger) + { + _modelProvider = modelProvider; + _httpClientFactory = httpClientFactory; + _httpClient = httpClientFactory.CreateClient("WebClient"); + _logger = logger; + _maxIterations = configuration.GetValue("ComputerUse:MaxIterations", 30); + _screenshotPath = configuration["Screenshots:LocalPath"]; + _oneDriveFolder = configuration["Screenshots:OneDriveFolder"]; + _oneDriveUserId = configuration["Screenshots:OneDriveUserId"]; + + _toolType = configuration["ComputerUse:ToolType"] ?? ""; + if (string.IsNullOrEmpty(_toolType)) + { + // Auto-derive from model name: gpt-* models use "computer", others use "computer_use_preview" + var modelName = _modelProvider.ModelName; + _toolType = modelName.StartsWith("gpt-", StringComparison.OrdinalIgnoreCase) ? 
"computer" : "computer_use_preview"; + } + var displayWidth = configuration.GetValue("ComputerUse:DisplayWidth", 1024); + var displayHeight = configuration.GetValue("ComputerUse:DisplayHeight", 768); + + // Build the computer tool definition based on the tool type: + // "computer_use_preview" — computer-use-preview model: display_width, display_height, environment + // "computer" — GPT-5.4+ models (Azure OpenAI): bare type, no params + object computerTool = _toolType switch + { + "computer" => new ComputerToolV2(), + _ => new ComputerUseTool { DisplayWidth = displayWidth, DisplayHeight = displayHeight } + }; + + _logger.LogInformation("CUA tool type: {ToolType}, display: {Width}x{Height}", _toolType, displayWidth, displayHeight); + + _tools = + [ + computerTool, + new FunctionToolDefinition + { + Name = "OnTaskComplete", + Description = "Call this function when the given task has been completed successfully." + }, + new FunctionToolDefinition + { + Name = "EndSession", + Description = "Call this function when the user wants to end, quit, disconnect, or release their computer session." + } + ]; + } + + /// + /// Lightweight intent classifier: decides whether a user message needs computer-use (CUA). + /// Runs a single tool-less model call and parses a strict YES/NO answer. On any parse error + /// or exception, returns true so we fall back to the full CUA loop — safer to pay + /// the W365 session cost than miss a legitimate computer-use request. + /// + public async Task ClassifyNeedsCuaAsync(string userMessage, CancellationToken cancellationToken = default) + { + const string ClassifierInstructions = """ + You are a router. Decide whether the user's message requires controlling or managing a + Windows desktop: clicking, typing into apps, taking screenshots, opening programs, + interacting with the GUI, OR ending/releasing a Cloud PC session. 
+ Answer with a single word: + YES — if it needs desktop control or session management + NO — if it is chit-chat, a question answerable from knowledge, or a request that can + be fulfilled with mail/calendar/Teams/other function tools only + When uncertain, prefer YES. + """; + + try + { + var body = JsonSerializer.Serialize(new ComputerUseRequest + { + Model = _modelProvider.ModelName, + Instructions = ClassifierInstructions, + Input = [CreateUserMessage(userMessage)], + Tools = [], + Truncation = "auto" + }, new JsonSerializerOptions { DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull }); + + var responseJson = await _modelProvider.SendAsync(body, cancellationToken); + var response = JsonSerializer.Deserialize(responseJson); + if (response?.Output == null) + { + return true; + } + + foreach (var item in response.Output) + { + if (item.TryGetProperty("type", out var tProp) && tProp.GetString() == "message") + { + var replyText = ExtractText(item).Trim(); + _logger.LogInformation("CUA intent classifier reply for message {Preview}: {Reply}", Truncate(userMessage, 80), Truncate(replyText, 60)); + // Match on the first non-empty token. The router is instructed to emit a single + // word but may prepend/append fluff; trim to the leading YES/NO. + var upper = replyText.ToUpperInvariant(); + if (upper.StartsWith("NO")) return false; + if (upper.StartsWith("YES")) return true; + // Unexpected shape — default to CUA so we don't silently drop a legitimate request. + return true; + } + } + + return true; + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + throw; + } + catch (Exception ex) when (ex is JsonException || ex is HttpRequestException || ex is TaskCanceledException) + { + _logger.LogWarning(ex, "CUA intent classifier threw — defaulting to needsCua=true."); + return true; + } + } + + /// + /// Run the CUA loop for a specific conversation. 
When includeCuaTool is
+    /// false, the computer tool is withheld from the model's tool list so it
+    /// cannot emit computer_call actions — used on the non-CUA fast path where the
+    /// router decided the message doesn't need desktop control, so no W365 session is acquired.
+    ///
+    /// <param name="conversationId">Key used to look up (or create) the per-conversation session state.</param>
+    /// <param name="userMessage">User text appended to the conversation history for this turn.</param>
+    /// <param name="w365Tools">Tools used for desktop actions, screenshots and EndSession.</param>
+    /// <param name="additionalTools">Optional extra function tools offered to the model; a tool named "Error" is filtered out.</param>
+    /// <param name="graphAccessToken">Passed through to the OneDrive screenshot upload.</param>
+    /// <param name="onStatusUpdate">Optional progress callback.</param>
+    /// <param name="onCuaStarting">Optional callback fired once per turn before the first computer_call; the argument is true when the session was not yet marked started.</param>
+    /// <param name="onFolderLinkReady">Optional callback invoked when a screenshot upload returns a folder link.</param>
+    /// <param name="includeCuaTool">When false, only <paramref name="additionalTools"/> are offered to the model.</param>
+    /// <param name="cancellationToken">Checked at the top of every loop iteration and flowed to all awaited calls.</param>
+    /// <returns>The model's text reply, a fixed completion / end-of-session message, a tool error message, or the step-limit message.</returns>
+    public async Task<string> RunAsync(
+        string conversationId,
+        string userMessage,
+        IList<AITool> w365Tools,
+        IList<AITool>? additionalTools = null,
+        string? graphAccessToken = null,
+        Func<string, Task>? onStatusUpdate = null,
+        Func<bool, Task>? onCuaStarting = null,
+        Func<string, Task>? onFolderLinkReady = null,
+        bool includeCuaTool = true,
+        CancellationToken cancellationToken = default)
+    {
+        // NOTE(review): the generic type arguments in this signature were reconstructed from
+        // how the values are used below (this copy of the patch lost them) — confirm.
+        _logger.LogInformation("Processing message for conversation {ConversationId}: {Message}", conversationId, Truncate(userMessage, 100));
+
+        var session = _sessions.GetOrAdd(conversationId, _ => new ConversationSession());
+
+        if (session.SessionStarted)
+        {
+            _logger.LogInformation("Reusing session for conversation {ConversationId}, W365SessionId={SessionId}", conversationId, session.W365SessionId);
+        }
+
+        // Two-level screenshot folder layout: {yyyy-MM-dd}/{HHmmss}_{prompt-slug}.
+        // Set it here on every CUA-bound turn so each user prompt that triggers the CUA loop
+        // gets its own subfolder. Reset FolderShared so the new folder gets a fresh share
+        // link surfaced via onFolderLinkReady. Non-CUA turns keep the existing folder (or
+        // null) — they don't take screenshots, so the value is irrelevant.
+        if (includeCuaTool)
+        {
+            var promptSlug = SanitizeForPath(userMessage, maxLen: 30);
+            // Read the clock once so the date and time parts cannot straddle midnight.
+            var turnStartUtc = DateTime.UtcNow;
+            session.ScreenshotSubfolder = $"{turnStartUtc:yyyy-MM-dd}/{turnStartUtc:HHmmss}_{promptSlug}";
+            session.FolderShared = false;
+            _logger.LogInformation("CUA turn folder for conversation {ConversationId}: {Folder}", conversationId, session.ScreenshotSubfolder);
+        }
+
+        // For "computer" tool type (gpt-5.4+), include a screenshot with the FIRST user message if session already active
+        if (_toolType == "computer" && session.ConversationHistory.Count == 0 && session.SessionStarted)
+        {
+            var initialScreenshot = await CaptureScreenshotAsync(w365Tools, session.W365SessionId, cancellationToken);
+            var convPrefix = conversationId.Length > 8 ? conversationId[..8] : conversationId;
+            var initialName = $"{convPrefix}_{++session.ScreenshotCounter:D3}_initial";
+            SaveScreenshotToDisk(initialScreenshot!, initialName, session.ScreenshotSubfolder);
+            var folderUrlReuse = await UploadScreenshotToOneDriveAsync(initialScreenshot!, $"{initialName}.png", graphAccessToken, session.ScreenshotSubfolder, session, cancellationToken);
+            if (folderUrlReuse != null && onFolderLinkReady != null)
+            {
+                await onFolderLinkReady(folderUrlReuse);
+            }
+
+            session.ConversationHistory.Add(ToJsonElement(new
+            {
+                type = "message",
+                role = "user",
+                content = new object[]
+                {
+                    new { type = "input_text", text = userMessage },
+                    new { type = "input_image", image_url = $"data:image/png;base64,{initialScreenshot}" }
+                }
+            }));
+        }
+        else
+        {
+            session.ConversationHistory.Add(CreateUserMessage(userMessage));
+        }
+
+        // Build the model's tools list — computer + OnTaskComplete + any additional function tools.
+        // When includeCuaTool is false (non-CUA fast path), skip all CUA-specific tools entirely
+        // (computer, OnTaskComplete, EndSession). Those require an active W365 session, and the
+        // classifier has already decided we don't need one. Only the caller-provided additional
+        // tools (mail/calendar/etc.) remain visible to the model.
+        var modelTools = includeCuaTool
+            ? new List<object>(_tools)
+            : new List<object>();
+        if (additionalTools?.Count > 0)
+        {
+            // ATG injects a synthetic "Error" sentinel tool when any MCP server's tools/list
+            // fails (e.g. W365 session acquisition error). Its parameters schema is `{}` with
+            // no properties — Azure OpenAI rejects that with `invalid_function_parameters`.
+            // MyAgent reads the Error description for user-facing messaging before getting
+            // here, so it's safe to drop from the LLM call.
+            var llmTools = additionalTools.OfType<AIFunction>()
+                .Where(t => !string.Equals(t.Name, "Error", StringComparison.OrdinalIgnoreCase))
+                .ToList();
+
+            foreach (var tool in llmTools)
+            {
+                modelTools.Add(new FunctionToolDefinition
+                {
+                    Name = tool.Name,
+                    Description = tool.Description ?? string.Empty,
+                    Parameters = tool.JsonSchema
+                });
+            }
+
+            _logger.LogInformation("Added {Count} additional function tools to model", llmTools.Count);
+            foreach (var tool in llmTools)
+            {
+                var schemaStr = tool.JsonSchema.GetRawText();
+                _logger.LogInformation("Function tool: {Name}, Description: {Desc}, Schema: {Schema}",
+                    tool.Name, Truncate(tool.Description ?? "", 80), Truncate(schemaStr, 200));
+            }
+        }
+
+        var cuaAcknowledged = false;
+        for (var i = 0; i < _maxIterations; i++)
+        {
+            cancellationToken.ThrowIfCancellationRequested();
+
+            // When running without the computer tool, send a filtered copy of the history.
+            // session.ConversationHistory itself is left intact so a later CUA turn still sees
+            // the full record. Recomputed every iteration because the history grows in the loop.
+            var conversation = session.ConversationHistory;
+            if (!includeCuaTool)
+            {
+                conversation = StripCuaOnlyHistoryItems(session.ConversationHistory);
+            }
+
+            var response = await CallModelAsync(conversation, modelTools, cancellationToken);
+            if (response?.Output == null || response.Output.Count == 0)
+            {
+                break;
+            }
+
+            var hasActions = false;
+
+            foreach (var item in response.Output)
+            {
+                var type = item.GetProperty("type").GetString();
+                if (type == "reasoning")
+                {
+                    continue;
+                }
+
+                session.ConversationHistory.Add(item);
+
+                switch (type)
+                {
+                    case "message":
+                        return ExtractText(item);
+
+                    case "computer_call":
+                        hasActions = true;
+                        // Lazy session start: only start when CUA is actually needed
+                        if (!cuaAcknowledged)
+                        {
+                            if (!session.SessionStarted)
+                            {
+                                _logger.LogInformation("CUA needed for conversation {ConversationId} — starting session", conversationId);
+                                if (onCuaStarting != null)
+                                {
+                                    await onCuaStarting(true);
+                                }
+
+                                if (onStatusUpdate != null)
+                                {
+                                    await onStatusUpdate("Starting W365 computing session...");
+                                }
+
+                                session.SessionStarted = true;
+                                _logger.LogInformation("Session marked started for conversation {ConversationId}", conversationId);
+                            }
+                            else if (onCuaStarting != null)
+                            {
+                                await onCuaStarting(false);
+                            }
+
+                            cuaAcknowledged = true;
+                        }
+
+                        _logger.LogInformation("CUA iteration {Iteration}: {Action}", i + 1, Truncate(item.GetRawText(), 200));
+                        try
+                        {
+                            session.ConversationHistory.Add(await HandleComputerCallAsync(item, w365Tools, session, graphAccessToken, onStatusUpdate, onFolderLinkReady, cancellationToken));
+                        }
+                        catch (Exception toolEx) when (toolEx is InvalidOperationException or NotSupportedException or KeyNotFoundException or FormatException)
+                        {
+                            // Besides tool errors (InvalidOperationException), translating the action can
+                            // fail with NotSupportedException (unknown action type), KeyNotFoundException
+                            // (missing JSON property) or FormatException (non-Int32 number). All of them
+                            // must take this path, otherwise the unpaired computer_call stays in history.
+                            _logger.LogError(toolEx, "Tool call in CUA iteration failed for conversation {ConversationId}", conversationId);
+                            // Pop the unpaired computer_call we added above so the next turn's conversation
+                            // history isn't malformed (Azure OpenAI 400s on "No tool output found for …").
+                            if (session.ConversationHistory.Count > 0)
+                            {
+                                session.ConversationHistory.RemoveAt(session.ConversationHistory.Count - 1);
+                            }
+
+                            return toolEx.Message;
+                        }
+
+                        break;
+
+                    case "function_call":
+                        hasActions = true;
+                        var funcName = item.GetProperty("name").GetString();
+                        _logger.LogInformation("CUA iteration {Iteration}: function_call {Name}", i + 1, funcName);
+                        if (funcName == "OnTaskComplete")
+                        {
+                            session.ConversationHistory.Add(CreateFunctionOutput(item.GetProperty("call_id").GetString()!));
+                            return "Task completed successfully.";
+                        }
+
+                        if (funcName == "EndSession")
+                        {
+                            session.ConversationHistory.Add(CreateFunctionOutput(item.GetProperty("call_id").GetString()!));
+                            _logger.LogInformation("EndSession requested by model for conversation {ConversationId}", conversationId);
+                            if (onStatusUpdate != null)
+                            {
+                                await onStatusUpdate("Ending session...");
+                            }
+
+                            // Always delegate to ATG regardless of session.SessionStarted: in V2 the
+                            // session can be acquired by ATG's hostname-discovery when the sample agent
+                            // calls tools/list at startup — before any computer_call flips SessionStarted.
+                            // Gating on SessionStarted would leak the pool slot. ATG's handler is
+                            // idempotent and returns "No active W365 session found." when there's nothing
+                            // to release, so this is safe on fresh conversations too.
+                            await EndSessionAsync(w365Tools, _logger, session.W365SessionId, cancellationToken);
+                            session.SessionStarted = false;
+                            session.W365SessionId = null;
+                            session.ScreenshotSubfolder = null;
+                            _sessions.TryRemove(conversationId, out _);
+                            return "Session ended. The VM has been released back to the pool.";
+                        }
+
+                        // Invoke additional MCP function tool. The function_call item is already in
+                        // the history, so it must always be paired with a function_call_output —
+                        // even when no additional tools were supplied — or the history is left
+                        // with a call that has no output.
+                        var callOutput = additionalTools != null
+                            ? await InvokeFunctionCallAsync(item, additionalTools, cancellationToken)
+                            : CreateFunctionOutput(item.GetProperty("call_id").GetString()!, $"Error: function '{funcName}' is not available.");
+                        session.ConversationHistory.Add(callOutput);
+
+                        break;
+                }
+            }
+
+            if (!hasActions)
+            {
+                break;
+            }
+        }
+
+        return "The task could not be completed within the allowed number of steps.";
+    }
+
+    /// <summary>
+    /// Returns a copy of <paramref name="history"/> without CUA-only items, for model calls
+    /// that do not declare the CUA tools. Azure OpenAI 400s when an item references a tool
+    /// that isn't declared in this turn:
+    /// computer_call / computer_call_output need the computer tool, and function_call /
+    /// function_call_output for OnTaskComplete or EndSession need those function tools.
+    /// </summary>
+    private static List<JsonElement> StripCuaOnlyHistoryItems(IEnumerable<JsonElement> history)
+    {
+        var items = history as IReadOnlyCollection<JsonElement> ?? history.ToList();
+
+        // Pass 1: collect call_ids of CUA-only function_calls so their paired
+        // function_call_outputs (which carry only call_id, not the name) can be dropped too.
+        var cuaOnlyCallIds = new HashSet<string>(StringComparer.Ordinal);
+        foreach (var item in items)
+        {
+            if (!item.TryGetProperty("type", out var typeProp) || typeProp.GetString() != "function_call")
+            {
+                continue;
+            }
+
+            if (!item.TryGetProperty("name", out var nameProp))
+            {
+                continue;
+            }
+
+            var name = nameProp.GetString();
+            if (name != "OnTaskComplete" && name != "EndSession")
+            {
+                continue;
+            }
+
+            if (item.TryGetProperty("call_id", out var idProp))
+            {
+                var id = idProp.GetString();
+                if (!string.IsNullOrEmpty(id))
+                {
+                    cuaOnlyCallIds.Add(id);
+                }
+            }
+        }
+
+        // Pass 2: keep everything except computer_call(_output) and the function items found above.
+        return items
+            .Where(item =>
+            {
+                if (!item.TryGetProperty("type", out var typeProp))
+                {
+                    return true;
+                }
+
+                var type = typeProp.GetString();
+                if (type == "computer_call" || type == "computer_call_output")
+                {
+                    return false;
+                }
+
+                var isFunctionItem = type == "function_call" || type == "function_call_output";
+                return !(isFunctionItem
+                    && item.TryGetProperty("call_id", out var idProp)
+                    && cuaOnlyCallIds.Contains(idProp.GetString() ?? string.Empty));
+            })
+            .ToList();
+    }
+
+    /// <summary>
+    /// End the W365 session. Called by the agent on shutdown or explicit end.
+    /// Best-effort: disposed clients, HTTP failures, invalid-operation errors and
+    /// task cancellations are logged and swallowed.
+    /// </summary>
+    /// <param name="tools">Tool list that must contain mcp_W365ComputerUse_EndSession.</param>
+    /// <param name="logger">Logger for the outcome.</param>
+    /// <param name="sessionId">Not read by this method; the tool is invoked with no arguments.</param>
+    /// <param name="ct">Flowed to the tool invocation.</param>
+    public static async Task EndSessionAsync(IList<AITool> tools, ILogger logger, string? sessionId, CancellationToken ct)
+    {
+        try
+        {
+            // The ATG-local mcptool resolves the active W365 session via the v2: context key
+            // — no sessionId arg needed (session routing is transparent).
+            var args = new Dictionary<string, object?>();
+            await InvokeToolAsync(tools, "mcp_W365ComputerUse_EndSession", args, ct);
+            logger.LogInformation("W365 session ended");
+        }
+        catch (ObjectDisposedException)
+        {
+            logger.LogInformation("MCP client already disposed — W365 session will be released by server timeout");
+        }
+        catch (HttpRequestException httpEx) when (httpEx.StatusCode == System.Net.HttpStatusCode.NotFound)
+        {
+            logger.LogInformation("MCP transport session expired (404) — W365 session will be released by server timeout");
+        }
+        catch (Exception ex) when (ex is HttpRequestException or InvalidOperationException or TaskCanceledException)
+        {
+            logger.LogWarning(ex, "Failed to end W365 session");
+        }
+    }
+
+    private async Task CallModelAsync(List conversation, List tools, CancellationToken ct)
+    {
+        var body = JsonSerializer.Serialize(new ComputerUseRequest
+        {
+            Model = _modelProvider.ModelName,
+            Instructions = SystemInstructions,
+            Input = conversation,
+            Tools = tools,
+            Truncation = "auto"
+        }, new JsonSerializerOptions { DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull });
+
+        _logger.LogDebug("Model request (first 2000 chars): {Body}",
body[..Math.Min(2000, body.Length)]);
+
+        var responseJson = await _modelProvider.SendAsync(body, ct);
+        _logger.LogDebug("Model response (first 2000 chars): {Response}", responseJson[..Math.Min(2000, responseJson.Length)]);
+        return JsonSerializer.Deserialize(responseJson);
+    }
+
+    /// <summary>
+    /// Translate a computer_call into an MCP tool call, capture screenshot, return computer_call_output.
+    /// </summary>
+    /// <param name="call">The model's computer_call item; must carry call_id and either "actions" or "action".</param>
+    /// <param name="tools">Tools used to execute the action(s) and take the screenshot.</param>
+    /// <param name="session">Supplies the session id, screenshot counter and screenshot subfolder.</param>
+    /// <param name="graphAccessToken">Passed through to the OneDrive upload.</param>
+    /// <param name="onStatus">Optional progress callback ("Performing: …", "Session lost — recovering...").</param>
+    /// <param name="onFolderLinkReady">Optional callback invoked when the upload returns a folder link.</param>
+    /// <param name="ct">Flowed to all awaited calls.</param>
+    /// <returns>A computer_call_output element holding the post-action screenshot as a data URL.</returns>
+    /// <exception cref="InvalidOperationException">A tool reported an error result.</exception>
+    private async Task<JsonElement> HandleComputerCallAsync(
+        JsonElement call, IList<AITool> tools, ConversationSession session, string? graphAccessToken,
+        Func<string, Task>? onStatus, Func<string, Task>? onFolderLinkReady, CancellationToken ct)
+    {
+        var callId = call.GetProperty("call_id").GetString()!;
+        var sessionId = session.W365SessionId;
+
+        // GPT-5.4 uses "actions" (non-empty array), older models use "action" (singular).
+        if (call.TryGetProperty("actions", out var actionsArray)
+            && actionsArray.ValueKind == JsonValueKind.Array
+            && actionsArray.GetArrayLength() > 0)
+        {
+            foreach (var action in actionsArray.EnumerateArray())
+            {
+                await ExecuteComputerActionAsync(action, tools, session, sessionId, onStatus, ct);
+            }
+        }
+        else if (call.TryGetProperty("action", out var singleAction))
+        {
+            await ExecuteComputerActionAsync(singleAction, tools, session, sessionId, onStatus, ct);
+        }
+
+        // Always capture screenshot after action
+        var screenshot = await CaptureScreenshotAsync(tools, sessionId, ct);
+
+        var stepName = $"{++session.ScreenshotCounter:D3}_step";
+        SaveScreenshotToDisk(screenshot!, stepName, session.ScreenshotSubfolder);
+        var folderUrl = await UploadScreenshotToOneDriveAsync(screenshot!, $"{stepName}.png", graphAccessToken, session.ScreenshotSubfolder, session, ct);
+        if (folderUrl != null && onFolderLinkReady != null)
+        {
+            await onFolderLinkReady(folderUrl);
+        }
+
+        var output = new { type = "computer_screenshot", image_url = $"data:image/png;base64,{screenshot}" };
+
+        // "computer" tool type (gpt-5.4+) doesn't support acknowledged_safety_checks
+        if (_toolType == "computer")
+        {
+            return ToJsonElement(new
+            {
+                type = "computer_call_output",
+                call_id = callId,
+                output
+            });
+        }
+
+        // Echo back whatever safety checks the model listed as pending; default to an empty array.
+        var safetyChecks = call.TryGetProperty("pending_safety_checks", out var sc)
+            ? sc
+            : JsonSerializer.Deserialize<JsonElement>("[]");
+
+        return ToJsonElement(new
+        {
+            type = "computer_call_output",
+            call_id = callId,
+            acknowledged_safety_checks = safetyChecks,
+            output
+        });
+    }
+
+    /// <summary>
+    /// Executes one computer-use action: reports status, maps the action to its MCP tool and
+    /// invokes it. A "screenshot" action invokes nothing here (the caller always captures one
+    /// afterwards). On a session-lost result the session is recovered and the call retried once.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">The tool (or its retry) reported an error result.</exception>
+    private async Task ExecuteComputerActionAsync(
+        JsonElement action, IList<AITool> tools, ConversationSession session, string? sessionId,
+        Func<string, Task>? onStatus, CancellationToken ct)
+    {
+        var actionType = action.GetProperty("type").GetString()!;
+        if (onStatus != null)
+        {
+            await onStatus($"Performing: {actionType}...");
+        }
+
+        if (actionType == "screenshot")
+        {
+            return;
+        }
+
+        var (toolName, args) = MapActionToMcpTool(actionType, action, sessionId);
+        var (result, sessionLost) = await InvokeToolCheckSessionAsync(tools, toolName, args, ct);
+        if (sessionLost)
+        {
+            if (onStatus != null)
+            {
+                await onStatus("Session lost — recovering...");
+            }
+
+            await RecoverSessionAsync(session, tools, _logger, ct);
+            // Re-invoke with the same args; ATG re-acquires the session transparently
+            // on this retry via the hostname-discovery handler.
+            await InvokeToolThrowOnErrorAsync(tools, toolName, args, ct);
+        }
+        else if (TryExtractToolError(result?.ToString(), out var errorText))
+        {
+            // Surface tool errors to the bot reply rather than silently continuing with
+            // a no-op result. The model can otherwise loop or end with "No text was streamed".
+            throw new InvalidOperationException($"Error calling tool '{toolName}': {errorText}");
+        }
+    }
+
+    ///
+    /// Map OpenAI computer_call action types to W365 V2 MCP tool names and arguments.
+    /// V2 tool names/schemas come from the in-VM MCP server (as returned by its tools/list).
+    /// No sessionId arg is passed — V2 session routing is handled transparently by ATG's
+    /// hostname-discovery handler via the x-ms-computerId header.
+    ///
+    private static (string ToolName, Dictionary<string, object?> Args) MapActionToMcpTool(string actionType, JsonElement action, string? sessionId)
+    {
+        // CUA model emits button names in lowercase ("left"/"right"); V2 click accepts PascalCase enum values.
+        static string NormalizeButton(string? button) => string.IsNullOrEmpty(button)
+            ? "Left"
+            : char.ToUpperInvariant(button[0]) + button.Substring(1).ToLowerInvariant();
+
+        return actionType.ToLowerInvariant() switch
+        {
+            "click" => ("click", new Dictionary<string, object?>
+            {
+                ["x"] = action.GetProperty("x").GetInt32(),
+                ["y"] = action.GetProperty("y").GetInt32(),
+                ["button"] = NormalizeButton(action.TryGetProperty("button", out var b) ?
b.GetString() : null),
+                ["clickCount"] = 1
+            }),
+            "double_click" => ("click", new Dictionary<string, object?>
+            {
+                ["x"] = action.GetProperty("x").GetInt32(),
+                ["y"] = action.GetProperty("y").GetInt32(),
+                ["button"] = "Left",
+                ["clickCount"] = 2
+            }),
+            "type" => ("type_text", new Dictionary<string, object?>
+            {
+                ["text"] = action.GetProperty("text").GetString()
+            }),
+            "key" or "keys" or "keypress" => ("press_keys", new Dictionary<string, object?>
+            {
+                // Lowercase the key names — W365's press_keys tool rejects uppercase variants
+                // like "CTRL"/"ESC" that the model sometimes emits.
+                ["keys"] = ExtractKeys(action).Select(k => k.ToLowerInvariant()).ToArray()
+            }),
+            "scroll" => ("scroll", new Dictionary<string, object?>
+            {
+                ["x"] = action.GetProperty("x").GetInt32(),
+                ["y"] = action.GetProperty("y").GetInt32(),
+                ["deltaX"] = action.TryGetProperty("scroll_x", out var sx) ? sx.GetInt32() : 0,
+                ["deltaY"] = action.TryGetProperty("scroll_y", out var sy) ? sy.GetInt32() : 0
+            }),
+            "move" => ("move_mouse", new Dictionary<string, object?>
+            {
+                ["x"] = action.GetProperty("x").GetInt32(),
+                ["y"] = action.GetProperty("y").GetInt32()
+            }),
+            "drag" => ("drag_mouse", BuildDragArgs(action)),
+            "wait" => ("wait_milliseconds", new Dictionary<string, object?>
+            {
+                ["ms"] = action.TryGetProperty("ms", out var ms) ? ms.GetInt32() : 500
+            }),
+            "open_url" => ("browser_navigate", new Dictionary<string, object?>
+            {
+                ["url"] = action.GetProperty("url").GetString()
+            }),
+            _ => throw new NotSupportedException($"Unsupported action: {actionType}")
+        };
+    }
+
+    /// <summary>
+    /// Builds the drag_mouse arguments from a drag action: the first point of "path" is the
+    /// start and the last point is the end; the button is always "Left".
+    /// </summary>
+    /// <exception cref="InvalidOperationException">"path" is missing, not an array, or empty.</exception>
+    private static Dictionary<string, object?> BuildDragArgs(JsonElement action)
+    {
+        // Indexing an empty path would throw an index error with no useful message; fail
+        // with a readable InvalidOperationException instead.
+        if (!action.TryGetProperty("path", out var path)
+            || path.ValueKind != JsonValueKind.Array
+            || path.GetArrayLength() == 0)
+        {
+            throw new InvalidOperationException("Drag action is missing a non-empty 'path' array.");
+        }
+
+        var start = path[0];
+        var end = path[path.GetArrayLength() - 1];
+        return new Dictionary<string, object?>
+        {
+            ["startX"] = start.GetProperty("x").GetInt32(),
+            ["startY"] = start.GetProperty("y").GetInt32(),
+            ["endX"] = end.GetProperty("x").GetInt32(),
+            ["endY"] = end.GetProperty("y").GetInt32(),
+            ["button"] = "Left"
+        };
+    }
+
+    /// <summary>
+    /// Invokes take_screenshot and extracts the base64 image from the result's string form.
+    /// Tries, in order: top-level "screenshotData" / "image" / "data" strings, then each
+    /// "content" block's "data" string or JSON carried in its "text", and finally the raw
+    /// string itself when it is long and does not look like JSON.
+    /// </summary>
+    /// <param name="tools">Tool list that must contain take_screenshot.</param>
+    /// <param name="sessionId">Not read by this method.</param>
+    /// <param name="ct">Flowed to the tool invocation.</param>
+    /// <returns>A non-empty base64 string.</returns>
+    /// <exception cref="InvalidOperationException">No image data could be extracted.</exception>
+    private async Task<string> CaptureScreenshotAsync(IList<AITool> tools, string? sessionId, CancellationToken ct)
+    {
+        // Reads a property only when the element is an object and the value is a non-empty
+        // string; TryGetProperty throws on non-object elements and GetString on non-strings.
+        static string? ReadNonEmptyString(JsonElement element, string propertyName)
+        {
+            if (element.ValueKind != JsonValueKind.Object
+                || !element.TryGetProperty(propertyName, out var value)
+                || value.ValueKind != JsonValueKind.String)
+            {
+                return null;
+            }
+
+            var text = value.GetString();
+            return string.IsNullOrEmpty(text) ? null : text;
+        }
+
+        // take_screenshot takes optional crop args; empty dictionary = full screen.
+        // No sessionId — session routing is handled by ATG's hostname-discovery handler.
+        var screenshotArgs = new Dictionary<string, object?>();
+
+        // AIFunction wrapper (may lose image content)
+        var aiResult = await InvokeToolAsync(tools, "take_screenshot", screenshotArgs, ct);
+        var str = aiResult?.ToString() ?? "";
+
+        _logger.LogInformation("Screenshot fallback: result type={Type}, length={Length}, preview={Preview}",
+            aiResult?.GetType().Name ?? "null", str.Length, str[..Math.Min(200, str.Length)]);
+
+        try
+        {
+            using var doc = JsonDocument.Parse(str);
+            var root = doc.RootElement;
+
+            var direct = ReadNonEmptyString(root, "screenshotData")
+                ?? ReadNonEmptyString(root, "image")
+                ?? ReadNonEmptyString(root, "data");
+            if (direct != null)
+            {
+                return direct;
+            }
+
+            // Try nested content array (SDK gateway format)
+            if (root.ValueKind == JsonValueKind.Object
+                && root.TryGetProperty("content", out var content)
+                && content.ValueKind == JsonValueKind.Array)
+            {
+                foreach (var block in content.EnumerateArray())
+                {
+                    var data = ReadNonEmptyString(block, "data");
+                    if (data != null)
+                    {
+                        return data;
+                    }
+
+                    var extracted = ExtractBase64FromText(ReadNonEmptyString(block, "text"));
+                    if (!string.IsNullOrEmpty(extracted))
+                    {
+                        return extracted;
+                    }
+                }
+            }
+        }
+        catch (JsonException)
+        {
+            // Not JSON — fall through to the raw-base64 check below.
+        }
+
+        // Last resort: if it looks like raw base64 (long string, no JSON), use it directly
+        if (str.Length > 1000 && !str.StartsWith('{') && !str.StartsWith('['))
+        {
+            return str;
+        }
+
+        throw new InvalidOperationException($"Failed to extract screenshot. Response length: {str.Length}");
+    }
+
+    private static string? ExtractBase64FromText(string?
text)
+    {
+        // Returns the first string-valued "screenshotData" / "image" / "data" property when
+        // text is a JSON object; null for empty input, non-JSON text, or non-object JSON.
+        if (string.IsNullOrEmpty(text)) return null;
+        try
+        {
+            using var doc = JsonDocument.Parse(text);
+            var root = doc.RootElement;
+            // TryGetProperty throws InvalidOperationException on non-object JSON (e.g. a bare
+            // number or array), so bail out before probing properties.
+            if (root.ValueKind != JsonValueKind.Object) return null;
+            if (root.TryGetProperty("screenshotData", out var sd) && sd.ValueKind == JsonValueKind.String) return sd.GetString();
+            if (root.TryGetProperty("image", out var img) && img.ValueKind == JsonValueKind.String) return img.GetString();
+            if (root.TryGetProperty("data", out var d) && d.ValueKind == JsonValueKind.String) return d.GetString();
+        }
+        catch (JsonException) { }
+        return null;
+    }
+
+    /// <summary>
+    /// Finds the function tool called <paramref name="name"/> (case-insensitive) and invokes it.
+    /// </summary>
+    /// <returns>Whatever the tool invocation returns.</returns>
+    /// <exception cref="InvalidOperationException">No function tool with that name is in <paramref name="tools"/>.</exception>
+    internal static async Task<object?> InvokeToolAsync(
+        IList<AITool> tools, string name, Dictionary<string, object?> args, CancellationToken ct)
+    {
+        var tool = tools.OfType<AIFunction>().FirstOrDefault(t => t.Name.Equals(name, StringComparison.OrdinalIgnoreCase))
+            ?? throw new InvalidOperationException($"Tool '{name}' not found.");
+        return await tool.InvokeAsync(new AIFunctionArguments(args), ct);
+    }
+
+    /// <summary>
+    /// Invokes a tool and throws <see cref="InvalidOperationException"/> if the MCP result reports
+    /// isError: true. The exception message format is "Error calling tool '{name}': {detail}"
+    /// so the CUA loop can bubble a readable reason up to the bot reply instead of silently proceeding
+    /// with a bad state.
+    /// </summary>
+    internal static async Task<object?> InvokeToolThrowOnErrorAsync(
+        IList<AITool> tools, string name, Dictionary<string, object?> args, CancellationToken ct)
+    {
+        var result = await InvokeToolAsync(tools, name, args, ct);
+        if (TryExtractToolError(result?.ToString(), out var errorText))
+        {
+            throw new InvalidOperationException($"Error calling tool '{name}': {errorText}");
+        }
+
+        return result;
+    }
+
+    /// <summary>
+    /// Parses an MCP CallToolResult-shaped JSON payload and extracts the error text when
+    /// isError is true. Returns true if an error was found, false otherwise
+    /// (including when the payload is empty, not JSON, or not a JSON object).
+    /// </summary>
+    /// <param name="response">String form of the tool result; may be null.</param>
+    /// <param name="message">First string "text" of the "content" blocks, or "(unknown error)"; empty when the method returns false.</param>
+    private static bool TryExtractToolError(string? response, out string message)
+    {
+        message = string.Empty;
+        if (string.IsNullOrEmpty(response)) return false;
+        try
+        {
+            using var doc = JsonDocument.Parse(response);
+            var root = doc.RootElement;
+            // A result that parses as a JSON array/number/string is valid JSON but not an
+            // object; TryGetProperty would throw InvalidOperationException on it.
+            if (root.ValueKind != JsonValueKind.Object
+                || !root.TryGetProperty("isError", out var isErr)
+                || isErr.ValueKind != JsonValueKind.True)
+            {
+                return false;
+            }
+
+            if (root.TryGetProperty("content", out var content) && content.ValueKind == JsonValueKind.Array)
+            {
+                foreach (var block in content.EnumerateArray())
+                {
+                    if (block.ValueKind == JsonValueKind.Object
+                        && block.TryGetProperty("text", out var text)
+                        && text.ValueKind is JsonValueKind.String or JsonValueKind.Null)
+                    {
+                        message = text.GetString() ?? "(unknown error)";
+                        return true;
+                    }
+                }
+            }
+
+            message = "(unknown error)";
+            return true;
+        }
+        catch (JsonException)
+        {
+            return false;
+        }
+    }
+
+    /// <summary>
+    /// Invoke a tool and detect session-not-found errors. Returns (result, isSessionLost).
+    /// </summary>
+    private static async Task<(object? Result, bool IsSessionLost)> InvokeToolCheckSessionAsync(
+        IList<AITool> tools, string name, Dictionary<string, object?> args, CancellationToken ct)
+    {
+        var result = await InvokeToolAsync(tools, name, args, ct);
+        return (result, IsSessionNotFoundError(result?.ToString() ?? ""));
+    }
+
+    /// <summary>
+    /// Check if a tool response indicates the session is no longer valid, by looking for
+    /// one of four known phrases (case-insensitive) anywhere in the text.
+    /// </summary>
+    private static bool IsSessionNotFoundError(string response)
+    {
+        if (string.IsNullOrEmpty(response)) return false;
+        // Ordinal, case-insensitive search avoids allocating a lower-cased copy of the response.
+        return response.Contains("no active session found", StringComparison.OrdinalIgnoreCase) ||
+               response.Contains("session not found", StringComparison.OrdinalIgnoreCase) ||
+               response.Contains("session expired", StringComparison.OrdinalIgnoreCase) ||
+               response.Contains("session has been terminated", StringComparison.OrdinalIgnoreCase);
+    }
+
+    ///
+    /// Recover from a lost session: release the stale session (best-effort) and reset the
+    /// session-state flags so the next MCP tool call triggers a fresh checkout via ATG's
+    /// hostname-discovery handler. There is no explicit "start" step in V2 — the session
+    /// is acquired transparently when the orchestrator's next computer_call goes through.
+ /// + private async Task RecoverSessionAsync( + ConversationSession session, IList tools, ILogger logger, CancellationToken ct) + { + logger.LogWarning("Session lost. Recovering — releasing stale session; ATG will re-acquire on next MCP call."); + + try + { + await EndSessionAsync(tools, logger, session.W365SessionId, ct); + } + catch (OperationCanceledException) when (ct.IsCancellationRequested) + { + // Caller-requested cancellation is never swallowed. + throw; + } + catch (Exception ex) when (ex is HttpRequestException || ex is InvalidOperationException || ex is TaskCanceledException) + { + // Reaching this filter with a TaskCanceledException means ct itself was not cancelled + // (presumably a request timeout); releasing the stale session stays best-effort. + logger.LogWarning(ex, "Best-effort EndSession during recovery failed"); + } + + // Clear all per-session state so the next tool call starts from a clean slate. + session.W365SessionId = null; + session.SessionStarted = false; + session.ScreenshotSubfolder = null; + logger.LogInformation("Session state cleared; awaiting transparent re-acquisition."); + } + + // Reads the key list from a keypress action: "keys" as an array or a single string, else "key". + private static string[] ExtractKeys(JsonElement action) + { + if (action.TryGetProperty("keys", out var k)) + { + if (k.ValueKind == JsonValueKind.Array) + return k.EnumerateArray().Select(e => e.GetString() ?? "").ToArray(); + if (k.ValueKind == JsonValueKind.String) + return [k.GetString() ?? ""]; + } + if (action.TryGetProperty("key", out var single) && single.ValueKind == JsonValueKind.String) + return [single.GetString() ?? ""]; + return []; + } + + // Returns the first string "text" found in the message's "content" array, or "" when there is none. + private static string ExtractText(JsonElement msg) + { + if (msg.TryGetProperty("content", out var c) && c.ValueKind == JsonValueKind.Array) + foreach (var item in c.EnumerateArray()) + // JsonElement accessors throw on the wrong kind, so skip non-object items and non-string text. + if (item.ValueKind == JsonValueKind.Object && item.TryGetProperty("text", out var t) && t.ValueKind == JsonValueKind.String) + return t.GetString() ??
""; + return ""; + } + + private static JsonElement CreateUserMessage(string text) => ToJsonElement(new + { + type = "message", role = "user", + content = new[] { new { type = "input_text", text } } + }); + + private static JsonElement CreateFunctionOutput(string callId, string output = "success") => ToJsonElement(new + { + type = "function_call_output", call_id = callId, output + }); + + /// + /// Invoke an MCP function tool from a model function_call and return the function_call_output. + /// Tool and argument failures are reported back to the model as an "Error: ..." output instead of + /// being thrown; cancellation is propagated. + /// + private async Task InvokeFunctionCallAsync(JsonElement functionCall, IList tools, CancellationToken ct) + { + var callId = functionCall.GetProperty("call_id").GetString()!; + var name = functionCall.GetProperty("name").GetString()!; + var argsStr = functionCall.GetProperty("arguments").GetString(); + // Treat null or blank arguments as an empty argument object instead of failing JSON parsing. + if (string.IsNullOrWhiteSpace(argsStr)) argsStr = "{}"; + + _logger.LogInformation("Function call {Name} invoked. call_id={CallId}, args={Args}", + name, callId, Truncate(argsStr, 1000)); + + try + { + var args = JsonSerializer.Deserialize>(argsStr) ?? []; + var result = await InvokeToolAsync(tools, name, args, ct); + var resultStr = result?.ToString() ?? "success"; + _logger.LogInformation("Function call {Name} returned ({Length} chars): {Result}", + name, resultStr.Length, Truncate(resultStr, 2000)); + return CreateFunctionOutput(callId, resultStr); + } + catch (OperationCanceledException) when (ct.IsCancellationRequested) + { + throw; + } + catch (Exception ex) when (ex is JsonException || ex is InvalidOperationException || ex is ArgumentException) + { + // Hand the failure to the model as the call's output so it can react, rather than aborting. + _logger.LogError(ex, "Function call {Name} threw. call_id={CallId}", name, callId); + return CreateFunctionOutput(callId, $"Error: {ex.Message}"); + } + } + + // Round-trips an anonymous object through JSON to obtain a detached JsonElement. + private static JsonElement ToJsonElement(object obj) => + JsonSerializer.Deserialize(JsonSerializer.Serialize(obj)); + + // Caps a string at max characters for logging, appending "..." when it was cut. + private static string Truncate(string v, int max) => v.Length <= max ? v : v[..max] + "..."; + + /// + /// Convert a user-supplied string into a filesystem-safe slug for a folder name.
+ /// Letters and digits are kept; everything else collapses into single underscores. + /// Trailing underscores are trimmed, the result is lower-cased, and trimmed to + /// maxLen characters. Empty/whitespace inputs yield "untitled". + /// + private static string SanitizeForPath(string? input, int maxLen) + { + if (string.IsNullOrWhiteSpace(input)) return "untitled"; + var sb = new System.Text.StringBuilder(maxLen); + foreach (var c in input) + { + if (sb.Length >= maxLen) break; + if (char.IsLetterOrDigit(c)) sb.Append(char.ToLowerInvariant(c)); + else if (sb.Length > 0 && sb[sb.Length - 1] != '_') sb.Append('_'); + } + while (sb.Length > 0 && sb[sb.Length - 1] == '_') sb.Length--; + return sb.Length == 0 ? "untitled" : sb.ToString(); + } + + // Best-effort local copy of a screenshot: any failure is logged and swallowed, never thrown to the caller. + private void SaveScreenshotToDisk(string base64Data, string name, string? subfolder = null) + { + if (string.IsNullOrEmpty(base64Data) || string.IsNullOrEmpty(_screenshotPath)) return; + try + { + // Match the OneDrive folder layout — per-session subfolder under ./Screenshots so + // counters from concurrent or sequential conversations don't clobber each other. + var dir = string.IsNullOrEmpty(subfolder) + ? _screenshotPath + : Path.Combine(_screenshotPath, subfolder); + Directory.CreateDirectory(dir); + var path = Path.Combine(dir, $"{name}.png"); + File.WriteAllBytes(path, Convert.FromBase64String(base64Data)); + _logger.LogInformation("Screenshot saved: {Path}", path); + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Failed to save screenshot"); + } + } + + /// + /// Upload a screenshot to the user's OneDrive via Microsoft Graph. + /// Requires a Graph access token with Files.ReadWrite scope. + /// Files are uploaded to {OneDriveFolder}/{subfolder}/{fileName} (the subfolder segment is omitted when empty). + /// Returns the folder's sharing URL the first time sharing succeeds for the session; otherwise null. + /// Upload failures are logged and swallowed (best-effort); cancellation is propagated. + /// + private async Task UploadScreenshotToOneDriveAsync(string base64Data, string fileName, string? graphAccessToken, string?
subfolder, ConversationSession session, CancellationToken cancellationToken) + { + if (string.IsNullOrEmpty(graphAccessToken)) + { + _logger.LogDebug("OneDrive upload skipped: no Graph token"); + return null; + } + if (string.IsNullOrEmpty(base64Data)) + { + _logger.LogDebug("OneDrive upload skipped: no screenshot data"); + return null; + } + if (string.IsNullOrEmpty(_oneDriveFolder)) + { + _logger.LogDebug("OneDrive upload skipped: OneDriveFolder not configured"); + return null; + } + + try + { + // Use /me/drive for token owner, or /users/{id}/drive for a specific user. + // Encode the user identifier — UPNs contain '@' and may contain '#' which both need escaping. + var driveBase = string.IsNullOrEmpty(_oneDriveUserId) + ? "https://graph.microsoft.com/v1.0/me/drive" + : $"https://graph.microsoft.com/v1.0/users/{Uri.EscapeDataString(_oneDriveUserId)}/drive"; + var folderPath = string.IsNullOrEmpty(subfolder) + ? _oneDriveFolder.TrimStart('/') + : $"{_oneDriveFolder.TrimStart('/')}/{subfolder}"; + var url = $"{driveBase}/root:/{folderPath}/{fileName}:/content"; + + using var request = new HttpRequestMessage(HttpMethod.Put, url); + request.Headers.Authorization = new AuthenticationHeaderValue("Bearer", graphAccessToken); + request.Content = new ByteArrayContent(Convert.FromBase64String(base64Data)); + request.Content.Headers.ContentType = new System.Net.Http.Headers.MediaTypeHeaderValue("image/png"); + + // Dispose the response so its content and connection are released after every upload. + using var response = await _httpClient.SendAsync(request, cancellationToken); + if (response.IsSuccessStatusCode) + { + _logger.LogInformation("Screenshot uploaded to OneDrive: {Folder}/{FileName}", folderPath, fileName); + + // On first upload, create an org-scoped sharing link for the folder + if (!session.FolderShared) + { + var shareUrl = await ShareConversationFolderAsync(folderPath, graphAccessToken, cancellationToken); + if (shareUrl != null) + { + session.FolderShared = true; + return shareUrl; + } + } + } + else + { + var content = await
response.Content.ReadAsStringAsync(cancellationToken); + _logger.LogWarning("OneDrive upload failed: {Status} {Content}", response.StatusCode, content); + } + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + // Cancellation is not an upload failure; let it reach the caller. + throw; + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Failed to upload screenshot to OneDrive"); + } + + return null; + } + + /// + /// Create an organization-scoped sharing link for the conversation's screenshot folder. + /// Returns the web URL that anyone in the org can use to view the folder. + /// Falls back to the folder's own webUrl when the link cannot be created, and to null when the + /// folder cannot be resolved or the request fails. Cancellation is propagated. + /// + private async Task ShareConversationFolderAsync(string folderPath, string graphAccessToken, CancellationToken cancellationToken) + { + try + { + var driveBase = string.IsNullOrEmpty(_oneDriveUserId) + ? "https://graph.microsoft.com/v1.0/me/drive" + : $"https://graph.microsoft.com/v1.0/users/{Uri.EscapeDataString(_oneDriveUserId)}/drive"; + + // Step 1: resolve the folder's drive item to get its id (and webUrl as a fallback link). + using var getRequest = new HttpRequestMessage(HttpMethod.Get, $"{driveBase}/root:/{folderPath}"); + getRequest.Headers.Authorization = new AuthenticationHeaderValue("Bearer", graphAccessToken); + using var getResponse = await _httpClient.SendAsync(getRequest, cancellationToken); + + if (!getResponse.IsSuccessStatusCode) + { + _logger.LogWarning("Failed to get folder item for sharing: {Status}", getResponse.StatusCode); + return null; + } + + var folderJson = await getResponse.Content.ReadAsStringAsync(cancellationToken); + using var doc = JsonDocument.Parse(folderJson); + var folderId = doc.RootElement.GetProperty("id").GetString(); + var webUrl = doc.RootElement.TryGetProperty("webUrl", out var wu) ?
wu.GetString() : null; + + // Step 2: request a view-only, organization-scoped link for that item. + using var linkRequest = new HttpRequestMessage(HttpMethod.Post, $"{driveBase}/items/{folderId}/createLink"); + linkRequest.Headers.Authorization = new AuthenticationHeaderValue("Bearer", graphAccessToken); + linkRequest.Content = new StringContent( + JsonSerializer.Serialize(new { type = "view", scope = "organization" }), + System.Text.Encoding.UTF8, "application/json"); + + using var linkResponse = await _httpClient.SendAsync(linkRequest, cancellationToken); + if (linkResponse.IsSuccessStatusCode) + { + var linkJson = await linkResponse.Content.ReadAsStringAsync(cancellationToken); + using var linkDoc = JsonDocument.Parse(linkJson); + var shareUrl = linkDoc.RootElement.GetProperty("link").GetProperty("webUrl").GetString(); + _logger.LogInformation("Folder shared with org: {Url}", shareUrl); + return shareUrl; + } + else + { + var errorContent = await linkResponse.Content.ReadAsStringAsync(cancellationToken); + _logger.LogWarning("Failed to create sharing link: {Status} {Content}", linkResponse.StatusCode, errorContent); + return webUrl; + } + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + // Cancellation is not a sharing failure; let it reach the caller. + throw; + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Failed to share conversation folder"); + return null; + } + } + + /// + /// Per-conversation session state. Holds the W365 session ID, conversation history, + /// and screenshot counter for a single user conversation. + /// + private sealed class ConversationSession + { + public bool SessionStarted { get; set; } + public string? W365SessionId { get; set; } + public List ConversationHistory { get; } = []; + public int ScreenshotCounter { get; set; } + public string?
ScreenshotSubfolder { get; set; } + public bool FolderShared { get; set; } + } +} diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/ICuaModelProvider.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/ICuaModelProvider.cs new file mode 100644 index 00000000..b26db535 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/ICuaModelProvider.cs @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +namespace W365ComputerUseSample.ComputerUse; + +/// +/// Abstraction for sending requests to a CUA-capable model (OpenAI Responses API). +/// Implementations handle authentication and endpoint differences. +/// +public interface ICuaModelProvider +{ + /// The model name to include in the request body. + string ModelName { get; } + + /// Send a serialized request body and return the raw JSON response. + Task SendAsync(string requestBody, CancellationToken cancellationToken); +} diff --git a/dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs b/dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs new file mode 100644 index 00000000..1f774b33 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/ComputerUse/Models/ComputerUseModels.cs @@ -0,0 +1,96 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Text.Json; +using System.Text.Json.Serialization; + +namespace W365ComputerUseSample.ComputerUse.Models; + +/// +/// Response from the OpenAI Computer Use API. +/// +public class ComputerUseResponse +{ + [JsonPropertyName("id")] + public string? Id { get; set; } + + [JsonPropertyName("object")] + public string? Object { get; set; } + + [JsonPropertyName("created_at")] + public long CreatedAt { get; set; } + + [JsonPropertyName("model")] + public string? Model { get; set; } + + [JsonPropertyName("output")] + public List? Output { get; set; } +} + +/// +/// Request to the OpenAI Computer Use API. 
+/// +public class ComputerUseRequest +{ + [JsonPropertyName("model")] + public string Model { get; set; } = "computer-use-preview-2025-03-11"; + + [JsonPropertyName("truncation")] + public string Truncation { get; set; } = "auto"; + + [JsonPropertyName("instructions")] + public string? Instructions { get; set; } + + [JsonPropertyName("input")] + public List Input { get; set; } = []; + + [JsonPropertyName("tools")] + public List Tools { get; set; } = []; +} + +/// +/// Defines the computer_use_preview tool for the OpenAI Responses API. +/// Used by computer-use-preview models. +/// +public class ComputerUseTool +{ + [JsonPropertyName("type")] + public string Type { get; set; } = "computer_use_preview"; + + [JsonPropertyName("display_width")] + public int DisplayWidth { get; set; } = 1024; + + [JsonPropertyName("display_height")] + public int DisplayHeight { get; set; } = 768; + + [JsonPropertyName("environment")] + public string Environment { get; set; } = "windows"; +} + +/// +/// Defines the "computer" tool for GPT-5.4+ models. +/// Bare type with no parameters — the model infers screen dimensions from screenshots. +/// +public class ComputerToolV2 +{ + [JsonPropertyName("type")] + public string Type { get; set; } = "computer"; +} + +/// +/// Defines a function tool for the OpenAI Responses API. +/// +public class FunctionToolDefinition +{ + [JsonPropertyName("type")] + public string Type { get; set; } = "function"; + + [JsonPropertyName("name")] + public string Name { get; set; } = string.Empty; + + [JsonPropertyName("description")] + public string Description { get; set; } = string.Empty; + + [JsonPropertyName("parameters")] + public object? Parameters { get; set; } +} diff --git a/dotnet/w365-computer-use/sample-agent/Program.cs b/dotnet/w365-computer-use/sample-agent/Program.cs new file mode 100644 index 00000000..df4afb7f --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/Program.cs @@ -0,0 +1,106 @@ +// Copyright (c) Microsoft Corporation. 
+// Licensed under the MIT License. + +using W365ComputerUseSample; +using W365ComputerUseSample.Agent; +using W365ComputerUseSample.ComputerUse; +using W365ComputerUseSample.Telemetry; +using Microsoft.Agents.A365.Observability; +using Microsoft.Agents.A365.Observability.Extensions.AgentFramework; +using Microsoft.Agents.A365.Observability.Runtime; +using Microsoft.Agents.A365.Tooling.Extensions.AgentFramework.Services; +using Microsoft.Agents.A365.Tooling.Services; +using Microsoft.Agents.Builder; +using Microsoft.Agents.Core; +using Microsoft.Agents.Hosting.AspNetCore; +using Microsoft.Agents.Storage; +using System.Reflection; + +var builder = WebApplication.CreateBuilder(args); + +// Setup ASP service defaults, including OpenTelemetry, Service Discovery, Resilience, and Health Checks +builder.ConfigureOpenTelemetry(); + +builder.Configuration.AddUserSecrets(Assembly.GetExecutingAssembly()); +builder.Services.AddControllers(); +builder.Services.AddHttpClient("WebClient", client => client.Timeout = TimeSpan.FromSeconds(600)); +builder.Services.AddHttpContextAccessor(); +builder.Logging.AddConsole(); + +// ********** Configure A365 Services ********** +// Configure observability. +builder.Services.AddAgenticTracingExporter(clusterCategory: "production"); + +// Add A365 tracing with Agent Framework integration +builder.AddA365Tracing(config => +{ + config.WithAgentFramework(); +}); + +// Add A365 Tooling Server integration +builder.Services.AddSingleton(); +builder.Services.AddSingleton(); +// ********** END Configure A365 Services ********** + +// Register the Azure OpenAI CUA model provider +builder.Services.AddSingleton(); + +// Register the Computer Use orchestrator +builder.Services.AddSingleton(); + +// Add AspNet token validation +builder.Services.AddAgentAspNetAuthentication(builder.Configuration); + +// Register IStorage. For development, MemoryStorage is suitable. +builder.Services.AddSingleton(); + +// Add AgentApplicationOptions from config. 
+builder.AddAgentApplicationOptions(); + +// Add the bot (which is transient) +builder.AddAgent(); + +var app = builder.Build(); + +if (app.Environment.IsDevelopment()) +{ + app.UseDeveloperExceptionPage(); +} + +app.UseRouting(); +app.UseAuthentication(); +app.UseAuthorization(); + +// Map the /api/messages endpoint to the AgentApplication +app.MapPost("/api/messages", async (HttpRequest request, HttpResponse response, IAgentHttpAdapter adapter, IAgent agent, CancellationToken cancellationToken) => +{ + // Allow multiple reads of the request body — tracing/observability middleware may + // re-read it after the adapter, which otherwise triggers + // "Reading is not allowed after reader was completed" on the Kestrel pipe reader. + request.EnableBuffering(); + + await AgentMetrics.InvokeObservedHttpOperation("agent.process_message", async () => + { + await adapter.ProcessAsync(request, response, agent, cancellationToken); + }).ConfigureAwait(false); +}); + +// Health check endpoint for CI/CD pipelines and monitoring +app.MapGet("/api/health", () => Results.Ok(new { status = "healthy", timestamp = DateTime.UtcNow })); + +if (app.Environment.IsDevelopment() || app.Environment.EnvironmentName == "Playground") +{ + app.MapGet("/", () => "W365 Computer Use Sample Agent"); + app.UseDeveloperExceptionPage(); + app.MapControllers().AllowAnonymous(); + + // Hard coded for brevity and ease of testing. + // In production, this should be set in configuration. 
+ app.Urls.Add("http://localhost:3978"); +} +else +{ + app.MapControllers(); +} + +app.Run(); diff --git a/dotnet/w365-computer-use/sample-agent/Properties/launchSettings.json b/dotnet/w365-computer-use/sample-agent/Properties/launchSettings.json new file mode 100644 index 00000000..427cd153 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/Properties/launchSettings.json @@ -0,0 +1,11 @@ +{ + "profiles": { + "W365ComputerUseSample": { + "commandName": "Project", + "environmentVariables": { + "ASPNETCORE_ENVIRONMENT": "Development" + }, + "applicationUrl": "http://localhost:3978" + } + } +} diff --git a/dotnet/w365-computer-use/sample-agent/README.md b/dotnet/w365-computer-use/sample-agent/README.md new file mode 100644 index 00000000..c04adfab --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/README.md @@ -0,0 +1,306 @@ +# W365 Computer Use Sample + +## Overview + +This sample demonstrates how to build an agent that controls a Windows 365 Cloud PC using the OpenAI Responses API and the W365 Computer Use MCP server. + +The agent receives a natural language task from the user, provisions a W365 desktop session via MCP tools, then runs a CUA (Computer Use Agent) loop: the model sees screenshots, decides actions (click, type, scroll), and the MCP server executes them on the VM. + +It supports two model types: +- **`computer-use-preview`** - The original CUA model on Azure OpenAI +- **`gpt-5.4` / `gpt-5.4-mini`** - Newer GPT models with built-in computer use capability + +## Architecture + +``` +User Message + | +MyAgent (Agent Framework) + | connects to MCP server +W365 MCP Tools (QuickStartSession, CaptureScreenshot, Click, Type, etc.) 
+ | provisions and controls +Windows 365 Cloud PC + | screenshots fed back to +CUA Model (Azure OpenAI) + | emits computer_call actions +ComputerUseOrchestrator (translates actions to MCP tool calls) + | loop until task complete +Response to User +``` + +**Key components:** + +| File | Purpose | +|------|---------| +| `Agent/MyAgent.cs` | Message handler - acquires tokens, connects to MCP, runs orchestrator | +| `ComputerUse/ComputerUseOrchestrator.cs` | CUA loop - sends screenshots to model, maps actions to MCP tools | +| `ComputerUse/ICuaModelProvider.cs` | Abstraction for the CUA model API | +| `ComputerUse/AzureOpenAIModelProvider.cs` | Azure OpenAI Responses API provider | + +## Prerequisites + +- [.NET 8.0 SDK](https://dotnet.microsoft.com/download/dotnet/8.0) or later +- Azure OpenAI resource with a CUA-capable model deployment: + - `computer-use-preview` or `gpt-5.4` / `gpt-5.4-mini` + - [Request access to gpt-5.4](https://aka.ms/OAI/gpt54access) if needed +- Access to the W365 Computer Use MCP server (via [Agent 365 MCP Platform](https://learn.microsoft.com/en-us/microsoft-agent-365/developer/)) +- An Azure tenant where you can run `a365 setup` to provision the agent identity and grant the `McpServers.W365ComputerUse.All` admin consent +- An end user with a Windows 365 Cloud PC license / pool entitlement in that tenant +- The [Agent 365 dev-tools CLI](https://learn.microsoft.com/en-us/microsoft-agent-365/developer/) installed: `dotnet tool install -g Microsoft.Agents.A365.DevTools.Cli` + +## Setup + +### 1. Clone the repository + +```bash +git clone https://github.com/microsoft/Agent365-Samples.git +cd Agent365-Samples/dotnet/w365-computer-use/sample-agent +``` + +### 2. Restore dependencies + +```bash +dotnet restore +``` + +### 3. Provision the agent identity + +The sample uses agentic auth and connects to the production Agent Tooling Gateway. Provision the agent blueprint, app registration, and consent grants via the Agent 365 dev-tools CLI. 
+ +First, copy the committed template into a working config: + +```bash +cp a365.config.example.json a365.config.json +``` + +Then open `a365.config.json` and fill in the placeholders for your tenant — `tenantId`, `subscriptionId`, `resourceGroup`, `webAppName`, `appServicePlanName`, `agentUserPrincipalName`, `managerEmail`. Leave `customBlueprintPermissions` as-is unless you intend to extend the agent with additional MCP servers; the W365 sample only needs `McpServers.W365ComputerUse.All` (and `McpServersMetadata.Read.All` for tool discovery). + +Then run setup. Use `--m365` so the CLI registers the agent identity in M365 (the bot channels registration is done separately — see [Production Deployment](#production-deployment)): + +```bash +a365 setup all --m365 +``` + +This writes `a365.generated.config.json` next to your config file — both `a365.config.json` and `a365.generated.config.json` are gitignored and contain tenant-specific values you must not commit. The generated file contains the new agent blueprint id, the agent blueprint client secret, and consent URLs you must visit to grant tenant admin consent for each resource. + +### 4. Configure local secrets + +For local runs, write a personal `appsettings.Development.json` next to `appsettings.json` (gitignored) that overrides the connection block from `UserManagedIdentity` to `ClientSecret` and supplies the Azure OpenAI credentials. 
The blueprint id and client secret come from `a365.generated.config.json` (produced by `a365 setup all --m365`): + +```jsonc +{ + "AIServices": { + "AzureOpenAI": { + "Endpoint": "https://your-resource.openai.azure.com", + "ApiKey": "<your-api-key>", + "DeploymentName": "computer-use-preview" // or set ModelName for gpt-5.4-mini + } + }, + "Connections": { + "ServiceConnection": { + "Settings": { + "AuthType": "ClientSecret", + "AuthorityEndpoint": "https://login.microsoftonline.com/<tenant-id>", + "ClientId": "<agent-blueprint-id>", + "ClientSecret": "<agent-blueprint-client-secret>" + } + } + } +} +``` + +Alternatively use `dotnet user-secrets` if you prefer to keep secrets out of the working tree: + +```bash +dotnet user-secrets init +dotnet user-secrets set "AIServices:AzureOpenAI:Endpoint" "https://your-resource.openai.azure.com" +dotnet user-secrets set "AIServices:AzureOpenAI:ApiKey" "<your-api-key>" +dotnet user-secrets set "AIServices:AzureOpenAI:DeploymentName" "computer-use-preview" +dotnet user-secrets set "Connections:ServiceConnection:Settings:AuthType" "ClientSecret" +dotnet user-secrets set "Connections:ServiceConnection:Settings:AuthorityEndpoint" "https://login.microsoftonline.com/<tenant-id>" +dotnet user-secrets set "Connections:ServiceConnection:Settings:ClientId" "<agent-blueprint-id>" +dotnet user-secrets set "Connections:ServiceConnection:Settings:ClientSecret" "<agent-blueprint-client-secret>" +``` + +### 5. Run the agent + +Run from the `sample-agent` directory you entered in step 1: + +```bash +dotnet run +``` + +The agent listens on `http://localhost:3978/api/messages`. + +### 6. Test with Agent Playground + +1. Open [Microsoft 365 Agents Playground](https://dev.agents.cloud.microsoft/). +2. Connect to `http://localhost:3978/api/messages`. +3. Send a message like *"Open Notepad and type Hello World"*. +4. Screenshots are saved to `./Screenshots/` locally and uploaded to the OneDrive folder configured in `appsettings.json` (with a per-prompt subfolder link surfaced in chat).
+ +## Configuration Reference + +| Setting | Description | Default | +|---------|-------------|---------| +| `AIServices:AzureOpenAI:Endpoint` | Azure OpenAI resource URL | - | +| `AIServices:AzureOpenAI:ApiKey` | API key | - | +| `AIServices:AzureOpenAI:DeploymentName` | Deployment name (for deployment-based URLs) | `computer-use-preview` | +| `AIServices:AzureOpenAI:ModelName` | Model name (for model-based URLs, e.g., `gpt-5.4-mini`) | - | +| `ComputerUse:MaxIterations` | Max CUA loop iterations | `30` | +| `ComputerUse:DisplayWidth` | Display width for computer_use_preview tool | `1024` | +| `ComputerUse:DisplayHeight` | Display height for computer_use_preview tool | `768` | +| `Screenshots:LocalPath` | Local path to save screenshots | `./Screenshots` | +| `Screenshots:OneDriveFolder` | OneDrive folder for screenshot upload | `CUA-Sessions` | +| `Screenshots:OneDriveUserId` | UPN/email to upload screenshots to a specific user's OneDrive (instead of token owner) | - | + +## Supported Models + +| Model | Tool Type | Config | Notes | +|-------|-----------|--------|-------| +| `computer-use-preview` | `computer_use_preview` | `DeploymentName: "computer-use-preview"` | Uses `display_width`, `display_height`, `environment` params | +| `gpt-5.4` / `gpt-5.4-mini` | `computer` | `ModelName: "gpt-5.4-mini"` | Bare `{"type": "computer"}`. Initial screenshot sent with first message | + +The tool type is auto-derived from the model name (`gpt-*` -> `computer`, otherwise -> `computer_use_preview`). + +## How It Works + +1. **User sends a message** -> `MyAgent.OnMessageAsync` +2. **MCP connection** established via the Agent 365 SDK's tooling gateway based on the agent blueprint's permissions +3. **Session acquisition** runs transparently on the first W365 tool call — ATG picks an eligible Cloud PC pool, checks out a session, and probes readiness. The session is reused across messages. +4. 
**CUA loop** in `ComputerUseOrchestrator.RunAsync`: + - User message + conversation history sent to the model + - Model returns `computer_call` actions (click, type, scroll, etc.) + - Actions translated to MCP tool calls (`click`, `type_text`, `press_keys`, etc. — discovered dynamically from the W365 remote server) + - Screenshot captured after each action and fed back to the model + - Loop continues until model calls `OnTaskComplete` or max iterations reached +5. **Response** sent back to user +6. **Session persists** across messages for follow-up tasks +7. **EndSession** called on app shutdown (Ctrl+C) via `mcp_W365ComputerUse_EndSession` to release the VM + +## Session Management + +- Sessions are started **once** on the first message and reused across all subsequent messages +- Conversation history accumulates across messages, giving the model context for follow-up tasks +- On app shutdown (`Ctrl+C`), the agent calls `EndSession` to release the VM back to the pool +- If the app crashes, sessions auto-expire after ~30 minutes on the W365 backend + +## Production Deployment + +The dev-tools CLI provisions the agent identity, but the rest of the production pipeline (Azure infrastructure, App Service settings, the Bot Channels Registration, blueprint linking, and Teams app upload) is currently a manual flow. Follow the steps below for an end-to-end deployment. + +### 1. Provision the agent identity + +If you haven't already done it during local setup, run: + +```bash +a365 setup all --m365 +``` + +This creates the agent blueprint, the agent app registration, and the agent user, and writes consent URLs into `a365.generated.config.json`. Open each `consentUrl` in the generated file as a tenant administrator to grant admin consent for every required resource (Microsoft Graph, Agent 365 Tools, Messaging Bot API, Observability API, Power Platform API). + +### 2. 
Verify license readiness + +Each end user invoking the agent must hold an `AGENT_365_TOOLS` or `AGENT_365` license, plus a Windows 365 Cloud PC license / pool entitlement (the Cloud PC license is what `mcp_W365ComputerUse` uses to provision a session for the user). + +### 3. Create the Azure infrastructure + +Create the resource group, App Service plan, and web app: + +```bash +az group create --name --location + +az appservice plan create \ + --name \ + --resource-group \ + --sku B1 \ + --is-linux false + +az webapp create \ + --name \ + --resource-group \ + --plan \ + --runtime "DOTNETCORE:8.0" +``` + +### 4. Configure App Service application settings + +Push the runtime configuration into App Service. The committed `appsettings.json` defaults the connection to `UserManagedIdentity` — override it to `ClientSecret` here so the agent blueprint can authenticate with the secret produced by `a365 setup`. Use double-underscore separators for nested keys: + +```bash +az webapp config appsettings set \ + --name \ + --resource-group \ + --settings \ + AIServices__AzureOpenAI__Endpoint=https://.openai.azure.com \ + AIServices__AzureOpenAI__ApiKey= \ + AIServices__AzureOpenAI__DeploymentName=computer-use-preview \ + Connections__ServiceConnection__Settings__AuthType=ClientSecret \ + Connections__ServiceConnection__Settings__AuthorityEndpoint=https://login.microsoftonline.com/ \ + Connections__ServiceConnection__Settings__ClientId= \ + Connections__ServiceConnection__Settings__ClientSecret= \ + ASPNETCORE_ENVIRONMENT=Production +``` + +For higher-trust deployments, assign a User-Assigned Managed Identity to the App Service and leave the connection on `UserManagedIdentity` (the default), or store the client secret in Azure Key Vault and reference it from App Service via Key Vault references. + +### 5. 
Build and deploy the code + +```bash +dotnet publish W365ComputerUseSample.csproj -c Release -o ./publish +Compress-Archive -Path ./publish/* -DestinationPath ./app.zip -Force +az webapp deploy \ + --resource-group \ + --name \ + --src-path ./app.zip \ + --type zip +``` + +Confirm the messaging endpoint is reachable: `https://.azurewebsites.net/api/health` should return `{"status":"healthy", ...}`. + +### 6. Create the Bot Channels Registration in Azure AI Foundry + +In Azure AI Foundry (or the Azure portal), create a Bot Channels Registration (Azure Bot resource) that points at your messaging endpoint: + +- **Messaging endpoint**: `https://.azurewebsites.net/api/messages` +- **Microsoft App type**: Single Tenant +- **Microsoft App ID**: the agent's app-registration client id (the same one you set as `Connections__ServiceConnection__Settings__ClientId`) +- Enable the **Microsoft Teams** channel + +Take note of the bot's resource id — you will need it in step 7. + +### 7. Link the bot to the agent blueprint + +Link the Azure Bot resource to the agent blueprint so the platform recognises the bot as the messaging endpoint for this agent identity. This is currently a portal/CLI step (the dev-tools CLI does not yet automate it). Confirm the linkage by checking that the agent blueprint shows a `botId` and `botMsaAppId` in its agent identity record. + +### 8. Generate and upload the Teams app package + +The dev-tools CLI generates the Teams app manifest from internal templates — no manifest is committed to the repo: + +```bash +a365 publish +``` + +This writes the manifest, icons, and `manifest.zip` into a local `manifest/` folder (gitignored). Upload `manifest/manifest.zip` through the [Microsoft 365 Admin Center](https://admin.microsoft.com/) under **Settings → Integrated apps → Upload custom apps**. + +### 9. 
Send a test message + +Open Microsoft Teams (or the Microsoft 365 chat client) as a user with the required licenses, install the uploaded custom app, and send the agent a message such as *"Open Notepad and type Hello World"*. + +In Production, the SDK discovers MCP servers through the platform's Agent Tooling Gateway based on the agent blueprint's permissions — `ToolingManifest.json` is reference material only. + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| Model returns 400 | Check that the tool type matches your model (see Supported Models table) | +| `Failed to acquire a W365 session: no Cloud PC pools are available` | The acting user has no Windows 365 Cloud PC entitlement. Assign a Cloud PC license / pool membership in the user's tenant. | +| `invalid_function_parameters` from Azure OpenAI | The tool list returned by the MCP server included the gateway's synthetic `Error` sentinel tool, which Azure OpenAI rejects. The orchestrator filters it from the LLM tool list — confirm the filter is in place in `ComputerUseOrchestrator.cs`. | +| Screenshot extraction fails | Ensure the MCP server returns image content blocks | +| Session orphaned after crash | Sessions auto-expire after ~30 min on the W365 backend | +| Multiple sessions started | Ensure only one agent instance is running per conversation | + +## Links + +- [Microsoft Agent 365 Developer Documentation](https://learn.microsoft.com/en-us/microsoft-agent-365/developer/) +- [Microsoft 365 Agents SDK](https://learn.microsoft.com/microsoft-365/agents-sdk/) +- [Azure OpenAI Computer Use Guide](https://learn.microsoft.com/en-us/azure/foundry-classic/openai/how-to/computer-use) \ No newline at end of file diff --git a/dotnet/w365-computer-use/sample-agent/ServiceExtensions.cs b/dotnet/w365-computer-use/sample-agent/ServiceExtensions.cs new file mode 100644 index 00000000..4e72efc3 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/ServiceExtensions.cs @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+
+using Microsoft.Agents.A365.Observability;
+using Microsoft.Agents.A365.Observability.Extensions.AgentFramework;
+using Microsoft.Agents.A365.Observability.Runtime;
+using OpenTelemetry.Metrics;
+using OpenTelemetry.Resources;
+using OpenTelemetry.Trace;
+using W365ComputerUseSample.Telemetry;
+
+namespace W365ComputerUseSample;
+
+/// <summary>
+/// Host-builder extensions that register telemetry for the W365 Computer Use sample.
+/// </summary>
+public static class ServiceExtensions
+{
+    /// <summary>
+    /// Registers OpenTelemetry tracing and metrics on the application's service collection.
+    /// </summary>
+    /// <param name="builder">The web application builder whose services receive the registration.</param>
+    public static void ConfigureOpenTelemetry(this WebApplicationBuilder builder)
+    {
+        var telemetry = builder.Services.AddOpenTelemetry();
+
+        telemetry.ConfigureResource(resource => resource.AddService("W365ComputerUseSample"));
+
+        // Traces: the agent's own source plus ASP.NET Core and HttpClient instrumentation.
+        // There is deliberately no console exporter: it dumps a full Activity block per
+        // HTTP request and swamped the console during bring-up. Re-add one locally if
+        // you need trace output.
+        telemetry.WithTracing(tracing => tracing
+            .AddSource(AgentMetrics.SourceName)
+            .AddAspNetCoreInstrumentation()
+            .AddHttpClientInstrumentation());
+
+        // Metrics: the agent's own meter plus ASP.NET Core, HttpClient and runtime instrumentation.
+        telemetry.WithMetrics(metrics => metrics
+            .AddMeter(AgentMetrics.SourceName)
+            .AddAspNetCoreInstrumentation()
+            .AddHttpClientInstrumentation()
+            .AddRuntimeInstrumentation());
+    }
+}
diff --git a/dotnet/w365-computer-use/sample-agent/ToolingManifest.json b/dotnet/w365-computer-use/sample-agent/ToolingManifest.json
new file mode 100644
index 00000000..39548b07
--- /dev/null
+++ b/dotnet/w365-computer-use/sample-agent/ToolingManifest.json
@@ -0,0 +1,11 @@
+{
+  "mcpServers": [
+    {
+      "mcpServerName": "mcp_W365ComputerUse",
+      "mcpServerUniqueName": "mcp_W365ComputerUse",
+      "url": "https://agent365.svc.cloud.microsoft/agents/servers/mcp_W365ComputerUse",
+      "scope": "McpServers.W365ComputerUse.All",
+      "audience": "ea9ffc3e-8a23-4a7d-836d-234d7c7565c1"
+    }
+  ]
+}
diff --git a/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj b/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj
new file mode 100644
index 00000000..e0455440
--- /dev/null
+++ b/dotnet/w365-computer-use/sample-agent/W365ComputerUseSample.csproj
@@ -0,0 +1,37 @@ + + + + net8.0 + enable + 
a3c1d2e4-f567-8901-abcd-ef0123456789 + enable + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dotnet/w365-computer-use/sample-agent/a365.config.example.json b/dotnet/w365-computer-use/sample-agent/a365.config.example.json new file mode 100644 index 00000000..5b2a5084 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/a365.config.example.json @@ -0,0 +1,31 @@ +{ + "tenantId": "<>", + "subscriptionId": "<>", + "resourceGroup": "<>", + "location": "westus2", + "environment": "prod", + "messagingEndpoint": "https://<>.azurewebsites.net/api/messages", + "needDeployment": true, + "webAppName": "<>", + "appServicePlanName": "<>", + "appServicePlanSku": "B1", + "graphBaseUrl": "https://graph.microsoft.com", + "agentIdentityDisplayName": "W365 Computer Use Identity", + "agentBlueprintDisplayName": "W365_ComputerUse_Blueprint", + "agentUserPrincipalName": "w365cua@<>.onmicrosoft.com", + "agentUserDisplayName": "W365 Computer Use Agent User", + "managerEmail": "<>", + "agentUserUsageLocation": "US", + "agentDescription": "W365 Computer Use sample agent", + "useBlueprint": false, + "customBlueprintPermissions": [ + { + "resourceAppId": "ea9ffc3e-8a23-4a7d-836d-234d7c7565c1", + "resourceName": "Agent 365 Tools", + "scopes": [ + "McpServersMetadata.Read.All", + "McpServers.W365ComputerUse.All" + ] + } + ] +} diff --git a/dotnet/w365-computer-use/sample-agent/appsettings.json b/dotnet/w365-computer-use/sample-agent/appsettings.json new file mode 100644 index 00000000..a89c9724 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/appsettings.json @@ -0,0 +1,79 @@ +{ + "AgentApplication": { + "StartTypingTimer": false, + "RemoveRecipientMention": false, + "NormalizeMentions": false, + "AgenticAuthHandlerName": "agentic", + "UserAuthorization": { + "AutoSignin": false, + "Handlers": { + "agentic": { + "Type": "AgenticUserAuthorization", + "Settings": { + "Scopes": [ + "https://graph.microsoft.com/.default" + ] + } + } + } + } + }, + + 
"TokenValidation": { + "Enabled": false, + "Audiences": [ + "{{ClientId}}" + ] + }, + + "Logging": { + "LogLevel": { + "Default": "Information", + "Microsoft.AspNetCore": "Warning", + "Microsoft.Agents": "Warning", + "Microsoft.Hosting.Lifetime": "Information", + "System.Net.Http": "Warning" + } + }, + "AllowedHosts": "*", + "Connections": { + "ServiceConnection": { + "Settings": { + "AuthType": "UserManagedIdentity", + "AuthorityEndpoint": "https://login.microsoftonline.com/{{BOT_TENANT_ID}}", + "ClientId": "{{BOT_ID}}", + "Scopes": [ + "5a807f24-c9de-44ee-a3a7-329e88a00ffc/.default" + ] + } + } + }, + "ConnectionsMap": [ + { + "ServiceUrl": "*", + "Connection": "ServiceConnection" + } + ], + + "AIServices": { + "AzureOpenAI": { + "DeploymentName": "<>", + "ModelName": "", + "Endpoint": "<>", + "ApiKey": "<>", + "ApiVersion": "2025-04-01-preview" + } + }, + + "ComputerUse": { + "MaxIterations": 30, + "DisplayWidth": 1024, + "DisplayHeight": 768 + }, + + "Screenshots": { + "LocalPath": "./Screenshots", + "OneDriveFolder": "CUA-Sessions", + "OneDriveUserId": "" + } +} diff --git a/dotnet/w365-computer-use/sample-agent/nuget.config b/dotnet/w365-computer-use/sample-agent/nuget.config new file mode 100644 index 00000000..765346e5 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/nuget.config @@ -0,0 +1,7 @@ + + + + + + + diff --git a/dotnet/w365-computer-use/sample-agent/telemetry/A365OtelWrapper.cs b/dotnet/w365-computer-use/sample-agent/telemetry/A365OtelWrapper.cs new file mode 100644 index 00000000..c0051f47 --- /dev/null +++ b/dotnet/w365-computer-use/sample-agent/telemetry/A365OtelWrapper.cs @@ -0,0 +1,74 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+
+using Microsoft.Agents.A365.Observability.Caching;
+using Microsoft.Agents.A365.Observability.Runtime.Common;
+using Microsoft.Agents.A365.Runtime.Utils;
+using Microsoft.Agents.Builder;
+using Microsoft.Agents.Builder.App.UserAuth;
+using Microsoft.Agents.Builder.State;
+using W365ComputerUseSample.Telemetry;
+
+namespace W365ComputerUseSample;
+
+/// <summary>
+/// Runs an agent turn inside the sample's telemetry scope (<see cref="AgentMetrics"/>) with
+/// tenant/agent baggage attached, and registers the turn with the observability token cache.
+/// </summary>
+public static class A365OtelWrapper
+{
+    // NOTE(review): the type arguments on IExporterTokenCache<> and Func<> were missing from the
+    // reviewed text. They are restored from usage (RegisterObservability receives an
+    // AgenticTokenStruct; func is awaited without a result) - confirm against the original file.
+
+    /// <summary>
+    /// Resolves the agent and tenant ids for the turn, publishes them as baggage, registers the
+    /// turn with <paramref name="agentTokenCache"/> (when one is supplied) and then awaits
+    /// <paramref name="func"/>, all inside <see cref="AgentMetrics.InvokeObservedAgentOperation"/>.
+    /// </summary>
+    /// <param name="operationName">Name handed to the telemetry wrapper for this operation.</param>
+    /// <param name="turnContext">Current turn; source of the agent and tenant ids.</param>
+    /// <param name="turnState">Current turn state. Not read by this method.</param>
+    /// <param name="agentTokenCache">Optional cache to register the turn with; skipped when null.</param>
+    /// <param name="authSystem">User-authorization system used to obtain the turn token.</param>
+    /// <param name="authHandlerName">Name of the auth handler the token is requested from.</param>
+    /// <param name="logger">Optional logger that receives registration failures.</param>
+    /// <param name="func">The agent work to execute.</param>
+    public static async Task InvokeObservedAgentOperation(
+        string operationName,
+        ITurnContext turnContext,
+        ITurnState turnState,
+        IExporterTokenCache<AgenticTokenStruct>? agentTokenCache,
+        UserAuthorization authSystem,
+        string authHandlerName,
+        ILogger? logger,
+        Func<Task> func)
+    {
+        await AgentMetrics.InvokeObservedAgentOperation(
+            operationName,
+            turnContext,
+            async () =>
+            {
+                (string agentId, string tenantId) = await ResolveTenantAndAgentId(turnContext, authSystem, authHandlerName).ConfigureAwait(false);
+
+                using var baggageScope = new BaggageBuilder()
+                    .TenantId(tenantId)
+                    .AgentId(agentId)
+                    .Build();
+
+                try
+                {
+                    agentTokenCache?.RegisterObservability(agentId, tenantId, new AgenticTokenStruct
+                    {
+                        UserAuthorization = authSystem,
+                        TurnContext = turnContext,
+                        AuthHandlerName = authHandlerName
+                    }, EnvironmentUtils.GetObservabilityAuthenticationScope());
+                }
+                catch (Exception ex) when (ex is ArgumentException or InvalidOperationException)
+                {
+                    // Registration is best-effort: these failures are logged and the turn continues.
+                    // The exception object is passed along so its stack trace reaches the log.
+                    logger?.LogWarning(ex, "There was an error registering for observability: {Message}", ex.Message);
+                }
+
+                await func().ConfigureAwait(false);
+            }).ConfigureAwait(false);
+    }
+
+    /// <summary>
+    /// Determines the agent and tenant ids for the turn, substituting <see cref="Guid.Empty"/>
+    /// (as a string) for whichever one cannot be resolved.
+    /// </summary>
+    private static async Task<(string agentId, string tenantId)> ResolveTenantAndAgentId(ITurnContext turnContext, UserAuthorization authSystem, string authHandlerName)
+    {
+        string agentId = "";
+        if (turnContext?.Activity?.IsAgenticRequest() == true)
+        {
+            agentId = turnContext.Activity.GetAgenticInstanceId();
+        }
+        else if (authSystem != null && !string.IsNullOrEmpty(authHandlerName) && turnContext != null)
+        {
+            // Not an agentic request: derive the identity from the turn token instead.
+            agentId = Utility.ResolveAgentIdentity(turnContext, await authSystem.GetTurnTokenAsync(turnContext, authHandlerName).ConfigureAwait(false));
+        }
+
+        if (string.IsNullOrEmpty(agentId))
+        {
+            agentId = Guid.Empty.ToString();
+        }
+
+        // Prefer the conversation's tenant id, then the recipient's, then the empty GUID.
+        string tenantId = turnContext?.Activity?.Conversation?.TenantId
+            ?? turnContext?.Activity?.Recipient?.TenantId
+            ?? Guid.Empty.ToString();
+
+        return (agentId, tenantId);
+    }
+}
diff --git a/dotnet/w365-computer-use/sample-agent/telemetry/AgentMetrics.cs b/dotnet/w365-computer-use/sample-agent/telemetry/AgentMetrics.cs
new file mode 100644
index 00000000..a8af8a0e
--- /dev/null
+++ b/dotnet/w365-computer-use/sample-agent/telemetry/AgentMetrics.cs
@@ -0,0 +1,114 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using Microsoft.Agents.Builder;
+using Microsoft.Agents.Core;
+using System.Diagnostics;
+using System.Diagnostics.Metrics;
+
+namespace W365ComputerUseSample.Telemetry;
+
+/// <summary>
+/// Activity source, meter and instruments for the sample agent, plus helpers that wrap an
+/// operation in an activity and record its outcome.
+/// </summary>
+public static class AgentMetrics
+{
+    /// <summary>Name shared by the activity source and the meter.</summary>
+    public static readonly string SourceName = "A365.W365ComputerUse";
+
+    public static readonly ActivitySource ActivitySource = new(SourceName);
+
+    // Uses SourceName rather than a repeated literal so the meter name cannot drift from the source name.
+    private static readonly Meter Meter = new(SourceName, "1.0.0");
+
+    // NOTE(review): the instrument type arguments were missing from the reviewed text; <long> is
+    // inferred from usage (Add(1), Record(long)) - confirm against the original file.
+    public static readonly Counter<long> MessageProcessedCounter = Meter.CreateCounter<long>(
+        "agent.messages.processed",
+        "messages",
+        "Number of messages processed by the agent");
+
+    public static readonly Histogram<long> MessageProcessingDuration = Meter.CreateHistogram<long>(
+        "agent.message.processing.duration",
+        "ms",
+        "Duration of message processing in milliseconds");
+
+    public static readonly Counter<long> CuaActionsExecuted = Meter.CreateCounter<long>(
+        "agent.cua.actions.executed",
+        "actions",
+        "Number of CUA computer actions executed");
+
+    /// <summary>
+    /// Starts an activity named <paramref name="handlerName"/> and tags it with details of the turn.
+    /// </summary>
+    /// <returns>
+    /// The started activity. <see cref="System.Diagnostics.ActivitySource.StartActivity(string, ActivityKind)"/>
+    /// returns null when no listener is interested, so the result can be null at run time despite
+    /// the non-nullable return type; callers should use null-conditional access.
+    /// </returns>
+    public static Activity InitializeMessageHandlingActivity(string handlerName, ITurnContext context)
+    {
+        var activity = ActivitySource.StartActivity(handlerName);
+        activity?.SetTag("Activity.Type", context.Activity.Type.ToString());
+        activity?.SetTag("Agent.IsAgentic", context.IsAgenticRequest());
+        activity?.SetTag("Caller.Id", context.Activity.From?.Id);
+        activity?.SetTag("Conversation.Id", context.Activity.Conversation?.Id);
+        activity?.SetTag("Channel.Id", context.Activity.ChannelId?.ToString());
+
+        return activity!;
+    }
+
+    /// <summary>
+    /// Records the processing duration, sets the final status on <paramref name="activity"/> and ends it.
+    /// </summary>
+    /// <param name="activity">Activity to finish; may be null when no listener was attached.</param>
+    /// <param name="context">Turn whose conversation and channel ids tag the duration measurement.</param>
+    /// <param name="duration">Elapsed processing time in milliseconds.</param>
+    /// <param name="success">Whether the operation completed without throwing.</param>
+    public static void FinalizeMessageHandlingActivity(Activity? activity, ITurnContext context, long duration, bool success)
+    {
+        // NOTE(review): Conversation.Id looks like a per-conversation value, which would make this
+        // a high-cardinality metric dimension - confirm this is intended.
+        MessageProcessingDuration.Record(duration,
+            new("Conversation.Id", context.Activity.Conversation?.Id ?? "unknown"),
+            new("Channel.Id", context.Activity.ChannelId?.ToString() ?? "unknown"));
+
+        if (activity is null)
+        {
+            return;
+        }
+
+        if (success)
+        {
+            activity.SetStatus(ActivityStatusCode.Ok);
+        }
+        else if (activity.Status != ActivityStatusCode.Error)
+        {
+            // Only set Error when nothing has yet: SetStatus(Error) without a description
+            // would erase the exception message a catch block already stored.
+            activity.SetStatus(ActivityStatusCode.Error);
+        }
+
+        // Dispose() stops the activity if it is still running, so no separate Stop() is needed.
+        activity.Dispose();
+    }
+
+    /// <summary>
+    /// Runs <paramref name="func"/> inside an activity named <paramref name="operationName"/>,
+    /// marking it Ok on success and recording the exception before rethrowing on failure.
+    /// </summary>
+    public static async Task InvokeObservedHttpOperation(string operationName, Func<Task> func)
+    {
+        using var activity = ActivitySource.StartActivity(operationName);
+        try
+        {
+            await func().ConfigureAwait(false);
+            activity?.SetStatus(ActivityStatusCode.Ok);
+        }
+        catch (Exception ex)
+        {
+            RecordException(activity, ex);
+            throw;
+        }
+    }
+
+    /// <summary>
+    /// Counts the message, runs <paramref name="func"/> inside a message-handling activity and
+    /// always records the duration and final status, rethrowing any exception.
+    /// </summary>
+    public static async Task InvokeObservedAgentOperation(string operationName, ITurnContext context, Func<Task> func)
+    {
+        MessageProcessedCounter.Add(1);
+        var activity = InitializeMessageHandlingActivity(operationName, context);
+        var stopwatch = Stopwatch.StartNew();
+        bool success = false;
+        try
+        {
+            await func().ConfigureAwait(false);
+            success = true;
+        }
+        catch (Exception ex)
+        {
+            RecordException(activity, ex);
+            throw;
+        }
+        finally
+        {
+            stopwatch.Stop();
+            FinalizeMessageHandlingActivity(activity, context, stopwatch.ElapsedMilliseconds, success);
+        }
+    }
+
+    /// <summary>Marks the activity as failed and attaches an "exception" event describing <paramref name="ex"/>.</summary>
+    private static void RecordException(Activity? activity, Exception ex)
+    {
+        if (activity is null)
+        {
+            return;
+        }
+
+        activity.SetStatus(ActivityStatusCode.Error, ex.Message);
+        activity.AddEvent(new ActivityEvent("exception", DateTimeOffset.UtcNow, new()
+        {
+            ["exception.type"] = ex.GetType().FullName,
+            ["exception.message"] = ex.Message,
+            ["exception.stacktrace"] = ex.StackTrace
+        }));
+    }
+}