diff --git a/contoso-trading b/contoso-trading new file mode 160000 index 000000000..010b65084 --- /dev/null +++ b/contoso-trading @@ -0,0 +1 @@ +Subproject commit 010b650841fd6b0a85cdde1677afbbee4ef06f28 diff --git a/labs/deployment-guard/README.md b/labs/deployment-guard/README.md new file mode 100644 index 000000000..b5bb909f7 --- /dev/null +++ b/labs/deployment-guard/README.md @@ -0,0 +1,240 @@ +# Deployment Guard Lab + +Shift-left reliability with SRE Agent: catch breaking changes in PRs **before** they reach production. This lab sets up an SRE Agent with an HTTP trigger that receives GitHub PR events, deploys changes to staging, compares health metrics against production, and posts a risk assessment as a PR comment. + +## What You'll Learn + +1. Deploy an SRE Agent with the `law-dynatrace-github-httptrigger-prvalidation` recipe +2. Wire a GitHub repo to the agent via Logic App webhook bridge +3. Create a PR with a subtle breaking change and watch the agent catch it +4. Understand how deployment guard analysis works end-to-end + +## Architecture + +``` +┌─────────────────┐ PR event ┌──────────────────┐ webhook ┌──────────────┐ +│ GitHub Repo │ ──────────────→ │ GitHub Actions │ ────────────→ │ Logic App │ +│ (contoso-trading)│ │ (PR workflow) │ │ (bridge) │ +└─────────────────┘ └──────────────────┘ └──────┬───────┘ + │ + HTTP trigger + │ + ▼ + ┌──────────────────┐ + │ SRE Agent │ + │ deployment-guard │ + │ subagent │ + └────────┬─────────┘ + │ + ┌───────────────────────────────┼───────────────────────────────┐ + │ │ │ + ▼ ▼ ▼ + Read PR diff from Deploy PR changes to Query Dynatrace + + connected GitHub repo staging environment LAW baselines + │ + ▼ + Run canary traffic + for 2-3 minutes + │ + ▼ + Compare staging vs prod + health metrics + │ + ▼ + Post risk assessment + comment on PR +``` + +## Prerequisites + +- Azure subscription with Contributor access +- Dynatrace environment with MCP gateway access +- Tools: `az`, `gh`, `jq` + +## Step 0 — Deploy the 
Sample App (contoso-trading) + +Fork and deploy [contoso-trading](https://github.com/dm-chelupati/contoso-trading) to two environments — production and staging. The app is a microservices trading platform (gateway, order-service, payment-service) running on Azure Container Apps. + +```bash +# Fork the repo +gh repo fork dm-chelupati/contoso-trading --clone + +cd contoso-trading + +# Deploy production +azd env new contoso-prod +azd env set AZURE_LOCATION eastus2 +azd up + +# Deploy staging (same app, separate resource group) +azd env new contoso-staging +azd env set AZURE_LOCATION eastus2 +azd up +``` + +After both environments are running, note: +- **Production RG**: `rg-contoso-prod` (or whatever `azd` created) +- **Staging RG**: `rg-contoso-staging` +- **LAW resource ID**: Find it in the production RG — `az resource list --resource-group rg-contoso-prod --resource-type Microsoft.OperationalInsights/workspaces --query "[0].id" -o tsv` + +## Step 1 — Deploy the SRE Agent + +Use the `law-dynatrace-github-httptrigger-prvalidation` recipe from the templates (replace the UPPER_CASE placeholders with your own values): + +```bash +cd sreagent-templates + +./bin/new-agent.sh --recipe law-dynatrace-github-httptrigger-prvalidation --non-interactive \ + --set agentName=deployment-guard-lab \ + --set resourceGroup=rg-deployment-guard-lab \ + --set location=eastus2 \ + --set lawId=/subscriptions/SUBSCRIPTION_ID/resourceGroups/RESOURCE_GROUP/providers/Microsoft.OperationalInsights/workspaces/WORKSPACE_NAME \ + --set dtTenant=DYNATRACE_TENANT_ID \ + --set dtToken=DYNATRACE_API_TOKEN \ + --set githubRepo=OWNER/contoso-trading \ + --set targetRGs=rg-contoso-prod,rg-contoso-staging \ + -o deployment-guard-lab/ + +./bin/deploy.sh deployment-guard-lab/ +``` + +The deploy script will print a GitHub OAuth URL at the end. Open it in your browser and approve the SRE Agent app to connect your fork of contoso-trading. + +## Step 2 — Get the Webhook URL + +After deployment, the agent has a Logic App webhook bridge. 
Get the trigger URL: + +```bash +# Find the Logic App in the agent's resource group +LOGIC_APP=$(az resource list \ + --resource-group rg-deployment-guard-lab \ + --resource-type Microsoft.Logic/workflows \ + --query "[0].name" -o tsv) + +# Get the callback URL for the HTTP trigger +WEBHOOK_URL=$(az rest --method POST \ + --url "https://management.azure.com/subscriptions/$(az account show --query id -o tsv)/resourceGroups/rg-deployment-guard-lab/providers/Microsoft.Logic/workflows/${LOGIC_APP}/triggers/manual/listCallbackUrl?api-version=2016-06-01" \ + --query "value" -o tsv) + +echo "Webhook URL: $WEBHOOK_URL" +``` + +## Step 3 — Wire GitHub to the Agent + +### Option A: Use the setup script + +```bash +cd labs/deployment-guard # from the repository root +bash scripts/setup-github-workflow.sh \ + --repo OWNER/contoso-trading \ + --webhook-url "$WEBHOOK_URL" +``` + +### Option B: Manual setup + +1. Copy the workflow to your contoso-trading fork: + +```bash +cp sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/data/sample-github-workflow.yml \ + /path/to/contoso-trading/.github/workflows/sre-agent-pr-guard.yml +cd /path/to/contoso-trading +git add .github/workflows/sre-agent-pr-guard.yml +git commit -m "Add SRE Agent PR deployment guard" +git push +``` + +2. 
Add the webhook URL as a GitHub secret: + +```bash +gh secret set SRE_AGENT_WEBHOOK_URL \ + --repo OWNER/contoso-trading \ + --body "$WEBHOOK_URL" +``` + +## Step 4 — Test with a Risky PR + +Now create a PR that introduces a subtle breaking change: + +```bash +cd /path/to/contoso-trading +git checkout main && git pull +git checkout -b config-cleanup + +# Rename a database env var — looks like a cleanup but breaks payment-service (macOS/BSD sed syntax; with GNU sed on Linux drop the '' after -i) +sed -i '' 's|DATABASE_URL|DB_CONNECTION_URL|g' payment-service/Program.cs + +git add -A +git commit -m "Standardize database env var naming" +git push origin config-cleanup + +# Create the PR +gh pr create \ + --title "Standardize database env var naming" \ + --body "Renamed DATABASE_URL to DB_CONNECTION_URL for consistency with other services." \ + --base main \ + --head config-cleanup +``` + +### What happens next + +1. GitHub Actions fires the `sre-agent-pr-guard` workflow +2. The workflow sends the PR event to the Logic App webhook URL +3. The Logic App forwards it to the SRE Agent's HTTP trigger +4. 
The `deployment-guard` subagent activates and: + - Reads the PR diff (sees `DATABASE_URL` → `DB_CONNECTION_URL`) + - Captures production baselines from Dynatrace + LAW + - Deploys the PR changes to staging + - Sends canary traffic to staging endpoints + - Detects that payment-service can't connect to the database (env var mismatch) + - Posts a **CRITICAL** risk assessment as a PR comment + +### Expected PR Comment + +The agent should post something like: + +> **🔴 CRITICAL Risk — Do not merge** +> +> | Check | Result | +> |---|---| +> | Static Analysis | `DATABASE_URL` renamed to `DB_CONNECTION_URL` in payment-service — env var mismatch with deployment config | +> | Staging Deploy | ✅ Deployed | +> | Canary Tests | ❌ payment-service returning 500 — database connection failed | +> | Health Comparison | Production: 0 errors, Staging: 100% error rate on /api/payments | +> +> **Root Cause**: The `DATABASE_URL` environment variable is defined in the Container App configuration but the code now reads `DB_CONNECTION_URL`. The payment service cannot connect to the database. +> +> **Recommendation**: Either update the Container App env var to `DB_CONNECTION_URL` or revert the code change. + +## Step 5 — Clean Up + +```bash +# Close the test PR +gh pr close config-cleanup --repo /contoso-trading --delete-branch + +# Delete the agent (optional) +az group delete --name rg-deployment-guard-lab --yes --no-wait +``` + +## Lab Scenarios + +### Scenario 1: Safe change (LOW risk) +Update a log message or comment — agent should report LOW risk. + +### Scenario 2: Performance regression (MEDIUM risk) +Add a `Thread.Sleep(500)` or `await Task.Delay(500)` to a hot path — agent should detect latency increase. + +### Scenario 3: Breaking change (CRITICAL risk) +Rename an env var or remove a health check endpoint — agent should flag it. + +### Scenario 4: Silent data corruption (HIGH risk) +Change a calculation or data mapping — app returns 200 but wrong data. 
Agent compares response payloads against baselines and catches the difference. + +## Troubleshooting + +| Issue | Fix | +|---|---| +| Webhook not firing | Check GitHub Actions logs — is `SRE_AGENT_WEBHOOK_URL` secret set? | +| Agent not responding | Check Logic App run history in Azure portal | +| No PR comment | Verify GitHub repo is connected in SRE Agent portal (Settings → Repos) | +| Staging deploy fails | Check agent has `RunAzCliWriteCommands` tool and Contributor role on staging RG | +| Dynatrace queries empty | Verify Dynatrace MCP connector is connected (Settings → Connectors) | diff --git a/labs/deployment-guard/azure.yaml b/labs/deployment-guard/azure.yaml new file mode 100644 index 000000000..3843f179a --- /dev/null +++ b/labs/deployment-guard/azure.yaml @@ -0,0 +1,17 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/Azure/azure-dev/main/schemas/v1.0/azure.yaml.json + +name: deployment-guard-demo +metadata: + template: deployment-guard-demo@1.0.0 + +# This lab uses an existing application repo (contoso-trading) and deploys +# an SRE Agent configured with the law-dynatrace-httptrigger recipe. +# The agent's HTTP trigger receives GitHub PR webhooks via a Logic App bridge +# and runs deployment guard analysis on every PR. +# +# Prerequisites: +# - Fork or clone https://github.com/dm-chelupati/contoso-trading +# - Dynatrace environment with MCP gateway access +# +# The lab does NOT provision the app infrastructure (contoso-trading has its own). +# It provisions only the SRE Agent + webhook bridge. diff --git a/labs/deployment-guard/docs/blog-shift-left-deployment-guard.md b/labs/deployment-guard/docs/blog-shift-left-deployment-guard.md new file mode 100644 index 000000000..d5143a2dc --- /dev/null +++ b/labs/deployment-guard/docs/blog-shift-left-deployment-guard.md @@ -0,0 +1,168 @@ +# Shift Left with Azure SRE Agent: An Agent That Guards Every PR + +## Azure SRE Agent can do more than investigate production incidents. 
With HTTP triggers and a deployment guard skill, it analyzes pull requests by deploying changes to staging, comparing health metrics against production baselines, and posting risk assessments directly on the PR — before the code is merged. + +## The Gap Between Code Review and Production + +Most teams have two reliability checkpoints: code review (before merge) and monitoring (after deployment). The gap between them is where subtle breaking changes slip through. + +A renamed environment variable, a removed health check endpoint, a changed database schema — these changes pass code review because they look correct in isolation. They pass CI because nobody wrote a test for the specific interaction between the code change and the deployment configuration. They reach production, and the first signal is an alert at 2 AM. + +The challenge is cross-referencing: a human reviewer would need to compare the PR diff against the live infrastructure config, the deployment environment variables, and the production health baselines. In practice, this doesn't happen for routine changes. + +Azure SRE Agent's HTTP trigger capability fills this gap by inserting an automated reliability check into the PR workflow. 
+ +## How It Works + +The deployment guard uses a webhook bridge pattern: + +``` +GitHub PR → GitHub Actions workflow → Logic App webhook bridge → SRE Agent HTTP trigger + ↓ + deployment-guard subagent + ↓ + ┌───────────────────┼───────────────────┐ + ↓ ↓ ↓ + Read PR diff Deploy to staging Query Dynatrace + ↓ + LAW baselines + Canary traffic + ↓ + Compare health + ↓ + Post risk assessment + comment on PR +``` + +Here's the agent configured with the deployment guard skill, HTTP trigger, and connectors: + + +![Agent builder canvas with deployment guard configuration](images/agent-builder-canvas.png) + +The HTTP trigger receives PR events from GitHub via a Logic App webhook bridge and routes them to the deployment-guard subagent in autonomous mode: + + +![HTTP trigger configuration for pr-deployment-guard](images/http-trigger-config.png) + +When a developer opens a PR, GitHub Actions sends the event to the SRE Agent via the Logic App bridge. The agent's deployment guard subagent runs a 9-step analysis: + +1. **Read the PR diff** from the connected GitHub repo — identify what changed (app code, IaC, config, DB schema, dependencies) +2. **Static analysis** — check for breaking patterns: renamed env vars, removed endpoints, changed schemas, missing error handling +3. **Capture production baselines** — query Dynatrace and Log Analytics for current error rates, latency percentiles, and throughput. Send test requests to production endpoints and record response structure +4. **Deploy to staging** — use `az containerapp update` to deploy the PR's changes to the staging environment +5. **Canary traffic** — send synthetic HTTP requests to staging endpoints for 2-3 minutes to exercise affected code paths +6. **Validate responses** — compare staging API responses against production baselines. Catch cases where the app returns 200 OK but serves degraded or incorrect data +7. **Monitor health** — query Dynatrace and LAW for staging metrics over 5 minutes. 
Compare against production +8. **Risk assessment** — classify as LOW, MEDIUM, HIGH, or CRITICAL +9. **Post PR comment** — structured report with risk level, static analysis findings, canary test results, health comparison table, and recommendation + +## Risk Levels + +| Risk | Criteria | Example | +|---|---|---| +| LOW | No functional or performance changes detected | Updated a log message or code comment | +| MEDIUM | Minor changes, no regressions in staging | Added a new optional query parameter | +| HIGH | Behavioral regression detected, staging still functional | Response payload changed, latency increased 2x | +| CRITICAL | Staging failing or data integrity compromised | Database connection failed, endpoints returning 500 | + +## Example: Environment Variable Rename + +A PR titled "Standardize database env var naming" renames `DATABASE_URL` to `DB_CONNECTION_URL` in the payment service. The commit is clean, the description is clear, and the change looks like responsible housekeeping. + +The deployment guard: +- Reads the diff and flags `DATABASE_URL` → `DB_CONNECTION_URL` as a potential env var mismatch +- Deploys to staging — the Container App's environment variables still define `DATABASE_URL` +- Sends canary traffic to the payment-service endpoint +- Gets 500 errors — the service can't find `DB_CONNECTION_URL` and fails to connect to the database +- Posts a CRITICAL risk assessment on the PR: + +| Check | Result | +|---|---| +| Static Analysis | `DATABASE_URL` renamed to `DB_CONNECTION_URL` — env var mismatch with deployment config | +| Staging Deploy | Deployed | +| Canary Tests | payment-service returning 500 — database connection failed | +| Health Comparison | Production: 0 errors / Staging: 100% error rate on /api/payments | + +**Recommendation**: Update the Container App env var to `DB_CONNECTION_URL` or revert the code change. + +The developer sees this before merging. No production incident. 
+ +## How It Compares to CI Tests + +| Capability | CI Tests | Deployment Guard | +|---|---|---| +| Catches what you wrote tests for | Yes | N/A | +| Catches unanticipated regressions | No | Yes — compares against live production baselines | +| Compares response payloads against production | No | Yes — detects silent data degradation | +| Cross-references code against infrastructure config | No | Yes — reads the diff and checks env vars, endpoints, schemas | +| Requires pre-written test cases | Yes | No — uses real traffic against a real staging deployment | + +The deployment guard complements CI — it doesn't replace it. CI validates correctness against known expectations. The deployment guard validates behavior against the live production environment. + +## Setting It Up + +### Step 1 — Deploy an agent with the `law-dynatrace-github-httptrigger-prvalidation` recipe + +```bash +cd sreagent-templates + +./bin/new-agent.sh --recipe law-dynatrace-github-httptrigger-prvalidation --non-interactive \ + --set agentName=my-deployment-guard \ + --set resourceGroup=rg-sre-guard \ + --set location=eastus2 \ + --set lawId=/subscriptions//resourceGroups//providers/Microsoft.OperationalInsights/workspaces/ \ + --set dtTenant= \ + --set dtToken= \ + --set githubRepo=/ \ + --set targetRGs=rg-prod,rg-staging \ + -o my-deployment-guard/ + +./bin/deploy.sh my-deployment-guard/ +``` + +The recipe includes: + +| Component | What it does | +|---|---| +| **deployment-guard-analysis** skill | 9-step PR analysis workflow | +| **deployment-guard** subagent | Autonomous agent with access to az CLI, Dynatrace, LAW, GitHub | +| **pr-deployment-guard** HTTP trigger | Receives webhook events and routes to the subagent | +| **Log Analytics connector** | Azure-side logs and metrics | +| **Dynatrace MCP connector** | Application performance data | +| **Safety hooks** | deny-prod-deletes, require-approval-for-restarts | + +### Step 2 — Copy the GitHub workflow to your app repo + +The recipe 
generates a sample workflow at `data/sample-github-workflow.yml`. Copy it to your app repo: + +```bash +cp my-deployment-guard/data/sample-github-workflow.yml \ + /path/to/your-app/.github/workflows/sre-agent-pr-guard.yml +``` + +### Step 3 — Set the webhook secret + +Get the Logic App trigger URL from the agent's webhook bridge and add it as a GitHub secret: + +```bash +gh secret set SRE_AGENT_WEBHOOK_URL --repo / --body "" +``` + +### Step 4 — Open a PR and watch the agent analyze it + +Every PR on the app repo now triggers the deployment guard. The agent posts its risk assessment as a PR comment within 5-10 minutes (baseline capture + canary testing + analysis). + +## Lab and Recipe + +| Resource | Description | +|---|---| +| [law-dynatrace-github-httptrigger-prvalidation recipe](https://github.com/microsoft/sre-agent/tree/main/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation) | Deploy an agent with LAW + Dynatrace + HTTP trigger + deployment guard pre-configured | +| [deployment-guard lab](https://github.com/microsoft/sre-agent/tree/main/labs/deployment-guard) | End-to-end walkthrough using [contoso-trading](https://github.com/dm-chelupati/contoso-trading) as the target app — includes a demo script that creates a risky PR and shows the agent's response | +| [Inside SRE Agent Live](https://www.youtube.com/@InsideSREAgent) | Live demo recordings | + +## Learn More + +- [HTTP Triggers](https://sre.azure.com/docs/capabilities/http-triggers) — Configuring webhook-based automation +- [Skills](https://sre.azure.com/docs/capabilities/skills) — Creating custom analysis workflows +- [Subagents](https://sre.azure.com/docs/capabilities/subagents) — Dedicated agents with scoped tools and instructions +- [Connectors](https://sre.azure.com/docs/capabilities/connectors) — Connecting Log Analytics, Dynatrace, and other data sources +- [SRE Agent Templates](https://github.com/microsoft/sre-agent) — Recipes, labs, and deployment tooling diff --git 
a/labs/deployment-guard/docs/images/01-portal-home.png b/labs/deployment-guard/docs/images/01-portal-home.png new file mode 100644 index 000000000..8e31e3309 Binary files /dev/null and b/labs/deployment-guard/docs/images/01-portal-home.png differ diff --git a/labs/deployment-guard/docs/images/02-agent-overview.png b/labs/deployment-guard/docs/images/02-agent-overview.png new file mode 100644 index 000000000..bc267660c Binary files /dev/null and b/labs/deployment-guard/docs/images/02-agent-overview.png differ diff --git a/labs/deployment-guard/docs/images/agent-builder-canvas.png b/labs/deployment-guard/docs/images/agent-builder-canvas.png new file mode 100644 index 000000000..78b67aa1a Binary files /dev/null and b/labs/deployment-guard/docs/images/agent-builder-canvas.png differ diff --git a/labs/deployment-guard/docs/images/debug-agent-page.png b/labs/deployment-guard/docs/images/debug-agent-page.png new file mode 100644 index 000000000..78b67aa1a Binary files /dev/null and b/labs/deployment-guard/docs/images/debug-agent-page.png differ diff --git a/labs/deployment-guard/scripts/demo.sh b/labs/deployment-guard/scripts/demo.sh new file mode 100644 index 000000000..6511e32d1 --- /dev/null +++ b/labs/deployment-guard/scripts/demo.sh @@ -0,0 +1,137 @@ +#!/bin/bash +# ============================================================ +# demo.sh — Run the deployment guard demo end-to-end +# +# This script creates a risky PR on contoso-trading and watches +# the SRE Agent analyze it via the HTTP trigger. 
+# +# Usage: +# bash demo.sh --repo OWNER/REPO [--app-dir PATH] +# +# Prerequisites: +# - SRE Agent deployed with the law-dynatrace-github-httptrigger-prvalidation recipe +# - GitHub workflow + webhook secret configured (setup-github-workflow.sh) +# - contoso-trading cloned locally +# ============================================================ +set -euo pipefail + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +REPO="" +APP_DIR="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --repo) REPO="${2:?--repo requires a value}"; shift 2 ;; + --app-dir) APP_DIR="${2:?--app-dir requires a value}"; shift 2 ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac +done + +if [[ -z "$REPO" ]]; then + echo "Usage: $0 --repo OWNER/REPO [--app-dir PATH]" + exit 1 +fi + +# Default app dir to ~/contoso-trading +APP_DIR="${APP_DIR:-$HOME/contoso-trading}" + +if [[ ! -d "$APP_DIR" ]]; then + echo -e "${RED}contoso-trading not found at $APP_DIR${NC}" + echo "Clone it first: gh repo clone $REPO $APP_DIR" + exit 1 +fi + +echo -e "${BLUE}═══════════════════════════════════════════════════════════${NC}" +echo -e "${BLUE} Deployment Guard Demo${NC}" +echo -e "${BLUE}═══════════════════════════════════════════════════════════${NC}" + +# ───────────────────────────────────────────────────────── +# PREP: Clean up any previous demo branches (must start from an up-to-date main) +# ───────────────────────────────────────────────────────── +echo -e "\n${YELLOW}[PREP] Cleaning up previous demo state...${NC}" +cd "$APP_DIR" +if ! git checkout main || ! git pull; then echo -e "${RED}Could not switch to an up-to-date main in $APP_DIR${NC}" >&2; exit 1; fi +git branch -D config-cleanup 2>/dev/null || true +git push origin --delete config-cleanup 2>/dev/null || true + +# Close any existing demo PRs (best effort: deleting the branch above usually closes them already) +EXISTING_PR=$(gh pr list --repo "$REPO" --head config-cleanup --json number -q '.[0].number' 2>/dev/null || echo "") +if [[ -n "$EXISTING_PR" ]]; then + gh pr close "$EXISTING_PR" --repo "$REPO" --delete-branch 2>/dev/null || true +fi +echo -e "${GREEN} ✓ Clean state${NC}" + +# ───────────────────────────────────────────────────────── +# ACT 1: Create a risky change +# 
───────────────────────────────────────────────────────── +echo -e "\n${YELLOW}[ACT 1] Creating a subtle breaking change...${NC}" +git checkout -b config-cleanup + +# Rename DATABASE_URL to DB_CONNECTION_URL — looks like a cleanup +# but breaks payment-service because the env var is still DATABASE_URL (BSD sed form first, GNU sed form as fallback) +sed -i '' 's|DATABASE_URL|DB_CONNECTION_URL|g' payment-service/Program.cs 2>/dev/null \ + || sed -i 's|DATABASE_URL|DB_CONNECTION_URL|g' payment-service/Program.cs 2>/dev/null + +git add payment-service/Program.cs +git commit -m "Standardize database env var naming" +git push origin config-cleanup --force +echo -e "${GREEN} ✓ Pushed config-cleanup branch${NC}" + +# ───────────────────────────────────────────────────────── +# ACT 2: Open the PR — this triggers the webhook (gh pr create prints the PR URL on stdout; if a PR already exists, look it up instead) +# ───────────────────────────────────────────────────────── +echo -e "\n${YELLOW}[ACT 2] Creating PR...${NC}" +PR_URL=$(gh pr create \ + --repo "$REPO" \ + --title "Standardize database env var naming" \ + --body "Renamed DATABASE_URL to DB_CONNECTION_URL for consistency with other services." \ + --base main \ + --head config-cleanup \ + 2>/dev/null || \ + gh pr view config-cleanup --repo "$REPO" --json url -q '.url') + +echo -e "${GREEN} ✓ PR created: $PR_URL${NC}" +echo "" +echo -e "${BLUE}The GitHub Actions workflow is now sending the PR event to the SRE Agent.${NC}" +echo -e "${BLUE}Watch the PR for the agent's risk assessment comment.${NC}" +echo "" +echo -e "${YELLOW}Check progress:${NC}" +echo " GitHub Actions: gh run list --repo $REPO --limit 3" +echo " PR comments: gh pr view config-cleanup --repo $REPO --comments" +echo "" + +# ───────────────────────────────────────────────────────── +# ACT 3: Wait and show the result +# ───────────────────────────────────────────────────────── +echo -e "${YELLOW}[ACT 3] Waiting for agent to analyze the PR...${NC}" +echo " This typically takes 5-10 minutes (baseline capture + canary testing)." 
+echo "" +echo " To check manually:" +echo " gh pr view config-cleanup --repo $REPO --comments" +echo "" + +# Poll for a PR comment every 30s (30 attempts, up to 15 minutes) +for i in {1..30}; do + COMMENTS=$(gh pr view config-cleanup --repo "$REPO" --json comments --jq '.comments | length' 2>/dev/null || echo "0") + if [[ "$COMMENTS" -gt 0 ]]; then + echo -e "\n${GREEN} ✓ Agent posted a comment on the PR!${NC}" + echo "" + gh pr view config-cleanup --repo "$REPO" --comments 2>/dev/null | tail -40 + break + fi + echo " Waiting... ($(( (i - 1) * 30 ))s elapsed, $COMMENTS comments so far)" + sleep 30 +done +[[ "$COMMENTS" -gt 0 ]] || echo -e "${YELLOW} No comment after 15 minutes; check the Actions run and the Logic App run history.${NC}" + +# ───────────────────────────────────────────────────────── +# CLEANUP +# ───────────────────────────────────────────────────────── +echo "" +echo -e "${YELLOW}[CLEANUP] To clean up after the demo:${NC}" +echo " gh pr close config-cleanup --repo $REPO --delete-branch" +echo " cd $APP_DIR && git checkout main" diff --git a/labs/deployment-guard/scripts/prereqs.sh b/labs/deployment-guard/scripts/prereqs.sh new file mode 100644 index 000000000..eb6827350 --- /dev/null +++ b/labs/deployment-guard/scripts/prereqs.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# ============================================================ +# prereqs.sh — Check prerequisites for Deployment Guard Lab +# ============================================================ +set -euo pipefail + +echo "" +echo "=============================================" +echo " Deployment Guard Lab — Prerequisites" +echo "=============================================" +echo "" + +MISSING=0 + +check_tool() { # $1 = display name, $2 = command; prints its version line or bumps MISSING + local name="$1" cmd="$2" + local version + if command -v "$cmd" &>/dev/null; then + version=$("$cmd" --version 2>&1 | head -1) || true # head closing the pipe early must not abort under pipefail + echo " ✅ $name: $version" + else + echo " ❌ $name: NOT FOUND" + MISSING=$((MISSING + 1)) + fi +} + +check_tool "Azure CLI" "az" +check_tool "GitHub CLI" "gh" +check_tool "jq" "jq" + +echo "" + +# Check az login +if az account show &>/dev/null; then + ACCOUNT=$(az account show --query name -o tsv) + echo " ✅ Logged into Azure: 
$ACCOUNT" +else + echo " ❌ Not logged into Azure (run: az login)" + MISSING=$((MISSING + 1)) +fi + +# Check gh auth +if gh auth status &>/dev/null; then + echo " ✅ Logged into GitHub" +else + echo " ❌ Not logged into GitHub (run: gh auth login)" + MISSING=$((MISSING + 1)) +fi + +echo "" +if [[ $MISSING -eq 0 ]]; then + echo " All prerequisites met! ✅" +else + echo " $MISSING prerequisite(s) missing. Fix them before proceeding." + exit 1 +fi diff --git a/labs/deployment-guard/scripts/setup-github-workflow.sh b/labs/deployment-guard/scripts/setup-github-workflow.sh new file mode 100644 index 000000000..8f8f41703 --- /dev/null +++ b/labs/deployment-guard/scripts/setup-github-workflow.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# ============================================================ +# setup-github-workflow.sh — Wire a GitHub repo to the SRE Agent +# Copies the PR guard workflow and sets the webhook secret. +# +# Usage: +# bash setup-github-workflow.sh \ +# --repo OWNER/REPO \ +# --webhook-url LOGIC_APP_CALLBACK_URL +# ============================================================ +set -euo pipefail + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +REPO="" +WEBHOOK_URL="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --repo) REPO="${2:?--repo requires a value}"; shift 2 ;; + --webhook-url) WEBHOOK_URL="${2:?--webhook-url requires a value}"; shift 2 ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac +done + +if [[ -z "$REPO" || -z "$WEBHOOK_URL" ]]; then + echo "Usage: $0 --repo OWNER/REPO --webhook-url LOGIC_APP_CALLBACK_URL" + exit 1 +fi + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +RECIPE_DIR="$SCRIPT_DIR/../../../sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation" # scripts/ -> deployment-guard/ -> labs/ -> repo root +WORKFLOW_SRC="$RECIPE_DIR/data/sample-github-workflow.yml" + +if [[ ! 
-f "$WORKFLOW_SRC" ]]; then + echo -e "${RED}Workflow template not found at $WORKFLOW_SRC${NC}" + exit 1 +fi + +# Clone the repo to a temp dir (removed on every exit path by the trap), add workflow, push +WORK_DIR=$(mktemp -d) +trap 'rm -rf -- "$WORK_DIR"' EXIT +echo -e "${YELLOW}Cloning $REPO...${NC}" +gh repo clone "$REPO" "$WORK_DIR/repo" -- --depth 1 + +mkdir -p "$WORK_DIR/repo/.github/workflows" +cp "$WORKFLOW_SRC" "$WORK_DIR/repo/.github/workflows/sre-agent-pr-guard.yml" + +cd "$WORK_DIR/repo" +git add .github/workflows/sre-agent-pr-guard.yml +if git diff --cached --quiet; then + echo -e "${GREEN}Workflow already exists. Skipping.${NC}" +else + git commit -m "Add SRE Agent PR deployment guard workflow" + git push + echo -e "${GREEN}✓ Workflow pushed to $REPO${NC}" +fi + +# Set the webhook secret +echo -e "${YELLOW}Setting SRE_AGENT_WEBHOOK_URL secret...${NC}" +gh secret set SRE_AGENT_WEBHOOK_URL --repo "$REPO" --body "$WEBHOOK_URL" +echo -e "${GREEN}✓ Secret set on $REPO${NC}" + +# The temporary clone is deleted by the EXIT trap registered above +echo -e "${GREEN}Done! PRs on $REPO will now trigger the SRE Agent deployment guard.${NC}" diff --git a/sreagent-templates/agents/dg-azd-bash/README.md b/sreagent-templates/agents/dg-azd-bash/README.md new file mode 100644 index 000000000..8c6fbea0e --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/README.md @@ -0,0 +1,126 @@ +# law-dynatrace-httptrigger + +SRE Agent with Log Analytics + Dynatrace MCP connectors, GitHub repo integration, and an HTTP trigger that enables **PR deployment guard** — automated PR reviews that deploy to staging, run canary tests, and post risk assessments as PR comments. + +## Use Case + +Shift-left reliability: instead of catching production issues after deployment, the agent reviews every PR by deploying changes to a staging environment, comparing health metrics against production baselines, and flagging regressions before merge. 
+ +## Prerequisites + +- Azure subscription with **production** and **staging** resource groups +- Log Analytics workspace connected to your Container Apps / App Services +- Dynatrace environment with MCP gateway access and API token +- GitHub repo with app source code +- All [CLI tools](../../README.md#prerequisites) installed (`./bin/install-prerequisites.sh --check`) + +## Quick Start + +### Step 1 — Generate agent config + +```bash +./bin/new-agent.sh --recipe law-dynatrace-httptrigger --non-interactive \ + --set agentName=contoso-sre \ + --set resourceGroup=rg-sre-contoso \ + --set location=eastus2 \ + --set lawId=/subscriptions//resourceGroups//providers/Microsoft.OperationalInsights/workspaces/ \ + --set dtTenant=abc12345 \ + --set dtToken=dt0c01.xxx \ + --set githubRepo=contoso/trading-app \ + --set targetRGs=rg-contoso-prod,rg-contoso-staging \ + -o contoso-sre/ +``` + +### Step 2 — Deploy + +| Backend | Command | +|---|---| +| Bicep | `./bin/deploy.sh contoso-sre/` | +| Terraform | `./bin/deploy-tf.sh contoso-sre/` | +| PowerShell | `./bin/ps/Deploy-Agent.ps1 -InputPath contoso-sre/` | + +### Step 3 — Set up the Dynatrace secret + +```bash +echo "DYNATRACE_BEARER_TOKEN=dt0c01.your-token-here" > contoso-sre/connectors.secrets.env +``` + +Then redeploy or run `./bin/deploy.sh contoso-sre/` to apply. 
+ +### Step 4 — Wire up GitHub PR workflow + +Copy the sample workflow to your app repo: + +```bash +cp contoso-sre/docs/sample-github-workflow.yml \ + /path/to/your-app/.github/workflows/sre-agent-pr-guard.yml +``` + +Add the webhook URL as a GitHub secret: + +```bash +# Get the Logic App trigger URL from the agent's webhook bridge +WEBHOOK_URL=$(az resource show \ + --resource-group rg-sre-contoso \ + --resource-type Microsoft.Logic/workflows \ + --name \ + --query "properties.accessEndpoint" -o tsv) + +gh secret set SRE_AGENT_WEBHOOK_URL --repo contoso/trading-app --body "$WEBHOOK_URL" +``` + +### Step 5 — Test it + +Open a PR on your app repo — the GitHub workflow sends the PR event to the agent, which triggers the deployment guard. The agent will: + +1. Read the PR diff +2. Capture production baseline metrics from Dynatrace + LAW +3. Deploy changes to staging +4. Send synthetic canary traffic +5. Compare staging health against production +6. Post a risk assessment comment on the PR + +## Parameters + +| Param | Required | Example | Description | +|---|---|---|---| +| agentName | ✅ | `contoso-sre` | Agent name (lowercase, hyphens) | +| resourceGroup | ✅ | `rg-sre-contoso` | Resource group for the agent | +| location | ✅ | `eastus2` | Azure region | +| targetRGs | ✅ | `rg-contoso-prod,rg-contoso-staging` | Resource groups the agent monitors | +| lawId | ✅ | `/subscriptions/.../workspaces/...` | Log Analytics workspace resource ID | +| dtTenant | ✅ | `abc12345` | Dynatrace tenant ID | +| dtToken | ✅ | `dt0c01.xxx` | Dynatrace API token (stored as secret) | +| githubRepo | ✅ | `contoso/trading-app` | GitHub org/repo | +| modelProvider | | `Anthropic` | AI model provider (Anthropic or Azure OpenAI) | + +## What You Get + +| Category | Items | +|---|---| +| **Connectors** | Log Analytics, Dynatrace MCP | +| **Skills** | deployment-guard-analysis, investigate-app-errors | +| **Subagents** | deployment-guard, error-investigator | +| **HTTP Trigger** | 
pr-deployment-guard (receives GitHub PR webhooks) | +| **Hooks** | deny-prod-deletes, require-approval-for-restarts | +| **Common Prompts** | investigation-guidelines, safety-rules | +| **GitHub Repo** | Connected for diff analysis and PR comments | + +## Architecture + +``` +GitHub PR → GitHub Actions workflow → Logic App webhook bridge → SRE Agent HTTP trigger + ↓ + deployment-guard subagent + ↓ + ┌───────────────┼───────────────┐ + ↓ ↓ ↓ + Read PR diff Deploy to staging Query Dynatrace + ↓ + LAW baselines + Canary traffic + ↓ + Compare health + ↓ + Post PR comment with + risk assessment +``` diff --git a/sreagent-templates/agents/dg-azd-bash/agent.json b/sreagent-templates/agents/dg-azd-bash/agent.json new file mode 100644 index 000000000..aadbf19ca --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/agent.json @@ -0,0 +1,27 @@ +{ + "_scenario": "law-dynatrace-httptrigger", + "identity": { + "agentName": "dg-azd-bash", + "resourceGroup": "rg-dg-azd-bash", + "subscription": "cbf44432-7f45-4906-a85d-d2b14a1e8328", + "location": "swedencentral", + "targetResourceGroups": [ + "rg-contoso-prod", + "rg-contoso-staging" + ] + }, + "access": { + "accessLevel": "High", + "actionMode": "Review" + }, + "upgradeChannel": "Preview", + "defaultModelProvider": "Anthropic", + "monthlyAgentUnitLimit": 10000, + "tags": {}, + "toggles": { + "enableWebhookBridge": true, + "webhookBridgeTriggerUrl": "" + }, + "existingUamiId": "", + "existingAgentAppInsightsId": "" +} diff --git a/sreagent-templates/agents/dg-azd-bash/automations/http-triggers/pr-deployment-guard.yaml b/sreagent-templates/agents/dg-azd-bash/automations/http-triggers/pr-deployment-guard.yaml new file mode 100644 index 000000000..df617cdfb --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/automations/http-triggers/pr-deployment-guard.yaml @@ -0,0 +1,10 @@ +name: pr-deployment-guard +spec: + description: Receives PR webhooks from GitHub and triggers the deployment guard + to analyze changes for production 
safety. + prompt: A PR webhook has been received from the connected GitHub repo. Use the deployment-guard-analysis + skill to read the PR diff, deploy changes to the staging environment, monitor + health for 5 minutes comparing against production, then post a risk assessment + comment on the PR. + handlingAgent: deployment-guard + agentMode: autonomous diff --git a/sreagent-templates/agents/dg-azd-bash/config/common-prompts/investigation-guidelines.yaml b/sreagent-templates/agents/dg-azd-bash/config/common-prompts/investigation-guidelines.yaml new file mode 100644 index 000000000..d7c1b4b8d --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/config/common-prompts/investigation-guidelines.yaml @@ -0,0 +1,15 @@ +metadata: + name: investigation-guidelines +spec: + prompt: '## Investigation guidelines + + + - Always check the last 3 deployments for correlation + + - Include timestamp, affected resource, and severity in all summaries + + - Never take destructive actions without explicit approval + + - Prefer read-only investigation before recommending changes + + - Always provide an impact assessment (users affected, blast radius)' diff --git a/sreagent-templates/agents/dg-azd-bash/config/common-prompts/safety-rules.yaml b/sreagent-templates/agents/dg-azd-bash/config/common-prompts/safety-rules.yaml new file mode 100644 index 000000000..efa6dd631 --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/config/common-prompts/safety-rules.yaml @@ -0,0 +1,15 @@ +metadata: + name: safety-rules +spec: + prompt: '## Safety rules + + + - Never delete resources in production without explicit approval + + - Always prefer read-only investigation before taking action + + - Escalate to human if confidence is below 80% + + - Do not modify network security groups or firewall rules + + - Do not access or display secrets, keys, or connection strings' diff --git a/sreagent-templates/agents/dg-azd-bash/config/hooks/deny-prod-deletes.yaml 
b/sreagent-templates/agents/dg-azd-bash/config/hooks/deny-prod-deletes.yaml new file mode 100644 index 000000000..4545f0aae --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/config/hooks/deny-prod-deletes.yaml @@ -0,0 +1,11 @@ +metadata: + name: deny-prod-deletes +spec: + eventType: PreToolUse + hook: + type: prompt + prompt: If the tool targets a production resource (name contains 'prod' or 'prd'), + deny the action. Otherwise allow. + matcher: ^(delete_|remove_).* + permissionDecision: deny + enabled: true diff --git a/sreagent-templates/agents/dg-azd-bash/config/hooks/require-approval-for-restarts.yaml b/sreagent-templates/agents/dg-azd-bash/config/hooks/require-approval-for-restarts.yaml new file mode 100644 index 000000000..3eae406c9 --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/config/hooks/require-approval-for-restarts.yaml @@ -0,0 +1,11 @@ +metadata: + name: require-approval-for-restarts +spec: + eventType: PreToolUse + hook: + type: prompt + prompt: If this action will restart or scale a resource, require human approval + before proceeding. + matcher: ^(restart_|scale_).* + permissionDecision: allow + enabled: true diff --git a/sreagent-templates/agents/dg-azd-bash/config/repos/github-repo.yaml b/sreagent-templates/agents/dg-azd-bash/config/repos/github-repo.yaml new file mode 100644 index 000000000..a1f2f5bf5 --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/config/repos/github-repo.yaml @@ -0,0 +1,5 @@ +name: github-repo +spec: + url: "dm-chelupati/contoso-trading" + branch: main + description: Connected GitHub repository diff --git a/sreagent-templates/agents/dg-azd-bash/config/skills/deployment-guard-analysis.md b/sreagent-templates/agents/dg-azd-bash/config/skills/deployment-guard-analysis.md new file mode 100644 index 000000000..d072c4bb8 --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/config/skills/deployment-guard-analysis.md @@ -0,0 +1,23 @@ +You are a deployment guard. 
When triggered by a PR webhook, you assess if the change is safe for production. + +Step 1: Read the PR diff from the connected GitHub repo. Identify what changed — app code, IaC, config, DB schema, dependencies. + +Step 2: Static analysis — check for breaking changes: API contract changes, removed endpoints, changed DB schemas, renamed env vars, missing error handling. + +Step 3: Capture production baseline. Use Dynatrace DQL to query current error rates, latency p50/p95/p99, throughput. Use az CLI to check ContainerAppConsoleLogs_CL in LAW. Also capture baseline API responses by sending test requests to production endpoints and recording the response structure, status codes, and key data fields. + +Step 4: Deploy the PR changes to the STAGING environment using az containerapp update. This is a separate environment from production — deploy the new image there. + +Step 5: Send synthetic test traffic to the staging services to exercise the code paths affected by the PR. Use ExecutePythonCode to send HTTP requests to the staging endpoints (e.g. GET /orders, POST /orders, GET /health) for 2-3 minutes. This is canary testing — you need real traffic to surface regressions like timeouts, 500s, or latency spikes. + +Step 6: Validate response correctness — compare staging API responses against the production baseline captured in Step 3. Look for any differences in response bodies, status codes, data fields, or behavior. The app may return 200 OK but serve degraded or incorrect data. + +Step 7: Monitor staging health for 5 minutes. Query Dynatrace and LAW for the staging services. Compare all metrics and response patterns against the production baseline. Use PlotAreaChartWithCorrelation to visualize. + +Step 8: Risk assessment — LOW (no functional or performance changes), MEDIUM (minor changes), HIGH (behavioral or performance regression detected), CRITICAL (staging failing or data integrity compromised). 
+ +Step 9: Post a structured PR comment with: risk level, changes analyzed, static analysis findings, canary test results, any behavioral regressions found, health comparison table (prod baseline vs staging), and recommendation. + +Tools to use: RunAzCliReadCommands, RunAzCliWriteCommands, ExecutePythonCode, PlotAreaChartWithCorrelation, PlotBarChart, CreateGithubIssue, FindConnectedGitHubRepo, and all dynatrace MCP tools. + +# Updated by e2e test diff --git a/sreagent-templates/agents/dg-azd-bash/config/skills/deployment-guard-analysis.yaml b/sreagent-templates/agents/dg-azd-bash/config/skills/deployment-guard-analysis.yaml new file mode 100644 index 000000000..a4a551772 --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/config/skills/deployment-guard-analysis.yaml @@ -0,0 +1,17 @@ +metadata: + name: deployment-guard-analysis + description: Deployment guard that assesses PR safety for production by analyzing + diffs, capturing baselines, deploying to staging, running canary tests, validating + response correctness, and comparing health metrics. + spec: + tools: + - RunAzCliReadCommands + - RunAzCliWriteCommands + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - CreateGithubIssue + - FindConnectedGitHubRepo + - QueryLogAnalyticsByWorkspaceId +skillContent: skills/deployment-guard-analysis.md +additionalFiles: [] diff --git a/sreagent-templates/agents/dg-azd-bash/config/skills/investigate-app-errors.md b/sreagent-templates/agents/dg-azd-bash/config/skills/investigate-app-errors.md new file mode 100644 index 000000000..508a81608 --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/config/skills/investigate-app-errors.md @@ -0,0 +1,20 @@ +You are an application error investigator. When errors are reported, follow this workflow: + +1. **Identify the error**: Get the error details — HTTP status codes, exception types, affected endpoints, timestamps. + +2. 
**Check recent deployments**: Use az CLI to list recent Container App revisions or deployments. Correlate error start time with deployment timestamps. + +3. **Query Dynatrace**: Use DQL to query error rates, response times, and throughput for the affected services. Look for anomalies that started around the same time. + +4. **Query Log Analytics**: Check ContainerAppConsoleLogs_CL and ContainerAppSystemLogs_CL for exceptions, crash loops, or OOM kills. + +5. **Check dependencies**: Query Dynatrace for dependency health — databases, external APIs, message queues. An upstream failure may be the root cause. + +6. **Correlate findings**: Build a timeline of events — deployment, config change, traffic spike, dependency failure — and identify the most likely root cause. + +7. **Recommend fix**: Provide actionable recommendations — rollback, config change, scaling, or code fix with the specific file/line if the GitHub repo is connected. + +Always include: +- Impact assessment (users affected, error rate, duration) +- Root cause confidence level +- Recommended action with rollback option diff --git a/sreagent-templates/agents/dg-azd-bash/config/skills/investigate-app-errors.yaml b/sreagent-templates/agents/dg-azd-bash/config/skills/investigate-app-errors.yaml new file mode 100644 index 000000000..669dddcb8 --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/config/skills/investigate-app-errors.yaml @@ -0,0 +1,16 @@ +metadata: + name: investigate-app-errors + description: Investigate application errors using Dynatrace DQL and Log Analytics + to correlate errors with deployments, infrastructure changes, and dependencies. 
+ spec: + tools: + - RunAzCliReadCommands + - QueryLogAnalyticsByWorkspaceId + - dynatrace_execute-dql + - dynatrace_create-dql + - dynatrace_query-problems + - dynatrace_get-entity-id + - ExecutePythonCode + - PlotAreaChartWithCorrelation +skillContent: skills/investigate-app-errors.md +additionalFiles: [] diff --git a/sreagent-templates/agents/dg-azd-bash/config/subagents/deployment-guard.instructions.md b/sreagent-templates/agents/dg-azd-bash/config/subagents/deployment-guard.instructions.md new file mode 100644 index 000000000..28019290a --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/config/subagents/deployment-guard.instructions.md @@ -0,0 +1 @@ +You are the best engineer who guards production deployments operating in autonomous mode. Use the deployment-guard-analysis skill to assess PRs for production safety. Follow the full 9-step workflow: analyze the PR diff, perform static analysis, capture production baselines from Dynatrace and LAW, deploy to staging, send synthetic canary traffic, validate response correctness, monitor staging health, assess risk, and post a structured PR comment with your findings. 
diff --git a/sreagent-templates/agents/dg-azd-bash/config/subagents/deployment-guard.yaml b/sreagent-templates/agents/dg-azd-bash/config/subagents/deployment-guard.yaml new file mode 100644 index 000000000..99ead646b --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/config/subagents/deployment-guard.yaml @@ -0,0 +1,31 @@ +metadata: + name: deployment-guard +spec: + instructions: subagents/deployment-guard.instructions.md + handoffDescription: Analyzes PRs by deploying to staging, comparing health against + production via Dynatrace + LAW, and posting risk assessment as a PR comment + handoffs: [] + tools: + - RunAzCliReadCommands + - RunAzCliWriteCommands + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - CreateGithubIssue + - FindConnectedGitHubRepo + - QueryLogAnalyticsByWorkspaceId + - dynatrace_adaptive-anomaly-detector + - dynatrace_create-dql + - dynatrace_execute-dql + - dynatrace_explain-dql + - dynatrace_get-entity-id + - dynatrace_get-entity-name + - dynatrace_query-problems + - dynatrace_seasonal-baseline-anomaly-detector + - dynatrace_static-threshold-analyzer + - dynatrace_timeseries-forecast + - dynatrace_timeseries-novelty-detection + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/agents/dg-azd-bash/config/subagents/error-investigator.instructions.md b/sreagent-templates/agents/dg-azd-bash/config/subagents/error-investigator.instructions.md new file mode 100644 index 000000000..41b3c7a46 --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/config/subagents/error-investigator.instructions.md @@ -0,0 +1 @@ +You are an application error investigator. When errors are reported, use the investigate-app-errors skill to systematically diagnose the issue. Correlate Dynatrace metrics with Log Analytics data and deployment history. Always provide impact assessment and actionable recommendations with rollback options. 
diff --git a/sreagent-templates/agents/dg-azd-bash/config/subagents/error-investigator.yaml b/sreagent-templates/agents/dg-azd-bash/config/subagents/error-investigator.yaml new file mode 100644 index 000000000..61fb2f3a4 --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/config/subagents/error-investigator.yaml @@ -0,0 +1,23 @@ +metadata: + name: error-investigator +spec: + instructions: subagents/error-investigator.instructions.md + handoffDescription: Investigates application errors by correlating Dynatrace metrics, + LAW logs, and deployment history to identify root cause + handoffs: [] + tools: + - RunAzCliReadCommands + - QueryLogAnalyticsByWorkspaceId + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - FindConnectedGitHubRepo + - dynatrace_execute-dql + - dynatrace_create-dql + - dynatrace_query-problems + - dynatrace_get-entity-id + - dynatrace_get-entity-name + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/agents/dg-azd-bash/connectors.json b/sreagent-templates/agents/dg-azd-bash/connectors.json new file mode 100644 index 000000000..2cd878182 --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/connectors.json @@ -0,0 +1,30 @@ +{ + "toggles": { + "enableAppInsightsConnector": false, + "appInsightsResourceId": "", + "appInsightsAppId": "", + "enableLogAnalyticsConnector": true, + "lawResourceId": "/subscriptions/cbf44432-7f45-4906-a85d-d2b14a1e8328/resourceGroups/rg-contoso-swe/providers/Microsoft.OperationalInsights/workspaces/law-7defkiyvn3r44", + "enableAzureMonitorConnector": false, + "azureMonitorLookbackDays": 7, + "grafanaUrl": "", + "grafanaApiKey": "" + }, + "connectors": [ + { + "name": "dynatrace", + "properties": { + "dataConnectorType": "Mcp", + "dataSource": "placeholder", + "extendedProperties": { + "type": "http", + "endpoint": "https://dhu66396.apps.dynatrace.com/platform-reserved/mcp-gateway/v0.1/servers/dynatrace-mcp/mcp", 
+ "authType": "BearerToken", + "bearerToken": "${DYNATRACE_BEARER_TOKEN}", + "partnerType": "DynatraceMcp" + }, + "identity": "system" + } + } + ] +} diff --git a/sreagent-templates/agents/dg-azd-bash/data/knowledge/.gitkeep b/sreagent-templates/agents/dg-azd-bash/data/knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/agents/dg-azd-bash/data/synthesized-knowledge/.gitkeep b/sreagent-templates/agents/dg-azd-bash/data/synthesized-knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/agents/dg-azd-bash/docs/sample-github-workflow.yml b/sreagent-templates/agents/dg-azd-bash/docs/sample-github-workflow.yml new file mode 100644 index 000000000..82883626a --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/docs/sample-github-workflow.yml @@ -0,0 +1,40 @@ +# Sample GitHub Actions workflow for your application repo. +# This sends PR events to the SRE Agent via the Logic App webhook bridge, +# which triggers the deployment-guard-analysis skill. +# +# Setup: +# 1. Copy this file to your app repo: .github/workflows/sre-agent-pr-guard.yml +# 2. Add a repo secret SRE_AGENT_WEBHOOK_URL with the Logic App trigger URL +# (find it in the Azure portal under the Logic App's trigger settings, +# or run: az resource show ... 
to get the callback URL) + +name: SRE Agent — PR Deployment Guard + +on: + pull_request: + types: [opened, synchronize, reopened] + +jobs: + notify-sre-agent: + runs-on: ubuntu-latest + steps: + - name: Trigger SRE Agent via webhook bridge + env: + WEBHOOK_URL: ${{ secrets.SRE_AGENT_WEBHOOK_URL }} + run: | + set -euo pipefail # payload is built by jq from the event file; PR fields are never pasted into this script + jq -c '{ + "event": "pull_request", + "action": .action, + "pr_number": .pull_request.number, + "pr_title": .pull_request.title, + "pr_url": .pull_request.html_url, + "pr_diff_url": .pull_request.diff_url, + "pr_author": .pull_request.user.login, + "repo": .repository.full_name, + "head_ref": .pull_request.head.ref, + "base_ref": .pull_request.base.ref, + "head_sha": .pull_request.head.sha + }' "$GITHUB_EVENT_PATH" \ + | curl -fsS -X POST "$WEBHOOK_URL" -H "Content-Type: application/json" -d @- + echo "Webhook sent to SRE Agent" diff --git a/sreagent-templates/agents/dg-azd-bash/expected-config.json b/sreagent-templates/agents/dg-azd-bash/expected-config.json new file mode 100644 index 000000000..f1a36a4a0 --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/expected-config.json @@ -0,0 +1,47 @@ +{ + "_description": "Expected configuration for law-dynatrace-httptrigger recipe.
Used by verify-agent.sh to validate deployments.", + "_scenario": "law-dynatrace-httptrigger", + + "agent": { + "accessLevel": "High", + "actionMode": "Review", + "upgradeChannel": "Preview", + "defaultModelProvider": "Anthropic", + "incidentPlatform": "None" + }, + + "connectors": [ + { "name": "log-analytics", "type": "LogAnalytics" }, + { "name": "dynatrace", "type": "Mcp" } + ], + + "skills": [ + "deployment-guard-analysis", + "investigate-app-errors" + ], + + "subagents": [ + "deployment-guard", + "error-investigator" + ], + + "hooks": [ + "deny-prod-deletes", + "require-approval-for-restarts" + ], + + "commonPrompts": [ + "investigation-guidelines", + "safety-rules" + ], + + "scheduledTasks": [], + + "responsePlans": [], + + "httpTriggers": [ + { "name": "pr-deployment-guard", "handlingAgent": "deployment-guard" } + ], + + "repos": [] +} diff --git a/sreagent-templates/agents/dg-azd-bash/roles.yaml b/sreagent-templates/agents/dg-azd-bash/roles.yaml new file mode 100644 index 000000000..9ec1aa266 --- /dev/null +++ b/sreagent-templates/agents/dg-azd-bash/roles.yaml @@ -0,0 +1,19 @@ +# Required roles/credentials for the law-dynatrace-httptrigger recipe. +# deploy.sh processes this after the UAMI is created. + +roles: + # GitHub repos — prints OAuth URL or uses GITHUB_PAT env var + - name: GitHub OAuth + type: manual + instructions: | + To connect GitHub repos, either: + 1. Set GITHUB_PAT env var before deploy: export GITHUB_PAT=ghp_xxx + 2. 
Or after deploy, open the OAuth URL printed by apply-extras.sh + + # Dynatrace MCP — requires bearer token in connectors.secrets.env + - name: Dynatrace MCP + type: manual + instructions: | + Create a Dynatrace API token with scopes: entities.read, events.read, metrics.read, problems.read + Save it in connectors.secrets.env: + DYNATRACE_BEARER_TOKEN=dt0c01.xxx diff --git a/sreagent-templates/bicep/agent-core.bicep b/sreagent-templates/bicep/agent-core.bicep index d5a93a1c0..0ca684525 100644 --- a/sreagent-templates/bicep/agent-core.bicep +++ b/sreagent-templates/bicep/agent-core.bicep @@ -24,6 +24,31 @@ param existingManagedIdentityId string = '' @description('Optional. Resource ID of an existing Application Insights for agent telemetry. If provided, skips creating a new one.') param existingAgentAppInsightsId string = '' +@description('Optional. Skip all role assignments. Set to true when RBAC is pre-configured or on redeploy to avoid RoleAssignmentExists errors.') +param skipRoleAssignments bool = false + +@description('Optional. Full ARM resource ID of a delegated subnet (Microsoft.App/environments) for VNet integration. Leave empty for no VNet.') +param vnetSubnetId string = '' + +@description('Optional. Sandbox egress mode: Unrestricted (default), Limited, or AzureVNet.') +@allowed(['Unrestricted', 'Limited', 'AzureVNet']) +param egressMode string = 'Unrestricted' + +@description('Optional. Additional hosts the sandbox may reach (e.g. *.contoso.com). Only used in Limited/AzureVNet modes.') +param allowedHosts array = [] + +@description('Optional. Registry catalog IDs (pypi, npmjs, nuget-org) whose hosts are allowed. Only used in Limited/AzureVNet modes.') +param allowedRegistries array = [] + +@description('Optional. Code-repo providers (Github, AzureDevOps) whose hosts are allowed. Only used in Limited/AzureVNet modes.') +param allowedCodeRepositories array = [] + +@description('Optional. 
Allow remote HTTP MCP server endpoints in sandbox egress.') +param allowHttpMcpServerNetworkAccess bool = true + +@description('Optional. Use VNet private DNS resolver instead of platform default. Only for AzureVNet mode.') +param usePrivateDnsResolution bool = false + // ── Observability ── resource law 'Microsoft.OperationalInsights/workspaces@2023-09-01' = { @@ -73,7 +98,7 @@ var effectivePrincipalId = empty(existingManagedIdentityId) ? identity.propertie // ── RBAC on target resource groups ── -module targetRbac 'role-assignments-target.bicep' = [for (rg, i) in targetResourceGroups: { +module targetRbac 'role-assignments-target.bicep' = [for (rg, i) in targetResourceGroups: if (!skipRoleAssignments && empty(existingManagedIdentityId)) { name: 'rbac-${i}-${uniqueString(deployment().name)}' scope: resourceGroup(subscriptionId, rg) params: { @@ -84,7 +109,7 @@ module targetRbac 'role-assignments-target.bicep' = [for (rg, i) in targetResour // ── Monitoring Reader on deployment RG ── -resource monitoringReader 'Microsoft.Authorization/roleAssignments@2022-04-01' = { +resource monitoringReader 'Microsoft.Authorization/roleAssignments@2022-04-01' = if (!skipRoleAssignments && empty(existingManagedIdentityId)) { name: guid(resourceGroup().id, effectiveIdentityId, '43d0d8ad-25c7-4714-9337-8ba259a9fe05') properties: { roleDefinitionId: resourceId('Microsoft.Authorization/roleDefinitions', '43d0d8ad-25c7-4714-9337-8ba259a9fe05') @@ -132,6 +157,21 @@ resource sreAgent 'Microsoft.App/agents@2025-05-01-preview' = { EnableHttpTriggers: true EnableV2AgentLoop: true } + vnetConfiguration: !empty(vnetSubnetId) ? { + subnetResourceId: vnetSubnetId + } : null + sandboxConfiguration: egressMode != 'Unrestricted' ? { + egress: { + mode: egressMode + allowedHosts: allowedHosts + allowedRegistries: allowedRegistries + allowedCodeRepositories: allowedCodeRepositories + allowHttpMcpServerNetworkAccess: allowHttpMcpServerNetworkAccess + vnetConfiguration: egressMode == 'AzureVNet' ? 
{ + usePrivateDnsResolution: usePrivateDnsResolution + } : null + } + } : null } dependsOn: [ targetRbac, monitoringReader ] } @@ -151,7 +191,7 @@ module targetRbacSystemMi 'role-assignments-target.bicep' = [for (rg, i) in targ // ── SRE Agent Administrator for deployer ── -resource adminRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { +resource adminRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = if (!skipRoleAssignments) { name: guid(sreAgent.id, deployer().objectId, 'e79298df-d852-4c6d-84f9-5d13249d1e55') scope: sreAgent properties: { @@ -163,7 +203,7 @@ resource adminRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { // ── SRE Agent Administrator for UAMI (needed for Logic App webhook bridge to call HTTP triggers) ── -resource uamiAdminRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = { +resource uamiAdminRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = if (!skipRoleAssignments) { name: guid(sreAgent.id, effectiveIdentityId, 'e79298df-d852-4c6d-84f9-5d13249d1e55') scope: sreAgent properties: { diff --git a/sreagent-templates/bicep/apply-extras.sh b/sreagent-templates/bicep/apply-extras.sh index 9093ebae5..f3f089ef3 100755 --- a/sreagent-templates/bicep/apply-extras.sh +++ b/sreagent-templates/bicep/apply-extras.sh @@ -231,13 +231,41 @@ dataplane_put_extended() { body=$(jq -nc --arg n "$name" --arg t "$type" --argjson tags "$tags_json" --argjson props "$props_json" \ '{name:$n, type:$t, tags:$tags, properties:$props}') url="${AGENT_ENDPOINT}/api/v2/extendedAgent/${kind}/$(printf %s "$name" | jq -sRr @uri)" - if curl -sS -f -X PUT "$url" \ + # Try PUT first. If the resource exists and the API doesn't update immutable + # fields (e.g. hook type), fall back to DELETE + PUT. 
+ local put_result existing_type desired_type + put_result=$(curl -sS -w "\n%{http_code}" -X PUT "$url" \ -H "Authorization: Bearer ${TOKEN}" \ -H "Content-Type: application/json" \ - --data "$body" >/dev/null; then + --data "$body" 2>&1) + local http_code + http_code=$(echo "$put_result" | tail -1) + if [[ "$http_code" =~ ^2 ]]; then + # PUT succeeded — verify the update actually took effect for hooks + if [[ "$kind" == "hooks" ]]; then + desired_type=$(echo "$props_json" | jq -r '.hook.type // empty') + if [[ -n "$desired_type" ]]; then + existing_type=$(curl -sS "$url" -H "Authorization: Bearer ${TOKEN}" 2>/dev/null \ + | jq -r '.properties.hook.type // empty') + if [[ -n "$existing_type" && "$existing_type" != "$desired_type" ]]; then + echo " ${kind}/${name}: type mismatch (want=${desired_type}, got=${existing_type}) — recreating" + curl -sS -X DELETE "$url" -H "Authorization: Bearer ${TOKEN}" >/dev/null 2>&1 + TOKEN=$(_dp_token) + if curl -sS -f -X PUT "$url" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H "Content-Type: application/json" \ + --data "$body" >/dev/null 2>&1; then + echo " ok ${kind}/${name} (recreated as ${desired_type})" + else + echo " FAILED — recreate ${kind}/${name}" + fi + return + fi + fi + fi echo " ok ${kind}/${name}" else - echo " FAILED — PUT ${kind}/${name}" + echo " FAILED — PUT ${kind}/${name} (HTTP ${http_code})" fi } @@ -270,21 +298,34 @@ if [[ "$count" -gt 0 ]]; then if [[ -n "$platform_type" ]]; then # Check for connectionKey (PagerDuty/ServiceNow need API key) conn_key=$(jq -r '.incidentPlatforms[0].spec.connectionKey // empty' "$FILE") + conn_url=$(jq -r '.incidentPlatforms[0].spec.connectionUrl // empty' "$FILE") echo " ARM PATCH → incidentManagementConfiguration.type=${platform_type}" - patch_body="" + conn_name=$(echo "$platform_type" | tr '[:upper:]' '[:lower:]') + _patch_file=$(mktemp) if [[ -n "$conn_key" ]]; then - 
patch_body="{\"properties\":{\"incidentManagementConfiguration\":{\"type\":\"${platform_type}\",\"connectionKey\":\"${conn_key}\",\"connectionName\":\"$(echo "$platform_type" | tr '[:upper:]' '[:lower:]')\"}}}" + jq -n \ + --arg pt "$platform_type" \ + --arg ck "$conn_key" \ + --arg cn "$conn_name" \ + --arg cu "$conn_url" \ + '{properties:{incidentManagementConfiguration:{type:$pt, connectionKey:$ck, connectionName:$cn, connectionUrl:$cu}}}' > "$_patch_file" else - patch_body="{\"properties\":{\"incidentManagementConfiguration\":{\"type\":\"${platform_type}\",\"connectionName\":\"$(echo "$platform_type" | tr '[:upper:]' '[:lower:]')\"}}}" + jq -n \ + --arg pt "$platform_type" \ + --arg cn "$conn_name" \ + --arg cu "$conn_url" \ + '{properties:{incidentManagementConfiguration:{type:$pt, connectionName:$cn, connectionUrl:$cu}}}' > "$_patch_file" fi if az rest --method PATCH \ --url "${ARM_BASE}?api-version=${API_VERSION}" \ - --body "$patch_body" \ + --headers "Content-Type=application/json" \ + --body @"$_patch_file" \ --output none 2>&1; then echo " ok" else echo " FAILED — could not set incident platform" fi + rm -f "$_patch_file" # Wait for platform to initialize echo " Waiting 30s for platform to initialize..." sleep 30 @@ -372,15 +413,37 @@ if [[ "$count" -gt 0 ]]; then fi # 2. 
repos — data-plane only (requires azuresre.dev token) +# Split repos into two buckets: +# - byoapp_repos: domain has a GitHubApp entry in githubDomains → push directly after githubDomains are applied +# - oauth_repos: domain uses OAuth/PAT → pushed in the OAuth sign-in block (step 5) count=$(jq '[.repos // [] | .[] | select(.spec.url // "" | length > 0)] | length' "$FILE") oauth_repos=() +byoapp_repos=() +# Build a set of domains that use GitHubApp auth (BYO App) +_byoapp_domains=$(jq -r '[.githubDomains // [] | .[] | select(.spec.authType == "GitHubApp") | .metadata.name // .name] | join("|")' "$FILE" 2>/dev/null) if [[ "$count" -gt 0 ]]; then if [[ "$DP_TOKEN_AVAILABLE" == "true" ]]; then for i in $(seq 0 $((count - 1))); do - name=$(jq -r --argjson i "$i" '[.repos[] | select(.spec.url // "" | length > 0)][$i].name' "$FILE") - oauth_repos+=("$name") - done - echo "repos: ${count} (will be wired up after GitHub sign-in below)" + name=$(jq -r --argjson i "$i" '[.repos[] | select(.spec.url // "" | length > 0)][$i].name' "$FILE") + rurl=$(jq -r --argjson i "$i" '[.repos[] | select(.spec.url // "" | length > 0)][$i].spec.url' "$FILE") + # Determine the domain: full URL → strip scheme and path via parameter expansion (portable; no GNU-only sed \?); short "org/repo" → github.com + if [[ "$rurl" == http* ]]; then + rdomain=${rurl#*://}; rdomain=${rdomain%%/*} + else + rdomain="github.com" + fi + if [[ -n "$_byoapp_domains" ]] && echo "$rdomain" | grep -qE "^(${_byoapp_domains})$"; then + byoapp_repos+=("$name") + else + oauth_repos+=("$name") + fi + done + if [[ ${#byoapp_repos[@]} -gt 0 ]]; then + echo "repos: ${#byoapp_repos[@]} via BYO App (will be wired after githubDomains)" + fi + if [[ ${#oauth_repos[@]} -gt 0 ]]; then + echo "repos: ${#oauth_repos[@]} via OAuth (will be wired after GitHub sign-in below)" + fi else echo "repos: ${count} — ⚠ skipped (no data-plane token)" for i in $(seq 0 $((count - 1))); do @@ -577,6 +640,184 @@ if [[ "$count" -gt 0 ]]; then fi fi +# 4f-2.
toolPermissions — data-plane PUT /api/v2/agent/settings/global +# Body: { permissions: { allow: [...], ask: [...], deny: [...] } } +# Requires If-Match header (optimistic concurrency) — GET first to get etag. +# If no global-settings doc exists yet (bootstrap), use If-Match: * +tp_has=$(jq 'has("toolPermissions") and (.toolPermissions | length > 0)' "$FILE") +if [[ "$tp_has" == "true" ]]; then + if [[ "$DP_TOKEN_AVAILABLE" == "true" ]]; then + echo "toolPermissions: configuring" + TOKEN=$(_dp_token) + # GET current to capture etag. Only the response headers (-D-) are used, so the body is + # discarded rather than written to a predictable, never-read file under /tmp. + tp_resp=$(curl -sS -D- -o /dev/null "${AGENT_ENDPOINT}/api/v2/agent/settings/global" \ + -H "Authorization: Bearer ${TOKEN}" 2>/dev/null) + tp_etag=$(echo "$tp_resp" | grep -i '^etag:' | tr -d '\r' | awk '{print $2}' || true) + if [[ -z "$tp_etag" ]]; then + tp_etag="*" # bootstrap: no doc exists yet + fi + # Build body + tp_body=$(jq -c '{permissions: .toolPermissions}' "$FILE") + tp_result=$(curl -sS -w "\n%{http_code}" -X PUT "${AGENT_ENDPOINT}/api/v2/agent/settings/global" \ + -H "Authorization: Bearer ${TOKEN}" \ + -H "Content-Type: application/json" \ + -H "If-Match: ${tp_etag}" \ + --data "$tp_body" 2>&1) + tp_code=$(echo "$tp_result" | tail -1) + if [[ "$tp_code" =~ ^2 ]]; then + echo " ok toolPermissions" + else + echo " FAILED — PUT settings/global (HTTP ${tp_code})" + fi + else + echo "toolPermissions — ⚠ skipped (no data-plane token)" + DP_SKIPPED_ITEMS+=("toolPermissions") + fi +fi + +# 4f-3. githubDomains — data-plane PUT /api/v2/github/domains/{domain} +# Supports authType: Pat (github.com only) and GitHubApp (BYO App for GHE) +# Each entry: { metadata: { name: "github.com" }, spec: { authType, pat?, clientId?, privateKeySecretUri?, keyVaultManagedIdentityId?
#   } }

# Encode a GitHub domain for use as a URL path segment: dots become
# underscores (github.com → github_com).
_ghd_encode_domain() {
  printf '%s\n' "${1//./_}"
}

count=$(jq '.githubDomains // [] | length' "$FILE")
if [[ "$count" -gt 0 ]]; then
  if [[ "$DP_TOKEN_AVAILABLE" == "true" ]]; then
    echo "githubDomains: ${count}"
    for ((i = 0; i < count; i++)); do
      # Name may live under metadata.name or directly on the entry; an entry
      # with neither is reported and skipped rather than PUT to "domains/null".
      domain=$(jq -r --argjson i "$i" '.githubDomains[$i] | (.metadata.name // .name // "")' "$FILE")
      if [[ -z "$domain" ]]; then
        echo "  FAILED — githubDomains[${i}] has no metadata.name / name (skipped)"
        continue
      fi
      spec=$(jq -c --argjson i "$i" '.githubDomains[$i].spec // .githubDomains[$i]' "$FILE")
      # The spec is sent as-is: no env-var substitution happens in this section.
      # NOTE(review): presumably secrets (clientId, privateKeySecretUri) are
      # already resolved by whatever generated $FILE — confirm.
      auth_type=$(jq -r '.authType // "Pat"' <<<"$spec")
      # Encode domain for URL: github.com → github_com (dots to underscores)
      domain_encoded=$(_ghd_encode_domain "$domain")
      TOKEN=$(_dp_token)
      # -w appends the HTTP status on its own last line; `|| true` lets a
      # transport error be reported below instead of aborting the loop.
      ghd_result=$(curl -sS -w "\n%{http_code}" -X PUT \
        "${AGENT_ENDPOINT}/api/v2/github/domains/${domain_encoded}" \
        -H "Authorization: Bearer ${TOKEN}" \
        -H "Content-Type: application/json" \
        --data "$spec" 2>&1 || true)
      ghd_code=$(printf '%s\n' "$ghd_result" | tail -1)
      if [[ "$ghd_code" =~ ^2 ]]; then
        echo "  ok githubDomains/${domain} (${auth_type})"
      else
        echo "  FAILED — PUT github/domains/${domain_encoded} (HTTP ${ghd_code})"
        echo "  $(printf '%s\n' "$ghd_result" | sed '$d' | head -2)"
      fi
    done
  else
    echo "githubDomains: ${count} — ⚠ skipped (no data-plane token)"
    # Record every domain so the skipped items can be reported later.
    for ((i = 0; i < count; i++)); do
      gd=$(jq -r --argjson i "$i" '.githubDomains[$i].metadata.name // .githubDomains[$i].name' "$FILE")
      DP_SKIPPED_ITEMS+=("githubDomain/${gd}")
    done
  fi
fi

# 4f-3b. BYO App repos — push repos that use GitHubApp auth directly (no OAuth needed)
# These were identified in step 2 above. The githubDomains PUT above already configured
# the BYO App auth, so the agent can access repos using the app's installation token.
# Map a repo spec "type" (case-insensitive) to the provider name sent to the
# repos API: ado / azuredevops / azure-devops → AzureDevOps, anything else → GitHub.
_byo_provider_type() {
  case "$(printf %s "$1" | tr '[:upper:]' '[:lower:]')" in
    ado | azuredevops | azure-devops) printf '%s\n' "AzureDevOps" ;;
    *) printf '%s\n' "GitHub" ;;
  esac
}

# Expand a short "org/repo" reference to https://github.com/org/repo.
# Values that already start with "http" (or contain no slash) pass through.
_byo_full_repo_url() {
  local url=$1
  if [[ "$url" != http* && "$url" == */* ]]; then
    url="https://github.com/${url}"
  fi
  printf '%s\n' "$url"
}

if [[ ${#byoapp_repos[@]} -gt 0 && "$DP_TOKEN_AVAILABLE" == "true" ]]; then
  echo "byoapp repos: ${#byoapp_repos[@]}"
  TOKEN=$(_dp_token)
  for rname in "${byoapp_repos[@]}"; do
    # Read the repo's spec from $FILE once, then pick fields out of it.
    rspec=$(jq -c --arg n "$rname" '[.repos[] | select(.name == $n)][0].spec // {}' "$FILE")
    rurl=$(jq -r '.url' <<<"$rspec")
    rdesc=$(jq -r '.description // ""' <<<"$rspec")
    rtype=$(_byo_provider_type "$(jq -r '.type // "github"' <<<"$rspec")")
    # Normalize short "org/repo" to full URL
    rurl=$(_byo_full_repo_url "$rurl")
    # description is only included in the payload when it is non-empty
    rbody=$(jq -nc --arg n "$rname" --arg u "$rurl" --arg t "$rtype" --arg d "$rdesc" '{
      name: $n,
      type: "CodeRepo",
      properties: ({ url: $u, type: $t } + (if $d == "" then {} else { description: $d } end))
    }')
    # -w appends the HTTP status on its own last line so a failure can say
    # why; the repo name is URI-encoded for the path segment.
    repo_result=$(curl -sS -w "\n%{http_code}" -X PUT \
      "${AGENT_ENDPOINT}/api/v2/repos/$(printf %s "$rname" | jq -sRr @uri)" \
      -H "Authorization: Bearer ${TOKEN}" \
      -H "Content-Type: application/json" \
      --data "$rbody" 2>&1 || true)
    repo_code=$(printf '%s\n' "$repo_result" | tail -1)
    if [[ "$repo_code" =~ ^2 ]]; then
      echo "  ok repo/${rname} (${rurl}) [BYO App]"
    else
      echo "  FAILED — PUT /api/v2/repos/${rname} (try the portal Repos blade)"
      echo "  HTTP ${repo_code}: $(printf '%s\n' "$repo_result" | sed '$d' | head -2)"
    fi
  done
fi

# 4f-4. connectorV2 — data-plane multi-step setup via /api/v2/connectorV2
# Each entry: { metadata: { name }, spec: { apiName, displayName, connectionName?,
#   parameterValueSet?: { name, values }, requireApprovalTools?: [...]
#   } }
# Flow: 1) PUT connection  2) PUT mcpserver config  3) print a portal consent hint
#       when the connection response reports overallStatus Error / Unauthenticated

# Succeeds when a connection's overallStatus means it still has to be
# authorized in the portal.
_cv2_needs_consent() {
  [[ "$1" == "Error" || "$1" == "Unauthenticated" ]]
}

count=$(jq '.connectorV2 // [] | length' "$FILE")
if [[ "$count" -gt 0 ]]; then
  if [[ "$DP_TOKEN_AVAILABLE" == "true" ]]; then
    echo "connectorV2: ${count}"
    for ((i = 0; i < count; i++)); do
      cv2_name=$(jq -r --argjson i "$i" '.connectorV2[$i].metadata.name // .connectorV2[$i].name' "$FILE")
      cv2_spec=$(jq -c --argjson i "$i" '.connectorV2[$i].spec // .connectorV2[$i]' "$FILE")
      cv2_api=$(jq -r '.apiName // ""' <<<"$cv2_spec")
      # Without apiName every URL and body below would carry the string "null".
      if [[ -z "$cv2_api" ]]; then
        echo "  FAILED — connectorV2/${cv2_name} has no apiName (skipped)"
        continue
      fi
      cv2_display=$(jq -r '.displayName // .apiName' <<<"$cv2_spec")
      # Connection name defaults to apiName; either way it is lower-cased.
      cv2_conn=$(jq -r '(.connectionName // .apiName) | ascii_downcase' <<<"$cv2_spec")
      cv2_conn_path=$(printf %s "$cv2_conn" | jq -sRr @uri) # safe as a URL path segment
      cv2_pvs=$(jq -c '.parameterValueSet // null' <<<"$cv2_spec")
      cv2_pv=$(jq -c '.parameterValues // null' <<<"$cv2_spec")
      cv2_rat=$(jq -c '.requireApprovalTools // null' <<<"$cv2_spec")

      TOKEN=$(_dp_token)

      # Step 1: Create the connection (optional fields are only sent when present)
      conn_body=$(jq -nc --arg dn "$cv2_display" --arg cn "$cv2_api" \
        --argjson pvs "$cv2_pvs" --argjson pv "$cv2_pv" \
        '{displayName: $dn, connectorName: $cn}
         + (if $pvs != null then {parameterValueSet: $pvs} else {} end)
         + (if $pv != null then {parameterValues: $pv} else {} end)')
      conn_result=$(curl -sS -w "\n%{http_code}" -X PUT \
        "${AGENT_ENDPOINT}/api/v2/connectorV2/connections/${cv2_conn_path}" \
        -H "Authorization: Bearer ${TOKEN}" \
        -H "Content-Type: application/json" \
        --data "$conn_body" 2>&1 || true)
      conn_code=$(printf '%s\n' "$conn_result" | tail -1)
      if [[ "$conn_code" =~ ^2 ]]; then
        echo "  ok connectorV2/connection/${cv2_conn}"
      else
        echo "  WARN — PUT connection/${cv2_conn} (HTTP ${conn_code}) — may need OAuth consent in portal"
      fi

      # Step 2: Create MCP server config (links connection to MCP tools)
      mcp_body=$(jq -nc --arg desc "$cv2_display" --arg cn "$cv2_conn" --arg api "$cv2_api" \
        --argjson rat "$cv2_rat" \
        '{properties: {description: $desc, connectors: [{name: $api, connectionName: $cn}]}}
         + (if $rat != null then {runtimeMcpConfiguration: {requireApprovalTools: $rat}} else {} end)')
      mcp_result=$(curl -sS -w "\n%{http_code}" -X PUT \
        "${AGENT_ENDPOINT}/api/v2/connectorV2/mcpservers/${cv2_conn_path}" \
        -H "Authorization: Bearer ${TOKEN}" \
        -H "Content-Type: application/json" \
        --data "$mcp_body" 2>&1 || true)
      mcp_code=$(printf '%s\n' "$mcp_result" | tail -1)
      if [[ "$mcp_code" =~ ^2 ]]; then
        echo "  ok connectorV2/mcpserver/${cv2_conn}"
      else
        echo "  FAILED — PUT mcpservers/${cv2_conn} (HTTP ${mcp_code})"
        echo "  $(printf '%s\n' "$mcp_result" | sed '$d' | head -2)"
      fi

      # Step 3: Print consent link if connection needs OAuth.
      # The response may not be JSON (error page, curl message); `|| true`
      # keeps a jq parse failure from aborting the run.
      conn_status=$(printf '%s\n' "$conn_result" | sed '$d' |
        jq -r '.properties.overallStatus // "Unknown"' 2>/dev/null || true)
      if _cv2_needs_consent "$conn_status"; then
        echo "  ⚠ Connection ${cv2_conn} needs OAuth consent. Complete in the portal:"
        echo "    https://sre.azure.com → Connectors → ${cv2_display} → Authorize"
      fi
    done
  else
    echo "connectorV2: ${count} — ⚠ skipped (no data-plane token)"
    # Record every connector so the skipped items can be reported later.
    for ((i = 0; i < count; i++)); do
      cn=$(jq -r --argjson i "$i" '.connectorV2[$i].metadata.name // .connectorV2[$i].name' "$FILE")
      DP_SKIPPED_ITEMS+=("connectorV2/${cn}")
    done
  fi
fi

# 4g.
httpTriggers — data-plane only count=$(jq '.httpTriggers // [] | length' "$FILE") HTTP_TRIGGER_URL="" @@ -761,13 +1002,13 @@ if [[ "$DP_TOKEN_AVAILABLE" == "true" ]]; then if [[ -n "${GITHUB_PAT:-}" ]]; then echo "GitHub auth: installing PAT (no browser needed)" TOKEN=$(_dp_token) - if curl -sS -f -X POST "${AGENT_ENDPOINT}/api/v1/Github/auth/pat" \ + if curl -sS -f -X PUT "${AGENT_ENDPOINT}/api/v2/github/domains/github.com" \ -H "Authorization: Bearer ${TOKEN}" \ -H "Content-Type: application/json" \ - --data "{\"accessToken\":\"${GITHUB_PAT}\"}" >/dev/null; then + --data "{\"AuthType\":\"Pat\",\"Pat\":\"${GITHUB_PAT}\"}" >/dev/null; then echo " ok" else - echo " FAILED — POST /api/v1/Github/auth/pat" + echo " FAILED — PUT /api/v2/github/domains/github.com" fi elif [[ ${#oauth_repos[@]} -gt 0 ]]; then echo "GitHub auth: will use OAuth (browser sign-in) — see URL below" @@ -828,9 +1069,22 @@ echo # --------------------------------------------------------------------------- if [[ ${#oauth_repos[@]} -gt 0 ]]; then TOKEN=$(_dp_token 2>/dev/null || true) + # Check if OAuth is configured via domains endpoint GH_STATUS=$(curl -sS -H "Authorization: Bearer ${TOKEN}" \ - "${AGENT_ENDPOINT}/api/v1/Github/auth/status" 2>/dev/null || echo '{}') - GH_CONFIGURED=$(echo "$GH_STATUS" | jq -r '.isConfigured // .hosts[0].isConfigured // false') + "${AGENT_ENDPOINT}/api/v2/github/domains" 2>/dev/null || echo '{}') + if echo "$GH_STATUS" | jq empty 2>/dev/null; then + GH_CONFIGURED=$(echo "$GH_STATUS" | jq -r 'if (.values // []) | length > 0 then "true" else "false" end') + else + GH_CONFIGURED="false" + fi + # Also check if the github connector already exists (OAuth was done in a prior deploy) + if [[ "$GH_CONFIGURED" == "false" ]]; then + _connectors=$(curl -sS -H "Authorization: Bearer ${TOKEN}" -H "Accept: application/json" \ + "${AGENT_ENDPOINT}/api/v2/extendedAgent/connectors" 2>/dev/null || echo '{}') + if echo "$_connectors" | jq -e '[.value // [] | .[] | select(.name == 
"github")] | length > 0' >/dev/null 2>&1; then + GH_CONFIGURED="true" + fi + fi if [[ "$GH_CONFIGURED" == "true" || -n "${GITHUB_PAT:-}" ]]; then # ── OAuth (or PAT) is in place — wire the connector + repos ── @@ -901,9 +1155,11 @@ if [[ ${#oauth_repos[@]} -gt 0 ]]; then echo "Repos waiting: ${oauth_repos[*]}" OAUTH_URL="" if [[ -n "$TOKEN" ]]; then - OAUTH_URL=$(curl -sS -f -H "Authorization: Bearer ${TOKEN}" \ - "${AGENT_ENDPOINT}/api/v1/Github/config" 2>/dev/null \ - | jq -r '.oAuthUrl // .OAuthUrl // empty') + _gh_config=$(curl -sS -f -H "Authorization: Bearer ${TOKEN}" \ + "${AGENT_ENDPOINT}/api/v2/github/oauth/config" 2>/dev/null || echo '{}') + if echo "$_gh_config" | jq empty 2>/dev/null; then + OAUTH_URL=$(echo "$_gh_config" | jq -r '.oAuthUrl // .OAuthUrl // empty') + fi fi if [[ -n "${OAUTH_URL:-}" ]]; then echo " 1. Open this URL in a browser:" @@ -911,23 +1167,22 @@ if [[ ${#oauth_repos[@]} -gt 0 ]]; then echo " 2. Sign in to GitHub and approve the SRE Agent app." echo echo " Waiting for GitHub authorization (Ctrl-C to skip)..." 
- if [[ -z "$AGENT_UAMI" ]]; then IDENT="SystemAssigned"; else IDENT="$AGENT_UAMI"; fi - conn_body=$(jq -nc --arg id "$IDENT" '{name:"github",type:"AgentConnector",properties:{dataConnectorType:"GitHubOAuth",dataSource:"github-oauth",identity:$id}}') auth_ok=false for attempt in $(seq 1 24); do sleep 10 TOKEN=$(_dp_token 2>/dev/null || true) - # Check auth/status — only trust isConfigured, not connector PUT success - GH_CHECK=$(curl -sS -H "Authorization: Bearer ${TOKEN}" \ - "${AGENT_ENDPOINT}/api/v1/Github/auth/status" 2>/dev/null || echo '{}') - IS_AUTH=$(echo "$GH_CHECK" | jq -r '.isConfigured // .hosts[0].isConfigured // false') - if [[ "$IS_AUTH" == "true" ]]; then - # Auth confirmed — now create the connector + # Poll domains endpoint — non-empty means OAuth callback was received + _poll=$(curl -sS -H "Authorization: Bearer ${TOKEN}" \ + "${AGENT_ENDPOINT}/api/v2/github/domains" 2>/dev/null || echo '{}') + _has_domain=$(echo "$_poll" | jq -r 'if (.values // []) | length > 0 then "true" else "false" end' 2>/dev/null) + if [[ "$_has_domain" == "true" ]]; then + echo " GitHub authorized!" + # Now create the connector + if [[ -z "$AGENT_UAMI" ]]; then IDENT="SystemAssigned"; else IDENT="$AGENT_UAMI"; fi + conn_body=$(jq -nc --arg id "$IDENT" '{name:"github",type:"AgentConnector",properties:{dataConnectorType:"GitHubOAuth",dataSource:"github-oauth",identity:$id}}') curl -sS -f -X PUT "${AGENT_ENDPOINT}/api/v2/extendedAgent/connectors/github" \ -H "Authorization: Bearer ${TOKEN}" -H "Content-Type: application/json" \ - --data "$conn_body" >/dev/null 2>&1 || true - echo " GitHub authorized!" - echo " ok connector/github" + --data "$conn_body" >/dev/null 2>&1 && echo " ok connector/github" || echo " WARN connector/github PUT failed" auth_ok=true break fi @@ -959,7 +1214,7 @@ if [[ ${#oauth_repos[@]} -gt 0 ]]; then echo " Headless alternative: export GITHUB_PAT=ghp_xxx && re-run" fi else - echo " Could not fetch OAuth URL from ${AGENT_ENDPOINT}/api/v1/Github/config." 
+ echo " Could not fetch OAuth URL from ${AGENT_ENDPOINT}/api/v2/github/oauth/config." echo " Fallback: Azure portal → agent → Repos → 'Authorize' next to each repo." fi echo diff --git a/sreagent-templates/bicep/assemble-agent.sh b/sreagent-templates/bicep/assemble-agent.sh index f4221983d..0632fca92 100755 --- a/sreagent-templates/bicep/assemble-agent.sh +++ b/sreagent-templates/bicep/assemble-agent.sh @@ -171,6 +171,31 @@ TAGS=$(echo "$AGENT_JSON" | jq -c '.tags // {}') EXISTING_UAMI=$(echo "$AGENT_JSON" | jq -r '.existingUamiId // ""') EXISTING_AI=$(echo "$AGENT_JSON" | jq -r '.existingAgentAppInsightsId // ""') +# ── Network configuration ── +NET_TYPE=$(echo "$AGENT_JSON" | jq -r '.networkConfiguration.type // "unrestricted"' | tr '[:upper:]' '[:lower:]') +NET_SUBNET_ID=$(echo "$AGENT_JSON" | jq -r '.networkConfiguration.subnetId // ""') +NET_RG=$(echo "$AGENT_JSON" | jq -r '.networkConfiguration.resourceGroup // ""') +NET_VNET=$(echo "$AGENT_JSON" | jq -r '.networkConfiguration.vnetName // ""') +NET_SUBNET_NAME=$(echo "$AGENT_JSON" | jq -r '.networkConfiguration.subnetName // "agent-subnet"') +NET_SUBNET_PREFIX=$(echo "$AGENT_JSON" | jq -r '.networkConfiguration.subnetPrefix // "10.2.0.0/28"') +NET_ALLOWED_HOSTS=$(echo "$AGENT_JSON" | jq -c '.networkConfiguration.allowedHosts // []') +NET_ALLOWED_REGISTRIES=$(echo "$AGENT_JSON" | jq -c '.networkConfiguration.allowedRegistries // []') +NET_ALLOWED_CODE_REPOS=$(echo "$AGENT_JSON" | jq -c '.networkConfiguration.allowedCodeRepositories // []') +NET_ALLOW_MCP=$(echo "$AGENT_JSON" | jq -r '.networkConfiguration.allowHttpMcpServerNetworkAccess // true') +NET_PRIVATE_DNS=$(echo "$AGENT_JSON" | jq -r '.networkConfiguration.usePrivateDnsResolution // false') + +# Map type to Bicep egressMode +case "$NET_TYPE" in + vnet|azurevnet) EGRESS_MODE="AzureVNet" ;; + limited) EGRESS_MODE="Limited" ;; + *) EGRESS_MODE="Unrestricted" ;; +esac + +# If broken-out VNet fields provided (not subnetId), resolve to full subnet ID +if [[ 
-z "$NET_SUBNET_ID" && -n "$NET_VNET" && -n "$NET_RG" ]]; then + NET_SUBNET_ID="/subscriptions/${AGENT_SUB}/resourceGroups/${NET_RG}/providers/Microsoft.Network/virtualNetworks/${NET_VNET}/subnets/${NET_SUBNET_NAME}" +fi + _log "Agent: ${AGENT_NAME} (${AGENT_LOC}, ${AGENT_RG})" # ═══════ Read connectors.json ═══════ @@ -235,6 +260,21 @@ _log "incident-platforms: $(echo "$INCIDENT_PLATFORMS" | jq 'length')" REPOS=$(collect_config "repos") _log "repos: $(echo "$REPOS" | jq 'length')" +# Tool permissions — single file (not a collection) +TOOL_PERMISSIONS='{}' +if [[ -f "${DIR}/tool-permissions.json" ]]; then + TOOL_PERMISSIONS=$(cat "${DIR}/tool-permissions.json") + _log "tool-permissions: loaded" +fi + +# GitHub domains (BYO App / PAT auth) — config/github-domains/*.yaml +GITHUB_DOMAINS=$(resolve_env_vars "$(collect_config "github-domains")") +_log "github-domains: $(echo "$GITHUB_DOMAINS" | jq 'length')" + +# ConnectorV2 (Jira, Slack, etc.) — config/connectorv2/*.yaml +CONNECTORV2=$(collect_config "connectorv2") +_log "connectorv2: $(echo "$CONNECTORV2" | jq 'length')" + MARKETPLACES="[]" [[ -d "${DIR}/config/plugins/marketplaces" ]] && MARKETPLACES=$(collect_config "plugins/marketplaces") INSTALLATIONS="[]" @@ -293,6 +333,13 @@ jq -n \ --argjson tags "$TAGS" \ --arg existingUami "$EXISTING_UAMI" \ --arg existingAi "$EXISTING_AI" \ + --arg vnetSubnetId "$NET_SUBNET_ID" \ + --arg egressMode "$EGRESS_MODE" \ + --argjson allowedHosts "$NET_ALLOWED_HOSTS" \ + --argjson allowedRegistries "$NET_ALLOWED_REGISTRIES" \ + --argjson allowedCodeRepositories "$NET_ALLOWED_CODE_REPOS" \ + --argjson allowMcp "$NET_ALLOW_MCP" \ + --argjson privateDns "$NET_PRIVATE_DNS" \ --argjson targetRgs "$TARGET_RGS" \ --argjson toggles "$TOGGLES" \ --argjson ctog "$CONNECTOR_TOGGLES" \ @@ -320,6 +367,13 @@ jq -n \ "tags": { "value": $tags }, "existingManagedIdentityId": { "value": $existingUami }, "existingAgentAppInsightsId": { "value": $existingAi }, + "vnetSubnetId": { "value": 
$vnetSubnetId }, + "egressMode": { "value": $egressMode }, + "allowedHosts": { "value": $allowedHosts }, + "allowedRegistries": { "value": $allowedRegistries }, + "allowedCodeRepositories": { "value": $allowedCodeRepositories }, + "allowHttpMcpServerNetworkAccess": { "value": $allowMcp }, + "usePrivateDnsResolution": { "value": $privateDns }, "enableAppInsightsConnector": { "value": ($ctog.enableAppInsightsConnector // false) }, "appInsightsResourceId": { "value": ($ctog.appInsightsResourceId // "") }, "appInsightsAppId": { "value": ($ctog.appInsightsAppId // "") }, @@ -369,6 +423,9 @@ jq -n \ --argjson subagents "$SUBAGENTS" \ --argjson tools "$TOOLS" \ --argjson pluginConfigs "$PLUGIN_CONFIGS" \ + --argjson toolPermissions "$TOOL_PERMISSIONS" \ + --argjson githubDomains "$GITHUB_DOMAINS" \ + --argjson connectorV2 "$CONNECTORV2" \ '{ "repos": $repos, "incidentPlatforms": $incidentPlatforms, @@ -390,7 +447,10 @@ jq -n \ "skills": $skills, "subagents": $subagents, "tools": $tools, - "pluginConfigs": [($pluginConfigs // [])[] | {name: (.metadata.name // .name), type: (.type // "Plugin"), tags: (.tags // []), properties: (.spec // .properties // {})}] + "pluginConfigs": [($pluginConfigs // [])[] | {name: (.metadata.name // .name), type: (.type // "Plugin"), tags: (.tags // []), properties: (.spec // .properties // {})}], + "toolPermissions": $toolPermissions, + "githubDomains": $githubDomains, + "connectorV2": $connectorV2 }' > "$EXTRAS_FILE" # Merge admin settings if present (adminUsers for cross-tenant access) diff --git a/sreagent-templates/bicep/main.bicep b/sreagent-templates/bicep/main.bicep index cc4b752aa..55c417f76 100644 --- a/sreagent-templates/bicep/main.bicep +++ b/sreagent-templates/bicep/main.bicep @@ -64,9 +64,33 @@ param tags object = {} @description('Optional. Resource ID of an existing UAMI. If provided, skips creating a new one.') param existingManagedIdentityId string = '' +@description('Optional. Skip all role assignments. 
Use on redeploy or when RBAC is pre-configured to avoid RoleAssignmentExists errors.') +param skipRoleAssignments bool = false + @description('Optional. Resource ID of an existing Application Insights for agent telemetry. If provided, skips creating a new one.') param existingAgentAppInsightsId string = '' +@description('Optional. Full ARM resource ID of a delegated subnet for VNet integration.') +param vnetSubnetId string = '' + +@description('Optional. Sandbox egress mode: Unrestricted (default), Limited, or AzureVNet.') +param egressMode string = 'Unrestricted' + +@description('Optional. Additional hosts the sandbox may reach.') +param allowedHosts array = [] + +@description('Optional. Registry catalog IDs (pypi, npmjs, nuget-org) to allow.') +param allowedRegistries array = [] + +@description('Optional. Code-repo providers (Github, AzureDevOps) to allow.') +param allowedCodeRepositories array = [] + +@description('Optional. Allow remote HTTP MCP server endpoints in sandbox egress.') +param allowHttpMcpServerNetworkAccess bool = true + +@description('Optional. Use VNet private DNS resolver. Only for AzureVNet mode.') +param usePrivateDnsResolution bool = false + // ═════════ FEATURE TOGGLES — common starter features ═════════ // Flip a toggle to true, fill the conditional strings below it. 
// Each toggle synthesizes a single connector / hook / prompt entry @@ -176,6 +200,14 @@ module core './agent-core.bicep' = { tags: tags existingManagedIdentityId: existingManagedIdentityId existingAgentAppInsightsId: existingAgentAppInsightsId + skipRoleAssignments: skipRoleAssignments + vnetSubnetId: vnetSubnetId + egressMode: egressMode + allowedHosts: allowedHosts + allowedRegistries: allowedRegistries + allowedCodeRepositories: allowedCodeRepositories + allowHttpMcpServerNetworkAccess: allowHttpMcpServerNetworkAccess + usePrivateDnsResolution: usePrivateDnsResolution } } diff --git a/sreagent-templates/bin/add-recipe.sh b/sreagent-templates/bin/add-recipe.sh new file mode 100755 index 000000000..d2dc6475f --- /dev/null +++ b/sreagent-templates/bin/add-recipe.sh @@ -0,0 +1,339 @@ +#!/usr/bin/env bash +# add-recipe.sh — Augment an existing agent with components from a recipe. +# +# Exports the live agent config, overlays recipe files, auto-detects values +# already configured (DT, LAW, GitHub repo, etc.), and produces a merged +# directory ready for deploy.sh. 
#
# Key behaviour:
#   - Does NOT overwrite agent.json identity/access/model — only merges toggles
#   - Does NOT duplicate connectors — skips if connector name already exists
#   - Does NOT re-ask for values the agent already has — auto-extracts from
#     existing connectors.json, config/repos/*.yaml, and connectors.secrets.env
#   - Only prompts for values the recipe needs that the agent doesn't have yet
#
# Usage:
#   ./bin/add-recipe.sh --recipe law-dynatrace-github-httptrigger-prvalidation --agent-dir ./demo1-dt-snow
#   ./bin/add-recipe.sh --recipe <name> --agent-dir <dir> --non-interactive

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
RECIPES_DIR="${SCRIPT_DIR}/../recipes"

# ─────────────────────────── Usage ───────────────────────────
# Print help and exit with status $1 (default 0).
usage() {
  cat <<EOF
Usage: $0 --recipe <name> --agent-dir <dir> [options]

Options:
  --recipe <name>      Recipe to add (required)
  --agent-dir <dir>    Existing agent directory (required)
  --set key=value      Override a prompt value (repeatable)
  --non-interactive    Use auto-detected + default values, no prompts
  --list               List available recipes and exit
  -h, --help           Show this help

What it does:
  1. Auto-detects values already in the agent (DT tenant, LAW ID, GitHub repo, etc.)
  2. Copies NEW config files (skills, subagents, http-triggers, hooks) — skips existing
  3. Merges only new toggles into agent.json (preserves identity/access/model)
  4. Appends only new connectors to connectors.json (skips duplicates by name)
  5. Only prompts for values the recipe needs that the agent doesn't already have

After adding:
  ./bin/deploy.sh <agent-dir>
EOF
  exit "${1:-0}"
}

RECIPE="" AGENT_DIR="" NON_INTERACTIVE=false LIST_ONLY=false
PRESET_FILE=$(mktemp /tmp/preset-add.XXXXXX)
VALUES_FILE=$(mktemp /tmp/values-add.XXXXXX)
# Minimal key=value store on flat files. Note the argument orders:
#   _set <file> <key> <value>   append an entry
#   _get <file> <key>           print the most recent value ('' when absent)
#   _has <key> <file>           succeed when the key is present
_set() { echo "${2}=${3}" >> "$1"; }
_get() { grep "^${2}=" "$1" 2>/dev/null | tail -1 | cut -d= -f2- || true; }
_has() { grep -q "^${1}=" "$2" 2>/dev/null; }
trap 'rm -f "$PRESET_FILE" "$VALUES_FILE" 2>/dev/null' EXIT

# Fail with a readable message (instead of an "unbound variable" abort under
# set -u) when a flag that takes a value is the last argument.
_need_value() {
  [[ $# -ge 2 ]] || { echo "Error: $1 requires a value" >&2; usage 1; }
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --recipe) _need_value "$@"; RECIPE="$2"; shift 2 ;;
    --agent-dir) _need_value "$@"; AGENT_DIR="$2"; shift 2 ;;
    --set) _need_value "$@"; key="${2%%=*}"; val="${2#*=}"; _set "$PRESET_FILE" "$key" "$val"; shift 2 ;;
    --non-interactive) NON_INTERACTIVE=true; shift ;;
    --list) LIST_ONLY=true; shift ;;
    -h|--help) usage 0 ;;
    *) echo "Unknown option: $1" >&2; usage 1 ;;
  esac
done

# ─────────────────────────── List recipes ───────────────────────────
if [[ "$LIST_ONLY" == "true" ]]; then
  echo "Available recipes:"
  echo
  for d in "${RECIPES_DIR}"/*/; do
    [[ -f "${d}agent.json" ]] || continue
    name=$(basename "$d")
    desc=$(jq -r '._description // "No description"' "${d}agent.json")
    printf '  %-45s %s\n' "$name" "$desc"
  done
  exit 0
fi

# ─────────────────────────── Validate inputs ───────────────────────────
[[ -n "$RECIPE" ]] || { echo "Error: --recipe is required" >&2; usage 1; }
[[ -n "$AGENT_DIR" ]] || { echo "Error: --agent-dir is required" >&2; usage 1; }

RECIPE_DIR="${RECIPES_DIR}/${RECIPE}"
[[ -d "$RECIPE_DIR" ]] || { echo "Recipe not found: ${RECIPE}" >&2; echo "Run $0 --list to see available recipes." >&2; exit 1; }
[[ -f "${RECIPE_DIR}/agent.json" ]] || { echo "Recipe missing agent.json: ${RECIPE}" >&2; exit 1; }
[[ -d "$AGENT_DIR" ]] || { echo "Agent directory not found: ${AGENT_DIR}" >&2; exit 1; }
[[ -f "${AGENT_DIR}/agent.json" ]] || { echo "Not an agent directory (no agent.json): ${AGENT_DIR}" >&2; exit 1; }

echo
echo "── Adding recipe: ${RECIPE} → ${AGENT_DIR} ──"
jq -r '._description // ""' "${RECIPE_DIR}/agent.json"
echo

# ─────────────────────────── Auto-detect existing values ───────────────────────────
echo "── Auto-detecting existing agent configuration ──"

PROMPTS=$(jq -c '._prompts // {}' "${RECIPE_DIR}/agent.json")
PROMPT_KEYS=$(echo "$PROMPTS" | jq -r 'keys[]')

# Succeed when the recipe marks prompt key $1 as a secret.
_is_secret() {
  [[ "$(jq -r --arg k "$1" '.[$k].secret // false' <<<"$PROMPTS")" == "true" ]]
}

# Record auto-detected value $2 for key $1 unless it is empty or the key was
# already preset (--set wins). Secret values are never echoed to the terminal.
_auto() {
  local k="$1" v="$2" shown
  [[ -n "$v" ]] || return 0
  if _has "$k" "$PRESET_FILE"; then return 0; fi
  _set "$PRESET_FILE" "$k" "$v"
  if [[ "$k" == "dtToken" ]] || _is_secret "$k"; then shown="(hidden)"; else shown="$v"; fi
  echo "  auto: ${k} = ${shown}"
}

# Extract identity from agent.json
_auto "agentName" "$(jq -r '.identity.agentName // ""' "${AGENT_DIR}/agent.json")"
_auto "resourceGroup" "$(jq -r '.identity.resourceGroup // ""' "${AGENT_DIR}/agent.json")"
_auto "location" "$(jq -r '.identity.location // ""' "${AGENT_DIR}/agent.json")"

# Extract LAW ID from connectors.json
if [[ -f "${AGENT_DIR}/connectors.json" ]]; then
  EXISTING_LAW=$(jq -r '.toggles.lawResourceId // ""' "${AGENT_DIR}/connectors.json")
  _auto "lawId" "$EXISTING_LAW"

  # Extract Dynatrace tenant from connector endpoint URL
  EXISTING_DT_ENDPOINT=$(jq -r '.connectors[]? | select(.name == "dynatrace") | .properties.extendedProperties.endpoint // ""' "${AGENT_DIR}/connectors.json" 2>/dev/null || echo "")
  if [[ -n "$EXISTING_DT_ENDPOINT" ]]; then
    # https://<tenant>.apps.dynatrace.com/... → extract tenant
    DT_TENANT_EXTRACTED=$(echo "$EXISTING_DT_ENDPOINT" | sed -n 's|https://\([^.]*\)\.apps\.dynatrace\.com.*|\1|p')
    _auto "dtTenant" "$DT_TENANT_EXTRACTED"
  fi
fi

# Extract Dynatrace token from secrets env (last assignment wins)
if [[ -f "${AGENT_DIR}/connectors.secrets.env" ]]; then
  EXISTING_DT_TOKEN=$(grep "^DYNATRACE_BEARER_TOKEN=" "${AGENT_DIR}/connectors.secrets.env" 2>/dev/null | tail -1 | cut -d= -f2- || echo "")
  if [[ -n "$EXISTING_DT_TOKEN" ]]; then
    _auto "dtToken" "$EXISTING_DT_TOKEN"
  fi
fi

# Extract GitHub repo from existing repos config
if compgen -G "${AGENT_DIR}/config/repos/*.yaml" > /dev/null 2>&1; then
  for rf in "${AGENT_DIR}"/config/repos/*.yaml; do
    [[ -f "$rf" ]] || continue
    EXISTING_REPO=$(grep -m1 'url:' "$rf" 2>/dev/null | sed 's/.*url: *"\{0,1\}\([^"]*\)"\{0,1\}/\1/' || echo "")
    # Ignore values that still contain an unresolved {{placeholder}} anywhere.
    if [[ -n "$EXISTING_REPO" && "$EXISTING_REPO" != *"{{"* ]]; then
      _auto "githubRepo" "$EXISTING_REPO"
      break
    fi
  done
fi

echo

# ─────────────────────────── Collect remaining inputs ───────────────────────────
# Prompt keys are plain identifiers, so word-splitting $PROMPT_KEYS is safe; a
# `while read` loop is avoided because the body itself reads from stdin.
for key in $PROMPT_KEYS; do
  # Skip identity fields — already in agent.json, not changing them
  case "$key" in agentName|resourceGroup|location|targetRGs|existingUamiId|modelProvider|existingAgentAppInsightsId)
    if _has "$key" "$PRESET_FILE"; then
      _set "$VALUES_FILE" "$key" "$(_get "$PRESET_FILE" "$key")"
    fi
    continue ;;
  esac

  # Already auto-detected or preset
  if _has "$key" "$PRESET_FILE"; then
    _set "$VALUES_FILE" "$key" "$(_get "$PRESET_FILE" "$key")"
    continue
  fi

  ask=$(echo "$PROMPTS" | jq -r --arg k "$key" '.[$k].ask // $k')
  default=$(echo "$PROMPTS" | jq -r --arg k "$key" '.[$k].default // ""')
  required=$(echo "$PROMPTS" | jq -r --arg k "$key" '.[$k].required // false')
  is_secret=$(echo "$PROMPTS" | jq -r --arg k "$key" '.[$k].secret // false')

  if [[ "$NON_INTERACTIVE" == "true" ]]; then
    if [[ -n "$default" ]]; then
      _set "$VALUES_FILE" "$key" "$default"
      if [[ "$is_secret" == "true" ]]; then
        echo "  ${ask}: (hidden) (default)"
      else
        echo "  ${ask}: ${default} (default)"
      fi
    elif [[ "$required" == "true" ]]; then
      echo "Error: ${key} is required, not auto-detected, and --non-interactive set" >&2
      echo "  Use --set ${key}=<value> to provide it" >&2
      exit 1
    fi
    continue
  fi

  prompt_text="  ${ask}"
  [[ -n "$default" ]] && prompt_text="${prompt_text} (${default})"
  prompt_text="${prompt_text}: "

  if [[ "$is_secret" == "true" ]]; then
    read -rsp "$prompt_text" val; echo "(hidden)"
  else
    read -rp "$prompt_text" val
  fi
  [[ -z "$val" ]] && val="$default"
  if [[ -z "$val" && "$required" == "true" ]]; then
    echo "  Error: ${key} is required" >&2; exit 1
  fi
  _set "$VALUES_FILE" "$key" "$val"
done

# ─────────────────────────── Copy config files (additive) ───────────────────────────
ADDED=0 SKIPPED=0

# Copy each regular file from $1 into $2 unless a file of the same name already
# exists there; $3 is the label used in log lines. Updates ADDED / SKIPPED.
copy_dir() {
  local src_dir="$1" dst_dir="$2" label="$3"
  [[ -d "$src_dir" ]] || return 0
  mkdir -p "$dst_dir"
  local f fname
  for f in "$src_dir"/*; do
    [[ -f "$f" ]] || continue
    fname=$(basename "$f")
    if [[ -f "${dst_dir}/${fname}" ]]; then
      echo "  skip ${label}/${fname} (already exists)"
      SKIPPED=$((SKIPPED + 1))
    else
      cp "$f" "${dst_dir}/${fname}"
      echo "  add  ${label}/${fname}"
      ADDED=$((ADDED + 1))
    fi
  done
}

echo
echo "── Copying config files ──"
copy_dir "${RECIPE_DIR}/config/skills" "${AGENT_DIR}/config/skills" "config/skills"
copy_dir "${RECIPE_DIR}/config/subagents" "${AGENT_DIR}/config/subagents" "config/subagents"
copy_dir "${RECIPE_DIR}/config/hooks" "${AGENT_DIR}/config/hooks" "config/hooks"
copy_dir "${RECIPE_DIR}/config/common-prompts" "${AGENT_DIR}/config/common-prompts" "config/common-prompts"
copy_dir "${RECIPE_DIR}/config/repos" "${AGENT_DIR}/config/repos" "config/repos"
copy_dir "${RECIPE_DIR}/config/tools" "${AGENT_DIR}/config/tools" "config/tools"
copy_dir "${RECIPE_DIR}/config/plugin-configs" "${AGENT_DIR}/config/plugin-configs" "config/plugin-configs"

echo
echo "── Copying automations ──"
copy_dir "${RECIPE_DIR}/automations/http-triggers" "${AGENT_DIR}/automations/http-triggers" "automations/http-triggers"
copy_dir "${RECIPE_DIR}/automations/scheduled-tasks" "${AGENT_DIR}/automations/scheduled-tasks" "automations/scheduled-tasks"
copy_dir "${RECIPE_DIR}/automations/incident-filters" "${AGENT_DIR}/automations/incident-filters" "automations/incident-filters"
copy_dir "${RECIPE_DIR}/automations/incident-platforms" "${AGENT_DIR}/automations/incident-platforms" "automations/incident-platforms"

# ─────────────────────────── Merge toggles into agent.json ───────────────────────────
echo
echo "── Merging toggles into agent.json ──"

# Extract only the toggles from the recipe's agent.json (ignore everything else)
RECIPE_TOGGLES=$(jq -c '.toggles // {}' "${RECIPE_DIR}/agent.json")
if [[ "$RECIPE_TOGGLES" != "{}" ]]; then
  # Merge: recipe toggles override only the keys they set; existing toggles preserved
  jq --argjson rt "$RECIPE_TOGGLES" '.toggles = (.toggles // {} | . * $rt)' \
    "${AGENT_DIR}/agent.json" > "${AGENT_DIR}/agent.json.tmp"
  mv "${AGENT_DIR}/agent.json.tmp" "${AGENT_DIR}/agent.json"
  echo "  merged toggles: $(echo "$RECIPE_TOGGLES" | jq -r 'keys | join(", ")')"
else
  echo "  no toggles to merge"
fi

# ─────────────────────────── Append connectors ───────────────────────────
if [[ -f "${RECIPE_DIR}/connectors.json" ]]; then
  echo
  echo "── Merging connectors ──"

  AGENT_CONNECTORS="${AGENT_DIR}/connectors.json"
  # An agent without connectors.json starts from an empty document, so recipe
  # toggles are not silently dropped and the append below has a file to edit.
  if [[ ! -f "$AGENT_CONNECTORS" ]]; then
    echo '{}' > "$AGENT_CONNECTORS"
    echo "  created connectors.json"
  fi

  # Merge connector toggles (LAW, AppInsights, AzMon settings)
  RECIPE_CONN_TOGGLES=$(jq -c '.toggles // {}' "${RECIPE_DIR}/connectors.json")
  if [[ "$RECIPE_CONN_TOGGLES" != "{}" ]]; then
    jq --argjson rt "$RECIPE_CONN_TOGGLES" '.toggles = (.toggles // {} | . * $rt)' \
      "$AGENT_CONNECTORS" > "${AGENT_CONNECTORS}.tmp"
    mv "${AGENT_CONNECTORS}.tmp" "$AGENT_CONNECTORS"
    echo "  merged connector toggles"
  fi

  # Append new connectors (by name, skip duplicates)
  RECIPE_CONNECTORS=$(jq -c '.connectors // []' "${RECIPE_DIR}/connectors.json")
  if [[ "$RECIPE_CONNECTORS" != "[]" ]]; then
    EXISTING_NAMES=$(jq -r '.connectors // [] | .[].name' "$AGENT_CONNECTORS" 2>/dev/null || echo "")
    NEW_CONNS='[]'
    # Process substitution keeps the loop in this shell, so NEW_CONNS survives.
    while IFS= read -r conn; do
      [[ -n "$conn" ]] || continue
      cname=$(jq -r '.name' <<<"$conn")
      # -F: connector names are compared literally, not as regexes
      if grep -Fqx -- "$cname" <<<"$EXISTING_NAMES"; then
        echo "  skip connector: ${cname} (already exists)"
      else
        NEW_CONNS=$(jq -c --argjson c "$conn" '. + [$c]' <<<"$NEW_CONNS")
        echo "  add  connector: ${cname}"
      fi
    done < <(jq -c '.[]' <<<"$RECIPE_CONNECTORS")
    if [[ "$NEW_CONNS" != "[]" ]]; then
      jq --argjson nc "$NEW_CONNS" '.connectors = (.connectors // [] | . + $nc)' \
        "$AGENT_CONNECTORS" > "${AGENT_CONNECTORS}.tmp"
      mv "${AGENT_CONNECTORS}.tmp" "$AGENT_CONNECTORS"
    fi
  fi
fi

# ─────────────────────────── Replace placeholders ───────────────────────────
# Escape the characters that are special in the replacement part of the
# `s|…|…|` commands used below: backslash, ampersand and the `|` delimiter.
_sed_escape_repl() {
  printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g'
}

if [[ -s "$VALUES_FILE" ]]; then
  echo
  echo "── Replacing placeholders ──"
  # Every *.json / *.yaml / *.md file under the agent dir is scanned; only files
  # that actually contain a {{key}} or {{key:bool}} for a collected value are
  # rewritten. NUL-delimited find output keeps paths with spaces intact.
  while IFS= read -r -d '' file; do
    content=$(cat "$file")
    changed=false
    while IFS="=" read -r key val || [[ -n "$key" ]]; do
      [[ -n "$key" ]] || continue
      if [[ "$content" == *"{{${key}}}"* || "$content" == *"{{${key}:bool}}"* ]]; then
        # "{{key:bool}}" (including its quotes) becomes a bare JSON boolean:
        # true when the value is non-empty, false otherwise.
        if [[ -n "$val" ]]; then bool_val=true; else bool_val=false; fi
        safe_val=$(_sed_escape_repl "$val")
        content=$(printf '%s\n' "$content" |
          sed -e "s|\"{{${key}:bool}}\"|${bool_val}|g" -e "s|{{${key}}}|${safe_val}|g")
        changed=true
      fi
    done < "$VALUES_FILE"
    if [[ "$changed" == "true" ]]; then
      printf '%s\n' "$content" > "$file"
      echo "  replaced placeholders in $(basename "$file")"
    fi
  done < <(find "$AGENT_DIR" -type f \( -name '*.json' -o -name '*.yaml' -o -name '*.md' \) -print0)
fi

# ─────────────────────────── Write secrets ───────────────────────────
SECRETS_ENV="${AGENT_DIR}/connectors.secrets.env"
if [[ -s "$VALUES_FILE" ]]; then
  while IFS="=" read -r key val || [[ -n "$key" ]]; do
    is_secret=$(echo "$PROMPTS" | jq -r --arg k "$key" '.[$k].secret // false')
    if [[ "$is_secret" == "true" && -n "$val" ]]; then
      case "$key" in
        dtToken)
          if ! grep -q "DYNATRACE_BEARER_TOKEN=" "$SECRETS_ENV" 2>/dev/null; then
            echo "DYNATRACE_BEARER_TOKEN=${val}" >> "$SECRETS_ENV"
            chmod 600 "$SECRETS_ENV" # the file now holds a bearer token
            echo "  added DYNATRACE_BEARER_TOKEN to secrets"
          fi ;;
      esac
    fi
  done < "$VALUES_FILE"
fi

# ─────────────────────────── Summary ───────────────────────────
echo
echo "── Done ──"
echo "  Added: ${ADDED} files"
echo "  Skipped: ${SKIPPED} files (already existed)"
echo
echo "Next step:"
echo "  ./bin/deploy.sh ${AGENT_DIR}"
diff --git a/sreagent-templates/bin/deploy.sh b/sreagent-templates/bin/deploy.sh index 66f725ea9..925e35a0a 100755 --- a/sreagent-templates/bin/deploy.sh +++ b/sreagent-templates/bin/deploy.sh @@ -208,15 +208,76 @@ fi # ── Run the deployment with progress visible ── TMP=$(mktemp) +# ── Pre-deploy: auto-create VNet subnet with delegation if networkConfiguration.type=vnet ── +AGENT_JSON_FILE="${INPUT}/agent.json" +if [[ -f "$AGENT_JSON_FILE" ]]; then + NET_TYPE=$(jq -r '.networkConfiguration.type // "unrestricted"' "$AGENT_JSON_FILE" | tr '[:upper:]' '[:lower:]') + if [[ "$NET_TYPE" == "vnet" || "$NET_TYPE" == "azurevnet" ]]; then + NET_SUBNET_ID=$(jq -r '.networkConfiguration.subnetId // ""' "$AGENT_JSON_FILE") + NET_RG=$(jq -r '.networkConfiguration.resourceGroup // ""' "$AGENT_JSON_FILE") + NET_VNET=$(jq -r '.networkConfiguration.vnetName // ""' "$AGENT_JSON_FILE") + NET_SUBNET_NAME=$(jq -r '.networkConfiguration.subnetName // "agent-subnet"' "$AGENT_JSON_FILE") + NET_SUBNET_PREFIX=$(jq -r '.networkConfiguration.subnetPrefix // "10.2.0.0/28"' 
"$AGENT_JSON_FILE") + + # Resolve subnet ID from broken-out fields if not given directly + if [[ -z "$NET_SUBNET_ID" && -n "$NET_VNET" && -n "$NET_RG" ]]; then + NET_SUBNET_ID="/subscriptions/${SUB}/resourceGroups/${NET_RG}/providers/Microsoft.Network/virtualNetworks/${NET_VNET}/subnets/${NET_SUBNET_NAME}" + fi + + if [[ -n "$NET_SUBNET_ID" ]]; then + # Extract components from subnet ID (macOS-compatible) + _vnet_rg=$(echo "$NET_SUBNET_ID" | sed 's|.*/resourceGroups/||' | sed 's|/.*||') + _vnet_name=$(echo "$NET_SUBNET_ID" | sed 's|.*/virtualNetworks/||' | sed 's|/.*||') + _subnet_name=$(echo "$NET_SUBNET_ID" | sed 's|.*/subnets/||') + + # Check if subnet exists + if ! az network vnet subnet show -g "$_vnet_rg" --vnet-name "$_vnet_name" -n "$_subnet_name" &>/dev/null; then + echo "── Creating VNet subnet with Microsoft.App/environments delegation ──" + echo " VNet: $_vnet_name Subnet: $_subnet_name Prefix: $NET_SUBNET_PREFIX" + az network vnet subnet create \ + -g "$_vnet_rg" \ + --vnet-name "$_vnet_name" \ + -n "$_subnet_name" \ + --address-prefixes "$NET_SUBNET_PREFIX" \ + --delegations "Microsoft.App/environments" \ + --output none 2>&1 || { echo " ⚠ Failed to create subnet — VNet integration may fail"; } + echo " ✅ Subnet created" + else + # Verify delegation exists + _delegation=$(az network vnet subnet show -g "$_vnet_rg" --vnet-name "$_vnet_name" -n "$_subnet_name" --query "delegations[0].serviceName" -o tsv 2>/dev/null) + if [[ "$_delegation" != "Microsoft.App/environments" ]]; then + echo " ⚠ Subnet $_subnet_name exists but missing Microsoft.App/environments delegation" + echo " Adding delegation..." 
+ az network vnet subnet update \ + -g "$_vnet_rg" \ + --vnet-name "$_vnet_name" \ + -n "$_subnet_name" \ + --delegations "Microsoft.App/environments" \ + --output none 2>&1 || echo " ⚠ Failed to add delegation" + else + echo " VNet subnet $_subnet_name ready (delegation: Microsoft.App/environments)" + fi + fi + fi + fi +fi + echo "Starting deployment (this typically takes 3-5 min)..." echo "Tip: open another terminal and run 'az deployment operation sub list -n $NAME -o table' to watch progress." echo +# Auto-detect redeploy: if agent already exists, skip role assignments to avoid RoleAssignmentExists +SKIP_RBAC="" +if az resource show -g "$RG" --resource-type "Microsoft.App/agents" -n "$AG" --query "name" -o tsv &>/dev/null; then + echo " Agent '$AG' already exists — skipping role assignments on redeploy." + SKIP_RBAC="skipRoleAssignments=true" +fi + az deployment sub create \ --location "$LOC" \ --name "$NAME" \ --template-file "$TEMPLATE" \ - --parameters "@${FILE}" \ + --parameters "@${FILE}" ${SKIP_RBAC:+--parameters $SKIP_RBAC} \ --output json > "$TMP" 2>&1 AZ_RC=$? cat "$TMP" @@ -269,19 +330,33 @@ check_connector_health() { } if [[ "$STATE" != "Succeeded" ]]; then - echo - echo -e "${RED}${BOLD}══════════ Deployment FAILED ══════════${NC}" - # Extract the most useful error message - ERR_MSG=$(jq -r '.. | .message? // empty' "$TMP" 2>/dev/null | grep -v "^At least" | head -3) - if [[ -n "$ERR_MSG" ]]; then + # Check if this is a non-fatal RoleAssignmentExists error (common on redeploy) + ROLE_ERR=$(jq -r '.. | .code? 
// empty' "$TMP" 2>/dev/null | grep -c "RoleAssignmentExists" || true) + AGENT_EXISTS=$(az resource list -g "$RG" --resource-type "Microsoft.App/agents" --query "[?name=='$AG'].name" -o tsv 2>/dev/null) + + if [[ "$ROLE_ERR" -gt 0 && -n "$AGENT_EXISTS" ]]; then + echo + echo -e "${YELLOW}${BOLD}── Deployment partially failed (RoleAssignmentExists) ──${NC}" + echo -e " ${YELLOW}Role assignments already exist — this is safe on redeploy.${NC}" + echo -e " Agent ${CYAN}${AG}${NC} exists. Continuing to apply-extras..." + echo + # Override STATE so the rest of the script continues + STATE="Succeeded" + else echo - echo -e " ${RED}Root cause:${NC}" - echo "$ERR_MSG" | sed 's/^/ /' + echo -e "${RED}${BOLD}══════════ Deployment FAILED ══════════${NC}" + # Extract the most useful error message + ERR_MSG=$(jq -r '.. | .message? // empty' "$TMP" 2>/dev/null | grep -v "^At least" | head -3) + if [[ -n "$ERR_MSG" ]]; then + echo + echo -e " ${RED}Root cause:${NC}" + echo "$ERR_MSG" | sed 's/^/ /' + fi + echo + echo " Debug: az deployment operation sub list -n $NAME -o table" + echo + exit 1 fi - echo - echo " Debug: az deployment operation sub list -n $NAME -o table" - echo - exit 1 fi echo diff --git a/sreagent-templates/bin/export-agent.sh b/sreagent-templates/bin/export-agent.sh index 346fc0b50..328bd3e9e 100755 --- a/sreagent-templates/bin/export-agent.sh +++ b/sreagent-templates/bin/export-agent.sh @@ -900,17 +900,26 @@ for i in $(seq 0 $((CONNECTOR_COUNT - 1))); do ctype=$(echo "$CONNECTORS" | jq -r --argjson i "$i" '.[$i].properties.dataConnectorType') case "$ctype" in AppInsights) - ENABLE_AI=true - AI_RESOURCE_ID=$(echo "$CONNECTORS" | jq -r --argjson i "$i" '.[$i].properties.dataSource // .[$i].properties.extendedProperties.armResourceId // ""') - AI_APP_ID=$(echo "$CONNECTORS" | jq -r --argjson i "$i" '.[$i].properties.extendedProperties.appId // ""') + # Only capture the FIRST AppInsights connector for the toggle + if [[ "$ENABLE_AI" == "false" ]]; then + 
ENABLE_AI=true + AI_RESOURCE_ID=$(echo "$CONNECTORS" | jq -r --argjson i "$i" '.[$i].properties.dataSource // .[$i].properties.extendedProperties.armResourceId // ""') + AI_APP_ID=$(echo "$CONNECTORS" | jq -r --argjson i "$i" '.[$i].properties.extendedProperties.appId // ""') + fi ;; LogAnalytics) - ENABLE_LAW=true - LAW_RESOURCE_ID=$(echo "$CONNECTORS" | jq -r --argjson i "$i" '.[$i].properties.dataSource // .[$i].properties.extendedProperties.armResourceId // ""') + # Only capture the FIRST LogAnalytics connector for the toggle + if [[ "$ENABLE_LAW" == "false" ]]; then + ENABLE_LAW=true + LAW_RESOURCE_ID=$(echo "$CONNECTORS" | jq -r --argjson i "$i" '.[$i].properties.dataSource // .[$i].properties.extendedProperties.armResourceId // ""') + fi ;; AzureMonitor) - ENABLE_AZMON=true - AZMON_LOOKBACK=$(echo "$CONNECTORS" | jq -r --argjson i "$i" '.[$i].properties.extendedProperties.lookbackDays // 7') + # Only capture the FIRST AzureMonitor connector for the toggle + if [[ "$ENABLE_AZMON" == "false" ]]; then + ENABLE_AZMON=true + AZMON_LOOKBACK=$(echo "$CONNECTORS" | jq -r --argjson i "$i" '.[$i].properties.extendedProperties.lookbackDays // 7') + fi ;; esac done @@ -1050,10 +1059,29 @@ done CONNECTORS_CLEAN=$(sanitize "$CONNECTORS_CLEAN") # ── Write connectors.json ── -# Toggle-managed types (AppInsights, LogAnalytics, AzureMonitor) go into toggles. -# All other connectors (MCP, Kusto, etc.) go into the connectors array. -TOGGLE_TYPES="AppInsights|LogAnalytics|AzureMonitor" -CONNECTORS_ARRAY=$(echo "$CONNECTORS_CLEAN" | jq -c --arg tt "$TOGGLE_TYPES" '[.[] | select(.properties.dataConnectorType | test("^(\($tt))$") | not)]') +# Toggle-managed types (AppInsights, LogAnalytics, AzureMonitor) map to Bicep +# parameters that create ARM resources. The FIRST connector of each toggle type +# goes into toggles; any ADDITIONAL connectors of the same type (e.g. a second +# LAW workspace) stay in the connectors array (deployed via data-plane). 
+TOGGLE_NAMES="" +[[ "$ENABLE_AI" == "true" ]] && { + first_ai=$(echo "$CONNECTORS_CLEAN" | jq -r '[.[] | select(.properties.dataConnectorType == "AppInsights")][0].name') + [[ "$first_ai" != "null" ]] && TOGGLE_NAMES="${TOGGLE_NAMES}${first_ai}|" +} +[[ "$ENABLE_LAW" == "true" ]] && { + first_law=$(echo "$CONNECTORS_CLEAN" | jq -r '[.[] | select(.properties.dataConnectorType == "LogAnalytics")][0].name') + [[ "$first_law" != "null" ]] && TOGGLE_NAMES="${TOGGLE_NAMES}${first_law}|" +} +[[ "$ENABLE_AZMON" == "true" ]] && { + first_azmon=$(echo "$CONNECTORS_CLEAN" | jq -r '[.[] | select(.properties.dataConnectorType == "AzureMonitor")][0].name') + [[ "$first_azmon" != "null" ]] && TOGGLE_NAMES="${TOGGLE_NAMES}${first_azmon}|" +} +TOGGLE_NAMES="${TOGGLE_NAMES%|}" # strip trailing | +if [[ -n "$TOGGLE_NAMES" ]]; then + CONNECTORS_ARRAY=$(echo "$CONNECTORS_CLEAN" | jq -c --arg tn "$TOGGLE_NAMES" '[.[] | select(.name | test("^(\($tn))$") | not)]') +else + CONNECTORS_ARRAY=$(echo "$CONNECTORS_CLEAN" | jq -c '.') +fi echo "$CONNECTORS_ARRAY" | jq --argjson enableAI "$ENABLE_AI" --arg aiResId "$AI_RESOURCE_ID" --arg aiAppId "$AI_APP_ID" \ --argjson enableLAW "$ENABLE_LAW" --arg lawResId "$LAW_RESOURCE_ID" \ @@ -1071,7 +1099,7 @@ echo "$CONNECTORS_ARRAY" | jq --argjson enableAI "$ENABLE_AI" --arg aiResId "$AI "connectors": . 
}' > "${EXPORT_DIR}/connectors.json" CONN_COUNT=$(echo "$CONNECTORS_ARRAY" | jq 'length') -_log "Wrote connectors.json (${CONN_COUNT} connector(s) + toggles)" +_log "Wrote connectors.json (${CONN_COUNT} extra connector(s) + toggles)" _log "Wrote connectors.secrets.env (secrets extracted — DO NOT commit)" @@ -1093,9 +1121,8 @@ cat > "${EXPORT_DIR}/.gitignore" << 'GITIGNORE' # Secrets — never commit connectors.secrets.env *.secrets.env - -# Downloaded data (can be large) -data/ +# Generated verification spec +expected-config.json GITIGNORE _log "Wrote .gitignore" @@ -1114,7 +1141,7 @@ fi if [[ "$ENABLE_AZMON" == "true" ]]; then EXPECTED_CONNECTORS=$(echo "$EXPECTED_CONNECTORS" | jq '. + [{"name":"azure-monitor","type":"AzureMonitor"}]') fi -# Array connectors (MCP, Kusto, etc.) — skip null/empty entries +# Array connectors (MCP, extra LAW/AI/AzMon, Kusto, etc.) for i in $(seq 0 $(($(echo "$CONNECTORS_ARRAY" | jq 'length') - 1))); do cname=$(echo "$CONNECTORS_ARRAY" | jq -r --argjson i "$i" '.[$i].name') ctype=$(echo "$CONNECTORS_ARRAY" | jq -r --argjson i "$i" '.[$i].properties.dataConnectorType') diff --git a/sreagent-templates/bin/new-agent.sh b/sreagent-templates/bin/new-agent.sh index fd0580253..a960cfb8d 100755 --- a/sreagent-templates/bin/new-agent.sh +++ b/sreagent-templates/bin/new-agent.sh @@ -223,14 +223,14 @@ done < "$VALUES_FILE" mv "$MAPPED_FILE" "$VALUES_FILE" # Replace {{placeholders}} with user values in all JSON and YAML files -for file in $(find "$OUTPUT" -name '*.json' -o -name '*.yaml' -type f); do +for file in $(find "$OUTPUT" -type f \( -name '*.json' -o -name '*.yaml' \)); do content=$(cat "$file") while IFS="=" read -r key val || [[ -n "$key" ]]; do # Handle {{key:bool}} — converts non-empty to true, empty to false content=$(echo "$content" | sed "s|\"{{${key}:bool}}\"|$(if [[ -n "$val" ]]; then echo "true"; else echo "false"; fi)|g") # Handle {{key}} that's a comma-separated list → JSON array if [[ "$key" == "targetRGs" && "$val" == *,* ]]; 
then - json_array=$(echo "$val" | tr ',' '\n' | sed 's/^ */"/;s/ *$/"/;' | paste -sd, | sed 's/^/[/;s/$/]/') + json_array=$(echo "$val" | tr ',' '\n' | sed 's/^ */"/;s/ *$/"/;' | paste -s -d, - | sed 's/^/[/;s/$/]/') content=$(echo "$content" | sed "s|\"{{${key}}}\"| ${json_array}|g") else content=$(echo "$content" | sed "s|{{${key}}}|${val}|g") diff --git a/sreagent-templates/bin/ps/Add-Recipe.ps1 b/sreagent-templates/bin/ps/Add-Recipe.ps1 new file mode 100644 index 000000000..c8aad13a3 --- /dev/null +++ b/sreagent-templates/bin/ps/Add-Recipe.ps1 @@ -0,0 +1,368 @@ +<# +.SYNOPSIS + Add a recipe's components to an existing agent directory (non-destructive merge). + +.DESCRIPTION + Augments an existing agent with a recipe's config files. Auto-detects values + already configured (DT, LAW, GitHub repo, etc.) — only prompts for missing values. + + Does NOT overwrite agent.json identity/access/model — only merges toggles. + Does NOT duplicate connectors — skips if connector name already exists. 
+ +.EXAMPLE + ./Add-Recipe.ps1 -Recipe law-dynatrace-github-httptrigger-prvalidation -AgentDir ./demo1-dt-snow + ./Add-Recipe.ps1 -Recipe law-dynatrace-github-httptrigger-prvalidation -AgentDir ./demo1-dt-snow -NonInteractive + ./Add-Recipe.ps1 -Recipe law-dynatrace-github-httptrigger-prvalidation -AgentDir ./demo1-dt-snow -Set @{githubRepo='org/repo'} + +.NOTES + After adding: + ./Deploy-Agent.ps1 +#> +[CmdletBinding()] +param( + [Alias("r")] + [string]$Recipe, + + [Parameter(Mandatory)] + [string]$AgentDir, + + [switch]$List, + + [Parameter()] + $Set, + + [switch]$NonInteractive, + + [switch]$NoTelemetry +) + +Set-StrictMode -Version Latest +if ($PSVersionTable.PSVersion.Major -ge 7 -and $PSVersionTable.PSVersion.Minor -ge 3) { + $PSNativeCommandArgumentPassing = 'Legacy' +} +$ErrorActionPreference = 'Stop' + +$ScriptDir = $PSScriptRoot +$BinDir = Split-Path $ScriptDir -Parent +$RecipesDir = Join-Path (Split-Path $BinDir -Parent) 'recipes' + +# Dot-source prereq checker + jq wrapper +. (Join-Path $ScriptDir 'Check-Prerequisites.ps1') +if (-not (Test-Prerequisites)) { exit 1 } +. (Join-Path $ScriptDir 'Invoke-Jq.ps1') + +# ─────────────────────────── Parse -Set into hashtable ─────────────────────────── + +$Presets = @{} +if ($Set) { + if ($Set -is [hashtable]) { + $Presets = $Set.Clone() + } + elseif ($Set -is [string[]]) { + foreach ($item in $Set) { + $eqIdx = $item.IndexOf('=') + if ($eqIdx -gt 0) { + $Presets[$item.Substring(0, $eqIdx)] = $item.Substring($eqIdx + 1) + } + else { Write-Error "Invalid -Set value: '$item'. Expected key=value." } + } + } + elseif ($Set -is [string]) { + foreach ($item in $Set -split ',') { + $item = $item.Trim() + $eqIdx = $item.IndexOf('=') + if ($eqIdx -gt 0) { + $Presets[$item.Substring(0, $eqIdx)] = $item.Substring($eqIdx + 1) + } + elseif ($item -ne '') { Write-Error "Invalid -Set value: '$item'. Expected key=value." } + } + } + else { Write-Error "-Set must be a hashtable, string array, or comma-separated string." 
} +} + +# ─────────────────────────── List recipes ─────────────────────────── + +if ($List) { + Write-Host 'Available recipes:' -ForegroundColor Cyan + Write-Host '' + foreach ($d in Get-ChildItem -Path $RecipesDir -Directory -ErrorAction SilentlyContinue) { + $aj = Join-Path $d.FullName 'agent.json' + if (Test-Path $aj) { + $desc = Invoke-Jq -Raw -Filter '._description // "No description"' -InputFile $aj + Write-Host " $($d.Name.PadRight(45)) $desc" + } + } + exit 0 +} + +# ─────────────────────────── Validate inputs ─────────────────────────── + +if (-not $Recipe) { Write-Error '-Recipe is required. Run with -List to see available recipes.' } + +$RecipeDir = Join-Path $RecipesDir $Recipe +if (-not (Test-Path $RecipeDir -PathType Container)) { + Write-Error "Recipe not found: $Recipe`nRun with -List to see available recipes." +} +$RecipeAgentJson = Join-Path $RecipeDir 'agent.json' +if (-not (Test-Path $RecipeAgentJson)) { Write-Error "Recipe missing agent.json: $Recipe" } + +$AgentDir = (Resolve-Path $AgentDir -ErrorAction Stop).Path +$AgentAgentJson = Join-Path $AgentDir 'agent.json' +if (-not (Test-Path $AgentAgentJson)) { Write-Error "Not an agent directory (no agent.json): $AgentDir" } + +Write-Host '' +Write-Host "── Adding recipe: $Recipe → $AgentDir ──" -ForegroundColor Cyan +Invoke-Jq -Raw -Filter '._description // ""' -InputFile $RecipeAgentJson | ForEach-Object { Write-Host $_ } +Write-Host '' + +# ─────────────────────────── Auto-detect existing values ─────────────────────────── + +Write-Host '── Auto-detecting existing agent configuration ──' -ForegroundColor Cyan + +function Auto-Set([string]$Key, [string]$Value) { + if ($Value -and -not $Presets.ContainsKey($Key)) { + $Presets[$Key] = $Value + Write-Host " auto: $Key = $Value" + } +} + +# Identity from agent.json +Auto-Set 'agentName' (Invoke-Jq -Raw -Filter '.identity.agentName // ""' -InputFile $AgentAgentJson) +Auto-Set 'resourceGroup' (Invoke-Jq -Raw -Filter '.identity.resourceGroup // ""' 
-InputFile $AgentAgentJson) +Auto-Set 'location' (Invoke-Jq -Raw -Filter '.identity.location // ""' -InputFile $AgentAgentJson) + +# LAW and Dynatrace from connectors.json +$ConnectorsFile = Join-Path $AgentDir 'connectors.json' +if (Test-Path $ConnectorsFile) { + Auto-Set 'lawId' (Invoke-Jq -Raw -Filter '.toggles.lawResourceId // ""' -InputFile $ConnectorsFile) + + $dtEndpoint = Invoke-Jq -Raw -Filter '.connectors[]? | select(.name == "dynatrace") | .properties.extendedProperties.endpoint // ""' -InputFile $ConnectorsFile 2>$null + if ($dtEndpoint -match 'https://([^.]+)\.apps\.dynatrace\.com') { + Auto-Set 'dtTenant' $Matches[1] + } +} + +# Dynatrace token from secrets +$SecretsFile = Join-Path $AgentDir 'connectors.secrets.env' +if (Test-Path $SecretsFile) { + $dtTokenLine = Get-Content $SecretsFile | Where-Object { $_ -match '^DYNATRACE_BEARER_TOKEN=' } | Select-Object -First 1 + if ($dtTokenLine) { + Auto-Set 'dtToken' ($dtTokenLine -replace '^DYNATRACE_BEARER_TOKEN=', '') + } +} + +# GitHub repo from config/repos +$ReposDir = Join-Path $AgentDir 'config/repos' +if (Test-Path $ReposDir) { + foreach ($rf in Get-ChildItem -Path $ReposDir -Filter '*.yaml' -ErrorAction SilentlyContinue) { + $urlLine = Get-Content $rf.FullName | Where-Object { $_ -match 'url:' } | Select-Object -First 1 + if ($urlLine -match 'url:\s*"?([^"]+)"?' 
-and $Matches[1] -notmatch '\{\{') { + Auto-Set 'githubRepo' $Matches[1].Trim() + break + } + } +} + +Write-Host '' + +# ─────────────────────────── Collect remaining inputs ─────────────────────────── + +$promptsRaw = Invoke-Jq -Compact -Filter '._prompts // {}' -InputFile $RecipeAgentJson +if (-not $promptsRaw) { $promptsRaw = '{}' } +$Prompts = $promptsRaw | ConvertFrom-Json +$PromptKeys = Invoke-Jq -Raw -Filter '._prompts // {} | keys[]' -InputFile $RecipeAgentJson +$Values = @{} + +foreach ($key in $PromptKeys) { + # Skip identity fields + if ($key -in @('agentName', 'resourceGroup', 'location', 'targetRGs', 'existingUamiId', 'modelProvider', 'existingAgentAppInsightsId')) { + if ($Presets.ContainsKey($key)) { $Values[$key] = $Presets[$key] } + continue + } + + # Already auto-detected or preset + if ($Presets.ContainsKey($key)) { + $Values[$key] = $Presets[$key] + continue + } + + $promptDef = $Prompts.$key + $ask = if ($promptDef.PSObject.Properties['ask']) { $promptDef.ask } else { $key } + $default = if ($promptDef.PSObject.Properties['default'] -and $null -ne $promptDef.default) { "$($promptDef.default)" } else { '' } + $required = if ($promptDef.PSObject.Properties['required'] -and $promptDef.required -eq $true) { $true } else { $false } + $isSecret = if ($promptDef.PSObject.Properties['secret'] -and $promptDef.secret -eq $true) { $true } else { $false } + + if ($NonInteractive) { + if ($default -ne '') { + $Values[$key] = $default + Write-Host " ${ask}: $default (default)" + } + elseif ($required) { + Write-Error "${key} is required, not auto-detected, and -NonInteractive set. 
Use -Set @{${key}=''}" + } + continue + } + + $prompt = " ${ask}" + if ($default) { $prompt += " ($default)" } + $prompt += ': ' + + if ($isSecret) { + $val = Read-Host $prompt -AsSecureString + # Decode via NetworkCredential: works on every platform and leaves no unmanaged BSTR to free. + # Marshal.PtrToStringAuto reads the BSTR as UTF-8 on Linux/macOS and truncates the secret. + $val = [System.Net.NetworkCredential]::new('', $val).Password + } + else { $val = Read-Host $prompt } + + if (-not $val) { $val = $default } + if (-not $val -and $required) { Write-Error "${key} is required." } + $Values[$key] = $val +} + +# ─────────────────────────── Copy config files (additive) ─────────────────────────── + +$Added = 0; $Skipped = 0 + +function Copy-DirAdditive([string]$SrcDir, [string]$DstDir, [string]$Label) { + if (-not (Test-Path $SrcDir -PathType Container)) { return } + if (-not (Test-Path $DstDir)) { New-Item -ItemType Directory -Path $DstDir -Force | Out-Null } + foreach ($f in Get-ChildItem -Path $SrcDir -File) { + $dst = Join-Path $DstDir $f.Name + if (Test-Path $dst) { + Write-Host " skip $Label/$($f.Name) (already exists)" + $script:Skipped++ + } + else { + Copy-Item $f.FullName $dst + Write-Host " add $Label/$($f.Name)" + $script:Added++ + } + } +} + +Write-Host '' +Write-Host '── Copying config files ──' -ForegroundColor Cyan +Copy-DirAdditive (Join-Path $RecipeDir 'config/skills') (Join-Path $AgentDir 'config/skills') 'config/skills' +Copy-DirAdditive (Join-Path $RecipeDir 'config/subagents') (Join-Path $AgentDir 'config/subagents') 'config/subagents' +Copy-DirAdditive (Join-Path $RecipeDir 'config/hooks') (Join-Path $AgentDir 'config/hooks') 'config/hooks' +Copy-DirAdditive (Join-Path $RecipeDir 'config/common-prompts') (Join-Path $AgentDir 'config/common-prompts') 'config/common-prompts' +Copy-DirAdditive (Join-Path $RecipeDir 'config/repos') (Join-Path $AgentDir 'config/repos') 'config/repos' +Copy-DirAdditive (Join-Path $RecipeDir 'config/tools') (Join-Path $AgentDir 'config/tools') 'config/tools' +Copy-DirAdditive (Join-Path $RecipeDir 'config/plugin-configs') (Join-Path 
$AgentDir 'config/plugin-configs') 'config/plugin-configs' + +Write-Host '' +Write-Host '── Copying automations ──' -ForegroundColor Cyan +Copy-DirAdditive (Join-Path $RecipeDir 'automations/http-triggers') (Join-Path $AgentDir 'automations/http-triggers') 'automations/http-triggers' +Copy-DirAdditive (Join-Path $RecipeDir 'automations/scheduled-tasks') (Join-Path $AgentDir 'automations/scheduled-tasks') 'automations/scheduled-tasks' +Copy-DirAdditive (Join-Path $RecipeDir 'automations/incident-filters') (Join-Path $AgentDir 'automations/incident-filters') 'automations/incident-filters' +Copy-DirAdditive (Join-Path $RecipeDir 'automations/incident-platforms') (Join-Path $AgentDir 'automations/incident-platforms') 'automations/incident-platforms' + +# ─────────────────────────── Merge toggles into agent.json ─────────────────────────── + +Write-Host '' +Write-Host '── Merging toggles into agent.json ──' -ForegroundColor Cyan + +$recipeToggles = Invoke-Jq -Compact -Filter '.toggles // {}' -InputFile $RecipeAgentJson +if ($recipeToggles -and $recipeToggles -ne '{}') { + $tmpAgentJson = "$AgentAgentJson.tmp" + Invoke-Jq -Raw -Filter ". as `$root | `$root | .toggles = (.toggles // {} | . 
* $recipeToggles)" -InputFile $AgentAgentJson | Set-Content $tmpAgentJson -NoNewline + Move-Item $tmpAgentJson $AgentAgentJson -Force + $toggleKeys = Invoke-Jq -Raw -Filter '.toggles // {} | keys | join(", ")' -InputFile $RecipeAgentJson + Write-Host " merged toggles: $toggleKeys" +} +else { Write-Host ' no toggles to merge' } + +# ─────────────────────────── Append connectors ─────────────────────────── + +$RecipeConnJson = Join-Path $RecipeDir 'connectors.json' +if (Test-Path $RecipeConnJson) { + Write-Host '' + Write-Host '── Merging connectors ──' -ForegroundColor Cyan + + $AgentConnJson = Join-Path $AgentDir 'connectors.json' + + # Merge connector toggles + $recipeConnToggles = Invoke-Jq -Compact -Filter '.toggles // {}' -InputFile $RecipeConnJson + if ($recipeConnToggles -and $recipeConnToggles -ne '{}' -and (Test-Path $AgentConnJson)) { + $tmp = "$AgentConnJson.tmp" + Invoke-Jq -Raw -Filter ". as `$root | `$root | .toggles = (.toggles // {} | . * $recipeConnToggles)" -InputFile $AgentConnJson | Set-Content $tmp -NoNewline + Move-Item $tmp $AgentConnJson -Force + Write-Host ' merged connector toggles' + } + + # Append new connectors (skip duplicates by name) + $recipeConns = Invoke-Jq -Compact -Filter '.connectors // []' -InputFile $RecipeConnJson + if ($recipeConns -and $recipeConns -ne '[]') { + $existingNames = @(Invoke-Jq -Raw -Filter '.connectors // [] | .[].name' -InputFile $AgentConnJson 2>$null) + $newConns = @() + foreach ($conn in ($recipeConns | ConvertFrom-Json)) { + if ($existingNames -contains $conn.name) { + Write-Host " skip connector: $($conn.name) (already exists)" + } + else { + $newConns += $conn + Write-Host " add connector: $($conn.name)" + } + } + if ($newConns.Count -gt 0) { + $newConnsJson = $newConns | ConvertTo-Json -Compress -Depth 10 + if ($newConns.Count -eq 1) { $newConnsJson = "[$newConnsJson]" } + $tmp = "$AgentConnJson.tmp" + Invoke-Jq -Raw -Filter ".connectors = (.connectors // [] | . 
+ $newConnsJson)" -InputFile $AgentConnJson | Set-Content $tmp -NoNewline + Move-Item $tmp $AgentConnJson -Force + } + } +} + +# ─────────────────────────── Replace placeholders in new files ─────────────────────────── + +if ($Values.Count -gt 0) { + Write-Host '' + Write-Host '── Replacing placeholders ──' -ForegroundColor Cyan + $files = Get-ChildItem -Path $AgentDir -Recurse -Include '*.json', '*.yaml', '*.md' -File + foreach ($file in $files) { + $content = Get-Content $file.FullName -Raw + # Get-Content -Raw yields $null for an empty file; nothing to substitute there + if ([string]::IsNullOrEmpty($content)) { continue } + $changed = $false + foreach ($kv in $Values.GetEnumerator()) { + $placeholder = "{{$($kv.Key)}}" + $boolPlaceholder = "`"{{$($kv.Key):bool}}`"" + # Literal (non-regex) matching and replacement: a '$' in a user-supplied value must not be + # interpreted as a regex substitution group, which -replace would do. + if ($content.Contains($placeholder) -or $content.Contains($boolPlaceholder)) { + $boolVal = if ($kv.Value) { 'true' } else { 'false' } + $content = $content.Replace($boolPlaceholder, $boolVal) + $content = $content.Replace($placeholder, [string]$kv.Value) + $changed = $true + } + } + if ($changed) { + $content | Set-Content $file.FullName -NoNewline + Write-Host " replaced placeholders in $($file.Name)" + } + } +} + +# ─────────────────────────── Write secrets ─────────────────────────── + +$SecretsEnv = Join-Path $AgentDir 'connectors.secrets.env' +foreach ($kv in $Values.GetEnumerator()) { + $promptDef = $Prompts.($kv.Key) + $isSecret = $promptDef -and $promptDef.PSObject.Properties['secret'] -and $promptDef.secret -eq $true + if ($isSecret -and $kv.Value) { + switch ($kv.Key) { + 'dtToken' { + if (-not (Test-Path $SecretsEnv) -or -not (Get-Content $SecretsEnv | Where-Object { $_ -match '^DYNATRACE_BEARER_TOKEN=' })) { + Add-Content -Path $SecretsEnv -Value "DYNATRACE_BEARER_TOKEN=$($kv.Value)" + Write-Host ' added DYNATRACE_BEARER_TOKEN to secrets' + } + } + } + } +} + +# ─────────────────────────── Summary ─────────────────────────── + +Write-Host '' +Write-Host '── Done ──' -ForegroundColor Cyan +Write-Host " Added: $Added files" +Write-Host " Skipped: $Skipped files 
(already existed)" +Write-Host '' +Write-Host 'Next step:' +Write-Host " ./bin/ps/Deploy-Agent.ps1 $AgentDir" diff --git a/sreagent-templates/bin/ps/Deploy-Agent.ps1 b/sreagent-templates/bin/ps/Deploy-Agent.ps1 index 4d55c94d6..6611ca8fc 100644 --- a/sreagent-templates/bin/ps/Deploy-Agent.ps1 +++ b/sreagent-templates/bin/ps/Deploy-Agent.ps1 @@ -392,18 +392,87 @@ if ($WhatIf_) { exit $whatIfExit } +# ── Pre-deploy: auto-create VNet subnet with delegation if networkConfiguration.type=vnet ── +if ($IsDirectory) { + $AgentJsonFile = Join-Path $InputPath 'agent.json' + if (Test-Path $AgentJsonFile) { + $agentCfg = Get-Content $AgentJsonFile -Raw | ConvertFrom-Json + $netType = if ($agentCfg.PSObject.Properties['networkConfiguration'] -and $agentCfg.networkConfiguration.type) { + $agentCfg.networkConfiguration.type.ToLower() + } else { 'unrestricted' } + + if ($netType -eq 'vnet' -or $netType -eq 'azurevnet') { + $netCfg = $agentCfg.networkConfiguration + $netSubnetId = if ($netCfg.subnetId) { $netCfg.subnetId } else { '' } + $netRg = if ($netCfg.resourceGroup) { $netCfg.resourceGroup } else { '' } + $netVnet = if ($netCfg.vnetName) { $netCfg.vnetName } else { '' } + $netSubnetName = if ($netCfg.subnetName) { $netCfg.subnetName } else { 'agent-subnet' } + $netSubnetPrefix = if ($netCfg.subnetPrefix) { $netCfg.subnetPrefix } else { '10.2.0.0/28' } + + # Resolve subnet ID from broken-out fields if not given directly + if (-not $netSubnetId -and $netVnet -and $netRg) { + $netSubnetId = "/subscriptions/$SubscriptionId/resourceGroups/$netRg/providers/Microsoft.Network/virtualNetworks/$netVnet/subnets/$netSubnetName" + } + + if ($netSubnetId) { + # Extract components from subnet ID + $parts = $netSubnetId -split '/' + $vnetRgIdx = [array]::IndexOf($parts, 'resourceGroups') + 1 + $vnetIdx = [array]::IndexOf($parts, 'virtualNetworks') + 1 + $subnetIdx = [array]::IndexOf($parts, 'subnets') + 1 + $_vnet_rg = $parts[$vnetRgIdx] + $_vnet_name = $parts[$vnetIdx] + $_subnet_name = 
$parts[$subnetIdx] + + $subnetExists = az network vnet subnet show -g $_vnet_rg --vnet-name $_vnet_name -n $_subnet_name 2>$null + if (-not $subnetExists) { + Write-Header "── Creating VNet subnet with Microsoft.App/environments delegation ──" + Write-Host " VNet: $_vnet_name Subnet: $_subnet_name Prefix: $netSubnetPrefix" + az network vnet subnet create -g $_vnet_rg --vnet-name $_vnet_name -n $_subnet_name ` + --address-prefixes $netSubnetPrefix --delegations 'Microsoft.App/environments' --output none 2>&1 + if ($LASTEXITCODE -ne 0) { Write-Host ' ⚠ Failed to create subnet — VNet integration may fail' } + else { Write-Host ' ✅ Subnet created' } + } else { + $delegation = az network vnet subnet show -g $_vnet_rg --vnet-name $_vnet_name -n $_subnet_name ` + --query 'delegations[0].serviceName' -o tsv 2>$null + if ($delegation -ne 'Microsoft.App/environments') { + Write-Host " ⚠ Subnet $_subnet_name exists but missing Microsoft.App/environments delegation" + Write-Host ' Adding delegation...' + az network vnet subnet update -g $_vnet_rg --vnet-name $_vnet_name -n $_subnet_name ` + --delegations 'Microsoft.App/environments' --output none 2>&1 + } else { + Write-Host " VNet subnet $_subnet_name ready (delegation: Microsoft.App/environments)" + } + } + } + } + } +} + +# ── Auto-detect redeploy: skip role assignments to avoid RoleAssignmentExists ── +$SkipRbacParam = '' +$agentExistsCheck = az resource show -g $ResourceGroup --resource-type 'Microsoft.App/agents' -n $AgentName --query 'name' -o tsv 2>$null +if ($agentExistsCheck) { + Write-Host " Agent '$AgentName' already exists — skipping role assignments on redeploy." + $SkipRbacParam = 'skipRoleAssignments=true' +} + # ── Run the deployment ── Write-Host 'Starting deployment (this typically takes 3-5 min)...' Write-Host "Tip: open another terminal and run 'az deployment operation sub list -n $DeploymentName -o table' to watch progress." 
Write-Host '' # Capture stdout (JSON) cleanly; let stderr (warnings/errors) flow to console -$deployJson = az deployment sub create ` - --location $Location ` - --name $DeploymentName ` - --template-file $Template ` - --parameters "@$ParametersFile" ` - --output json | Out-String +$deployArgs = @( + 'deployment', 'sub', 'create', + '--location', $Location, + '--name', $DeploymentName, + '--template-file', $Template, + '--parameters', "@$ParametersFile", + '--output', 'json' +) +if ($SkipRbacParam) { $deployArgs += @('--parameters', $SkipRbacParam) } +$deployJson = & az @deployArgs | Out-String Write-Host $deployJson diff --git a/sreagent-templates/bin/ps/New-Agent.ps1 b/sreagent-templates/bin/ps/New-Agent.ps1 index a6346d962..ebb3bd464 100644 --- a/sreagent-templates/bin/ps/New-Agent.ps1 +++ b/sreagent-templates/bin/ps/New-Agent.ps1 @@ -378,7 +378,7 @@ $configDirs = @("skills", "subagents", "tools", "hooks", "common-prompts", "plug foreach ($d in $configDirs) { $configPath = Join-Path $Output "config/$d" if (Test-Path $configPath -PathType Container) { - $count = @(Get-ChildItem -Path "$configPath/*" -Include "*.json", "*.yaml" -File -ErrorAction SilentlyContinue).Count + $count = @(Get-ChildItem -Path $configPath -Include "*.json", "*.yaml" -File -Recurse -ErrorAction SilentlyContinue).Count if ($count -gt 0) { Write-Host (" {0,-24} <- {1} file(s)" -f "$d/", $count) } diff --git a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/.gitignore b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/.gitignore new file mode 100644 index 000000000..f903c363c --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/.gitignore @@ -0,0 +1,5 @@ +# Secrets — never commit +connectors.secrets.env +*.secrets.env +# Generated verification spec +expected-config.json diff --git a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/agent.json 
b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/agent.json new file mode 100644 index 000000000..eea4a8878 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/agent.json @@ -0,0 +1,25 @@ +{ + "_description": "SRE Agent configuration — edit these values to clone to a new environment.", + "_exported_at": "2026-05-20T14:37:08Z", + "identity": { + "agentName": "bookstore-agent", + "resourceGroup": "rg-sre-bookstore", + "subscription": "cbf44432-7f45-4906-a85d-d2b14a1e8328", + "location": "eastus2", + "targetResourceGroups": [ + "rg-bookstore-demo" + ] + }, + "access": { + "accessLevel": "High", + "actionMode": "Review" + }, + "upgradeChannel": "Preview", + "defaultModelProvider": "Anthropic", + "monthlyAgentUnitLimit": 10000, + "tags": {}, + "toggles": { + "enableWebhookBridge": false, + "webhookBridgeTriggerUrl": "" + } +} diff --git a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/automations/incident-filters/snow-p1p2p3.yaml b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/automations/incident-filters/snow-p1p2p3.yaml new file mode 100644 index 000000000..f0927e5d8 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/automations/incident-filters/snow-p1p2p3.yaml @@ -0,0 +1,26 @@ +metadata: + name: snow-p1p2p3 +spec: + incidentPlatform: ServiceNow + impactedService: '' + priorities: + - '1' + - '2' + - '3' + incidentType: '' + alertId: '' + titleContains: '' + titleContainsAll: [] + titleContainsAny: [] + titleNotContains: [] + agentMode: Autonomous + handlingAgent: onprem-investigator + owningTeamId: '' + owningTeamIds: [] + maxAutomatedInvestigationAttempts: 3 + deepInvestigationEnabled: false + mergeEnabled: true + mergeWindowHours: 3 + createdAt: '2026-05-13T17:52:10.2627347Z' + updatedAt: '2026-05-13T17:52:10.2627495Z' + isEnabled: true diff --git 
a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/automations/incident-platforms/servicenow.yaml b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/automations/incident-platforms/servicenow.yaml new file mode 100644 index 000000000..67a86a565 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/automations/incident-platforms/servicenow.yaml @@ -0,0 +1,4 @@ +name: servicenow +spec: + platformType: ServiceNow + connectionKey: ${SERVICENOW_API_KEY} diff --git a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/automations/scheduled-tasks/Bookstore Alert & Health Monitor.yaml b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/automations/scheduled-tasks/Bookstore Alert & Health Monitor.yaml new file mode 100644 index 000000000..7f534298d --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/automations/scheduled-tasks/Bookstore Alert & Health Monitor.yaml @@ -0,0 +1,34 @@ +metadata: + name: Bookstore Alert & Health Monitor +spec: + description: Checks for fired Azure Monitor alerts and overall health of the bookstore + Container App and PostgreSQL every 15 minutes + cronExpression: '*/15 * * * *' + startTime: '2026-05-13T20:54:13.5090808Z' + agentPrompt: "Autonomous Scheduled Run\n\nScope:\n- Subscription: cbf44432-7f45-4906-a85d-d2b14a1e8328\n\ + - Resource Group: rg-bookstore-demo\n- Container App: /subscriptions/cbf44432-7f45-4906-a85d-d2b14a1e8328/resourceGroups/rg-bookstore-demo/providers/Microsoft.App/containerApps/ca-bookstore-ixiytoaegn4xu\n\ + - PostgreSQL: /subscriptions/cbf44432-7f45-4906-a85d-d2b14a1e8328/resourceGroups/rg-bookstore-demo/providers/Microsoft.DBforPostgreSQL/flexibleServers/pg-ixiytoaegn4xu\n\ + - Log Analytics Workspace: law-bookstore-ixiytoaegn4xu (workspace ID: ceff0903-3ad5-4a5b-9e08-ae84db500098)\n\ + \nTime Window: Analyze ONLY last 15 minutes\n\nGoal: Detect and report on 
any\ + \ fired/active Azure Monitor alerts and check overall application health.\n\n\ + Steps:\n1. Check for fired/active metric alerts in rg-bookstore-demo:\n - az\ + \ monitor metrics alert list --resource-group rg-bookstore-demo --subscription\ + \ cbf44432-7f45-4906-a85d-d2b14a1e8328\n - Focus on alerts with currentSeverity\ + \ 0 or 1 that are in \"Fired\" state\n2. Check Container App health:\n - Verify\ + \ revision is running and healthy via az containerapp revision list\n - Check\ + \ replica count > 0\n3. Check PostgreSQL health:\n - Verify server state is\ + \ Ready via az postgres flexible-server show\n4. Optionally check recent container\ + \ logs for errors:\n - Query Log Analytics for error-level logs in last 15 minutes\ + \ if workspace has data\n\nConstraints: Max 10 API calls per execution. Read-only\ + \ operations only, no write operations.\n\nIdempotence: If all alerts are in \"\ + Resolved\" state, Container App is healthy with replicas running, and PostgreSQL\ + \ is Ready -> output \"All systems healthy. No active alerts.\"\n\nOutput Format:\n\ + - Overall Status: Healthy / Degraded / Critical\n- Active Alerts: list any fired\ + \ alerts with name, severity, and description\n- Container App: running status,\ + \ replica count, health state\n- PostgreSQL: state, CPU if available\n- Recommendations:\ + \ 1-3 actions if issues detected\n\nEscalation: If any Sev0 alert is fired, or\ + \ Container App has 0 replicas, or PostgreSQL is not Ready, mark as CRITICAL and\ + \ include \"Immediate attention required\" in output." 
+ createdBy: SREAgent + status: Paused + agentMode: autonomous diff --git a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/common-prompts/safety-rules.md b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/common-prompts/safety-rules.md new file mode 100644 index 000000000..8a28b3bec --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/common-prompts/safety-rules.md @@ -0,0 +1,5 @@ +## Safety rules +- Never delete or stop production resources without explicit human approval. +- Always confirm subscription and resource group before any write operation. +- For observability setup, prefer adding new resources over modifying existing ones. +- Never modify DNS or traffic routing without human approval — traffic cutover is high-risk. \ No newline at end of file diff --git a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/common-prompts/safety-rules.yaml b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/common-prompts/safety-rules.yaml new file mode 100644 index 000000000..1dece0896 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/common-prompts/safety-rules.yaml @@ -0,0 +1,14 @@ +metadata: + name: safety-rules +spec: + prompt: '## Safety rules + + - Never delete or stop production resources without explicit human approval. + + - Always confirm subscription and resource group before any write operation. + + - For observability setup, prefer adding new resources over modifying existing + ones. + + - Never modify DNS or traffic routing without human approval — traffic cutover + is high-risk.' 
diff --git a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/hooks/deny-prod-deletes.yaml b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/hooks/deny-prod-deletes.yaml new file mode 100644 index 000000000..9f3bd5779 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/hooks/deny-prod-deletes.yaml @@ -0,0 +1,12 @@ +name: deny-prod-deletes +type: GlobalHook +tags: [] +properties: + eventType: PreToolUse + activationMode: always + hook: + type: prompt + prompt: If the tool targets a production resource (name contains 'prod' or 'prd'), + deny. + matcher: ^(delete_|remove_).* + timeout: 30 diff --git a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/skills/diagnose-onprem.md b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/skills/diagnose-onprem.md new file mode 100644 index 000000000..3282e7b1c --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/skills/diagnose-onprem.md @@ -0,0 +1,11 @@ +You diagnose issues with the on-prem application by calling its API endpoints. +All requests require the header: X-API-Key: bookstore-demo-key-2026 + +Base URL: https://knoll-harvest-naming.ngrok-free.dev + +Endpoints: + - GET /api/health + - GET /api/metrics + - GET /api/logs?last=200 + +Investigate the issue, determine root cause from the data, and report findings. 
\ No newline at end of file diff --git a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/skills/diagnose-onprem.yaml b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/skills/diagnose-onprem.yaml new file mode 100644 index 000000000..f3df08870 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/skills/diagnose-onprem.yaml @@ -0,0 +1,10 @@ +metadata: + name: diagnose-onprem + description: Diagnoses on-prem application issues by reading health, metrics, and + log endpoints. + spec: + tools: + - ExecutePythonCode + - FetchWebpage +skillContent: skills/diagnose-onprem.md +additionalFiles: [] diff --git a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/subagents/onprem-investigator.instructions.md b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/subagents/onprem-investigator.instructions.md new file mode 100644 index 000000000..aa5d6aea0 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/subagents/onprem-investigator.instructions.md @@ -0,0 +1,11 @@ +You are an on-prem application investigator. When a ServiceNow incident is filed +that affects the on-prem application, you use the diagnose-onprem skill to read +the app's health, metrics, and logs, determine root cause, and post findings +back to ServiceNow work notes. + +Workflow: +1. Read the ServiceNow incident description — identify the affected system and symptoms. +2. Use the diagnose-onprem skill to investigate via the on-prem API endpoints. +3. Compile findings: timeline, root cause, evidence, recommended fix. +4. Post findings to ServiceNow work notes on the incident. +5. If you can determine a fix, recommend it. If not, escalate with what you found. 
\ No newline at end of file diff --git a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/subagents/onprem-investigator.yaml b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/subagents/onprem-investigator.yaml new file mode 100644 index 000000000..3eaa1ed09 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/config/subagents/onprem-investigator.yaml @@ -0,0 +1,11 @@ +metadata: + name: onprem-investigator +spec: + instructions: subagents/onprem-investigator.instructions.md + handoffDescription: Investigates on-prem application issues by reading logs, metrics, + and health endpoints. Posts findings to ServiceNow. + handoffs: [] + tools: + - ExecutePythonCode + - FetchWebpage + enableVanillaMode: false diff --git a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/connectors.json b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/connectors.json new file mode 100644 index 000000000..c8fdd4846 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/connectors.json @@ -0,0 +1,32 @@ +{ + "toggles": { + "enableAppInsightsConnector": false, + "appInsightsResourceId": "", + "appInsightsAppId": "", + "enableLogAnalyticsConnector": false, + "lawResourceId": "", + "enableAzureMonitorConnector": false, + "azureMonitorLookbackDays": 7 + }, + "connectors": [ + { + "name": "learnmcp", + "properties": { + "dataConnectorType": "Mcp", + "dataSource": "placeholder", + "extendedProperties": { + "type": "http", + "endpoint": "https://learn.microsoft.com/api/mcp", + "authType": "BearerToken", + "bearerToken": "${LEARNMCP_BEARER_TOKEN}", + "toolsVisibleToMetaAgent": [ + "learnmcp_microsoft_docs_search", + "learnmcp_microsoft_code_sample_search", + "learnmcp_microsoft_docs_fetch" + ] + }, + "identity": "" + } + } + ] +} diff --git 
a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/data/knowledge/.gitkeep b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/data/knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/data/synthesized-knowledge.json b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/data/synthesized-knowledge.json new file mode 100644 index 000000000..0605cac26 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/data/synthesized-knowledge.json @@ -0,0 +1,22 @@ +[ + { + "path": "synthesizedKnowledge/.gitkeep", + "size": 0, + "lastModified": "2026-05-20T06:06:36.2871278+00:00" + }, + { + "path": "synthesizedKnowledge/logs.md", + "size": 2120, + "lastModified": "2026-05-20T06:06:36.3391262+00:00" + }, + { + "path": "synthesizedKnowledge/overview.md", + "size": 2417, + "lastModified": "2026-05-20T06:06:36.3511258+00:00" + }, + { + "path": "synthesizedKnowledge/team.md", + "size": 47, + "lastModified": "2026-05-20T06:06:36.3591255+00:00" + } +] diff --git a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/data/synthesized-knowledge/.gitkeep b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/data/synthesized-knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/data/synthesized-knowledge/logs.md b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/data/synthesized-knowledge/logs.md new file mode 100644 index 000000000..344c51dc6 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/data/synthesized-knowledge/logs.md @@ -0,0 +1,56 @@ +## Log Sources & Monitoring + +### Log Analytics Workspace +- **Name**: `law-bookstore-ixiytoaegn4xu` +- **Workspace ID**: `ceff0903-3ad5-4a5b-9e08-ae84db500098` 
+- **Resource ID**: `/subscriptions/cbf44432-7f45-4906-a85d-d2b14a1e8328/resourceGroups/rg-bookstore-demo/providers/Microsoft.OperationalInsights/workspaces/law-bookstore-ixiytoaegn4xu` +- **Location**: Sweden Central +- **Retention**: 30 days + +### What Flows Where +| Source | Log Type | Destination | +|--------|----------|-------------| +| Container App stdout/stderr | App logs | LAW (via environment link) | +| Container App | AllMetrics | LAW (via diagnostic settings `diag-containerapp`) | +| PostgreSQL | PostgreSQLLogs, Sessions, QueryStoreRuntime, QueryStoreWaitStats, DatabaseXacts | LAW (via `diag-postgresql`) | +| PostgreSQL | AllMetrics | LAW (via `diag-postgresql`) | + +### Health Probes +- **Liveness**: `GET /api/health` on port 8000 — every 30s, restart after 3 failures +- **Startup**: `GET /api/health` on port 8000 — every 10s, fail after 10 attempts (100s max boot) + +### Alerts +| Alert | Metric | Condition | Severity | Window | +|-------|--------|-----------|----------|--------| +| `alert-no-replicas` | Replicas | avg < 1 | Sev 0 (Critical) | 5 min | +| `alert-container-restarts` | RestartCount | total > 3 | Sev 1 (Error) | 5 min | +| `alert-postgres-cpu-high` | cpu_percent | avg > 80% | Sev 1 (Error) | 5 min | + +### Action Group +- **Name**: `ag-bookstore-critical` (short: `BookCrit`) +- **Receivers**: None configured yet — add email/webhook to receive notifications + +### Useful KQL Queries + +**Container App logs (last hour)** +```kql +ContainerAppConsoleLogs_CL +| where TimeGenerated > ago(1h) +| project TimeGenerated, Log_s, RevisionName_s +| order by TimeGenerated desc +``` + +**PostgreSQL errors (last hour)** +```kql +AzureDiagnostics +| where ResourceProvider == "MICROSOFT.DBFORPOSTGRESQL" +| where TimeGenerated > ago(1h) +| where errorLevel_s in ("ERROR", "FATAL") +| project TimeGenerated, errorLevel_s, Message +| order by TimeGenerated desc +``` + +### Not Yet Configured +- Application Insights (no SDK/auto-instrumentation) +- OpenTelemetry +- Custom
dashboards diff --git a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/data/synthesized-knowledge/overview.md b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/data/synthesized-knowledge/overview.md new file mode 100644 index 000000000..3ed43e4ef --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/data/synthesized-knowledge/overview.md @@ -0,0 +1,49 @@ +## Bookstore Application — Hybrid Migration + +Bookstore app originally running on-prem, modernized and deployed to Azure via AppMod. Both versions may run simultaneously during migration. + +### Architecture +| Component | On-Prem | Azure | +|-----------|---------|-------| +| Compute | Docker container | Container Apps (`ca-bookstore-ixiytoaegn4xu`) | +| Database | SQLite (local file) | PostgreSQL Flexible Server (`pg-ixiytoaegn4xu`) | +| Images | — | Container Registry (`crixiytoaegn4xu`) | +| Logs | Structured JSON via API | stdout → Log Analytics (`law-bookstore-ixiytoaegn4xu`) | +| Monitoring | Health/metrics/logs endpoints | Log Analytics + health probes + alerts configured | + +### On-Prem Endpoints +- `/api/health` — DB status, latency, active failure mode +- `/api/metrics` — request/error counts, avg latency, 5-min window +- `/api/logs?last=N` — structured JSON logs +- `/api/books` — catalog listing +- `/api/orders` — order processing + +### On-Prem Log Patterns +- `"level": "error"` + `"event": "db_write_failed"` → DB corruption/lock +- `"event": "search_timeout"` → search dependency slow/down +- `"event": "health_check_failed"` → DB connectivity lost +- High `duration_ms` → resource contention + +### Incident Management +- **ServiceNow** for both on-prem and cloud issues +- Diagnosis flow: read incident → check affected system → diagnose → post findings to work notes + +### Observability (configured 2026-05-13) +- **Log Analytics**: `law-bookstore-ixiytoaegn4xu` (workspace ID: `ceff0903-3ad5-4a5b-9e08-ae84db500098`) +- 
**Health probes**: Liveness (30s) + Startup (10s) on `/api/health` (port 8000) +- **Diagnostic settings**: Container App metrics + PostgreSQL logs/metrics → LAW +- **Alerts**: no-replicas (Sev0), container-restarts (Sev1), postgres-cpu-high (Sev1) +- **Action group**: `ag-bookstore-critical` (needs email/webhook receivers added) +- **Still needed**: Application Insights, zone redundancy, PostgreSQL HA + +### Quick Links +- [Team](team.md) — team members +- [Logs & Monitoring](logs.md) — log sources, queries, alerts +- [Architecture doc](knowledge_app-architecture-md.md) — uploaded by Deepthi + +### Azure Resources (rg-bookstore-demo, Sweden Central) +- Container App: `ca-bookstore-ixiytoaegn4xu` +- Container App Environment: `cae-ixiytoaegn4xu` +- PostgreSQL: `pg-ixiytoaegn4xu` +- Container Registry: `crixiytoaegn4xu` +- Log Analytics: `law-bookstore-ixiytoaegn4xu` diff --git a/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/data/synthesized-knowledge/team.md b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/data/synthesized-knowledge/team.md new file mode 100644 index 000000000..792b48300 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.ZCwHXy/bookstore-agent/data/synthesized-knowledge/team.md @@ -0,0 +1,2 @@ +## Team +- **Deepthi Chelupati** — (role TBD) diff --git a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/.gitignore b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/.gitignore new file mode 100644 index 000000000..f903c363c --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/.gitignore @@ -0,0 +1,5 @@ +# Secrets — never commit +connectors.secrets.env +*.secrets.env +# Generated verification spec +expected-config.json diff --git a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/agent.json b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/agent.json new file mode 100644
index 000000000..683b2856d --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/agent.json @@ -0,0 +1,25 @@ +{ + "_description": "SRE Agent configuration — edit these values to clone to a new environment.", + "_exported_at": "2026-05-20T15:00:35Z", + "identity": { + "agentName": "bookstore-agent", + "resourceGroup": "rg-sre-bookstore", + "subscription": "cbf44432-7f45-4906-a85d-d2b14a1e8328", + "location": "eastus2", + "targetResourceGroups": [ + "rg-bookstore-demo" + ] + }, + "access": { + "accessLevel": "High", + "actionMode": "Review" + }, + "upgradeChannel": "Preview", + "defaultModelProvider": "Anthropic", + "monthlyAgentUnitLimit": 10000, + "tags": {}, + "toggles": { + "enableWebhookBridge": false, + "webhookBridgeTriggerUrl": "" + } +} diff --git a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/automations/incident-filters/snow-p1p2p3.yaml b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/automations/incident-filters/snow-p1p2p3.yaml new file mode 100644 index 000000000..f0927e5d8 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/automations/incident-filters/snow-p1p2p3.yaml @@ -0,0 +1,26 @@ +metadata: + name: snow-p1p2p3 +spec: + incidentPlatform: ServiceNow + impactedService: '' + priorities: + - '1' + - '2' + - '3' + incidentType: '' + alertId: '' + titleContains: '' + titleContainsAll: [] + titleContainsAny: [] + titleNotContains: [] + agentMode: Autonomous + handlingAgent: onprem-investigator + owningTeamId: '' + owningTeamIds: [] + maxAutomatedInvestigationAttempts: 3 + deepInvestigationEnabled: false + mergeEnabled: true + mergeWindowHours: 3 + createdAt: '2026-05-13T17:52:10.2627347Z' + updatedAt: '2026-05-13T17:52:10.2627495Z' + isEnabled: true diff --git a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/automations/incident-platforms/servicenow.yaml 
b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/automations/incident-platforms/servicenow.yaml new file mode 100644 index 000000000..67a86a565 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/automations/incident-platforms/servicenow.yaml @@ -0,0 +1,4 @@ +name: servicenow +spec: + platformType: ServiceNow + connectionKey: ${SERVICENOW_API_KEY} diff --git a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/automations/scheduled-tasks/Bookstore Alert & Health Monitor.yaml b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/automations/scheduled-tasks/Bookstore Alert & Health Monitor.yaml new file mode 100644 index 000000000..7f534298d --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/automations/scheduled-tasks/Bookstore Alert & Health Monitor.yaml @@ -0,0 +1,34 @@ +metadata: + name: Bookstore Alert & Health Monitor +spec: + description: Checks for fired Azure Monitor alerts and overall health of the bookstore + Container App and PostgreSQL every 15 minutes + cronExpression: '*/15 * * * *' + startTime: '2026-05-13T20:54:13.5090808Z' + agentPrompt: "Autonomous Scheduled Run\n\nScope:\n- Subscription: cbf44432-7f45-4906-a85d-d2b14a1e8328\n\ + - Resource Group: rg-bookstore-demo\n- Container App: /subscriptions/cbf44432-7f45-4906-a85d-d2b14a1e8328/resourceGroups/rg-bookstore-demo/providers/Microsoft.App/containerApps/ca-bookstore-ixiytoaegn4xu\n\ + - PostgreSQL: /subscriptions/cbf44432-7f45-4906-a85d-d2b14a1e8328/resourceGroups/rg-bookstore-demo/providers/Microsoft.DBforPostgreSQL/flexibleServers/pg-ixiytoaegn4xu\n\ + - Log Analytics Workspace: law-bookstore-ixiytoaegn4xu (workspace ID: ceff0903-3ad5-4a5b-9e08-ae84db500098)\n\ + \nTime Window: Analyze ONLY last 15 minutes\n\nGoal: Detect and report on any\ + \ fired/active Azure Monitor alerts and check overall application health.\n\n\ + Steps:\n1. 
Check for fired/active metric alerts in rg-bookstore-demo:\n - az\ + \ monitor metrics alert list --resource-group rg-bookstore-demo --subscription\ + \ cbf44432-7f45-4906-a85d-d2b14a1e8328\n - Focus on alerts with currentSeverity\ + \ 0 or 1 that are in \"Fired\" state\n2. Check Container App health:\n - Verify\ + \ revision is running and healthy via az containerapp revision list\n - Check\ + \ replica count > 0\n3. Check PostgreSQL health:\n - Verify server state is\ + \ Ready via az postgres flexible-server show\n4. Optionally check recent container\ + \ logs for errors:\n - Query Log Analytics for error-level logs in last 15 minutes\ + \ if workspace has data\n\nConstraints: Max 10 API calls per execution. Read-only\ + \ operations only, no write operations.\n\nIdempotence: If all alerts are in \"\ + Resolved\" state, Container App is healthy with replicas running, and PostgreSQL\ + \ is Ready -> output \"All systems healthy. No active alerts.\"\n\nOutput Format:\n\ + - Overall Status: Healthy / Degraded / Critical\n- Active Alerts: list any fired\ + \ alerts with name, severity, and description\n- Container App: running status,\ + \ replica count, health state\n- PostgreSQL: state, CPU if available\n- Recommendations:\ + \ 1-3 actions if issues detected\n\nEscalation: If any Sev0 alert is fired, or\ + \ Container App has 0 replicas, or PostgreSQL is not Ready, mark as CRITICAL and\ + \ include \"Immediate attention required\" in output." 
+ createdBy: SREAgent + status: Paused + agentMode: autonomous diff --git a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/common-prompts/safety-rules.md b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/common-prompts/safety-rules.md new file mode 100644 index 000000000..8a28b3bec --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/common-prompts/safety-rules.md @@ -0,0 +1,5 @@ +## Safety rules +- Never delete or stop production resources without explicit human approval. +- Always confirm subscription and resource group before any write operation. +- For observability setup, prefer adding new resources over modifying existing ones. +- Never modify DNS or traffic routing without human approval — traffic cutover is high-risk. \ No newline at end of file diff --git a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/common-prompts/safety-rules.yaml b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/common-prompts/safety-rules.yaml new file mode 100644 index 000000000..1dece0896 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/common-prompts/safety-rules.yaml @@ -0,0 +1,14 @@ +metadata: + name: safety-rules +spec: + prompt: '## Safety rules + + - Never delete or stop production resources without explicit human approval. + + - Always confirm subscription and resource group before any write operation. + + - For observability setup, prefer adding new resources over modifying existing + ones. + + - Never modify DNS or traffic routing without human approval — traffic cutover + is high-risk.' 
diff --git a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/hooks/deny-prod-deletes.yaml b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/hooks/deny-prod-deletes.yaml new file mode 100644 index 000000000..9f3bd5779 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/hooks/deny-prod-deletes.yaml @@ -0,0 +1,12 @@ +name: deny-prod-deletes +type: GlobalHook +tags: [] +properties: + eventType: PreToolUse + activationMode: always + hook: + type: prompt + prompt: If the tool targets a production resource (name contains 'prod' or 'prd'), + deny. + matcher: ^(delete_|remove_).* + timeout: 30 diff --git a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/skills/diagnose-onprem.md b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/skills/diagnose-onprem.md new file mode 100644 index 000000000..3282e7b1c --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/skills/diagnose-onprem.md @@ -0,0 +1,11 @@ +You diagnose issues with the on-prem application by calling its API endpoints. +All requests require the header: X-API-Key: bookstore-demo-key-2026 + +Base URL: https://knoll-harvest-naming.ngrok-free.dev + +Endpoints: + - GET /api/health + - GET /api/metrics + - GET /api/logs?last=200 + +Investigate the issue, determine root cause from the data, and report findings. 
\ No newline at end of file diff --git a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/skills/diagnose-onprem.yaml b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/skills/diagnose-onprem.yaml new file mode 100644 index 000000000..f3df08870 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/skills/diagnose-onprem.yaml @@ -0,0 +1,10 @@ +metadata: + name: diagnose-onprem + description: Diagnoses on-prem application issues by reading health, metrics, and + log endpoints. + spec: + tools: + - ExecutePythonCode + - FetchWebpage +skillContent: skills/diagnose-onprem.md +additionalFiles: [] diff --git a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/subagents/onprem-investigator.instructions.md b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/subagents/onprem-investigator.instructions.md new file mode 100644 index 000000000..aa5d6aea0 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/subagents/onprem-investigator.instructions.md @@ -0,0 +1,11 @@ +You are an on-prem application investigator. When a ServiceNow incident is filed +that affects the on-prem application, you use the diagnose-onprem skill to read +the app's health, metrics, and logs, determine root cause, and post findings +back to ServiceNow work notes. + +Workflow: +1. Read the ServiceNow incident description — identify the affected system and symptoms. +2. Use the diagnose-onprem skill to investigate via the on-prem API endpoints. +3. Compile findings: timeline, root cause, evidence, recommended fix. +4. Post findings to ServiceNow work notes on the incident. +5. If you can determine a fix, recommend it. If not, escalate with what you found. 
\ No newline at end of file diff --git a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/subagents/onprem-investigator.yaml b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/subagents/onprem-investigator.yaml new file mode 100644 index 000000000..3eaa1ed09 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/config/subagents/onprem-investigator.yaml @@ -0,0 +1,11 @@ +metadata: + name: onprem-investigator +spec: + instructions: subagents/onprem-investigator.instructions.md + handoffDescription: Investigates on-prem application issues by reading logs, metrics, + and health endpoints. Posts findings to ServiceNow. + handoffs: [] + tools: + - ExecutePythonCode + - FetchWebpage + enableVanillaMode: false diff --git a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/connectors.json b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/connectors.json new file mode 100644 index 000000000..c8fdd4846 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/connectors.json @@ -0,0 +1,32 @@ +{ + "toggles": { + "enableAppInsightsConnector": false, + "appInsightsResourceId": "", + "appInsightsAppId": "", + "enableLogAnalyticsConnector": false, + "lawResourceId": "", + "enableAzureMonitorConnector": false, + "azureMonitorLookbackDays": 7 + }, + "connectors": [ + { + "name": "learnmcp", + "properties": { + "dataConnectorType": "Mcp", + "dataSource": "placeholder", + "extendedProperties": { + "type": "http", + "endpoint": "https://learn.microsoft.com/api/mcp", + "authType": "BearerToken", + "bearerToken": "${LEARNMCP_BEARER_TOKEN}", + "toolsVisibleToMetaAgent": [ + "learnmcp_microsoft_docs_search", + "learnmcp_microsoft_code_sample_search", + "learnmcp_microsoft_docs_fetch" + ] + }, + "identity": "" + } + } + ] +} diff --git 
a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/data/knowledge/.gitkeep b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/data/knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/data/synthesized-knowledge.json b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/data/synthesized-knowledge.json new file mode 100644 index 000000000..0605cac26 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/data/synthesized-knowledge.json @@ -0,0 +1,22 @@ +[ + { + "path": "synthesizedKnowledge/.gitkeep", + "size": 0, + "lastModified": "2026-05-20T06:06:36.2871278+00:00" + }, + { + "path": "synthesizedKnowledge/logs.md", + "size": 2120, + "lastModified": "2026-05-20T06:06:36.3391262+00:00" + }, + { + "path": "synthesizedKnowledge/overview.md", + "size": 2417, + "lastModified": "2026-05-20T06:06:36.3511258+00:00" + }, + { + "path": "synthesizedKnowledge/team.md", + "size": 47, + "lastModified": "2026-05-20T06:06:36.3591255+00:00" + } +] diff --git a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/data/synthesized-knowledge/.gitkeep b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/data/synthesized-knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/data/synthesized-knowledge/logs.md b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/data/synthesized-knowledge/logs.md new file mode 100644 index 000000000..344c51dc6 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/data/synthesized-knowledge/logs.md @@ -0,0 +1,56 @@ +## Log Sources & Monitoring + +### Log Analytics Workspace +- **Name**: `law-bookstore-ixiytoaegn4xu` +- **Workspace ID**: `ceff0903-3ad5-4a5b-9e08-ae84db500098` 
+- **Resource ID**: `/subscriptions/cbf44432-7f45-4906-a85d-d2b14a1e8328/resourceGroups/rg-bookstore-demo/providers/Microsoft.OperationalInsights/workspaces/law-bookstore-ixiytoaegn4xu` +- **Location**: Sweden Central +- **Retention**: 30 days + +### What Flows Where +| Source | Log Type | Destination | +|--------|----------|-------------| +| Container App stdout/stderr | App logs | LAW (via environment link) | +| Container App | AllMetrics | LAW (via diagnostic settings `diag-containerapp`) | +| PostgreSQL | PostgreSQLLogs, Sessions, QueryStoreRuntime, QueryStoreWaitStats, DatabaseXacts | LAW (via `diag-postgresql`) | +| PostgreSQL | AllMetrics | LAW (via `diag-postgresql`) | + +### Health Probes +- **Liveness**: `GET /api/health` (port 8000) — every 30s, restart after 3 failures +- **Startup**: `GET /api/health` (port 8000) — every 10s, fail after 10 attempts (100s max boot) + +### Alerts +| Alert | Metric | Condition | Severity | Window | +|-------|--------|-----------|----------|--------| +| `alert-no-replicas` | Replicas | avg < 1 | Sev 0 (Critical) | 5 min | +| `alert-container-restarts` | RestartCount | total > 3 | Sev 1 (Error) | 5 min | +| `alert-postgres-cpu-high` | cpu_percent | avg > 80% | Sev 1 (Error) | 5 min | + +### Action Group +- **Name**: `ag-bookstore-critical` (short: `BookCrit`) +- **Receivers**: None configured yet — add email/webhook to receive notifications + +### Useful KQL Queries + +**Container App logs (last hour)** +```kql +ContainerAppConsoleLogs_CL +| where TimeGenerated > ago(1h) +| project TimeGenerated, Log_s, RevisionName_s +| order by TimeGenerated desc +``` + +**PostgreSQL errors (last hour)** +```kql +AzureDiagnostics +| where ResourceProvider == "MICROSOFT.DBFORPOSTGRESQL" +| where TimeGenerated > ago(1h) +| where errorLevel_s in ("ERROR", "FATAL") +| project TimeGenerated, errorLevel_s, Message +| order by TimeGenerated desc +``` + +### Not Yet Configured +- Application Insights (no SDK/auto-instrumentation) +- OpenTelemetry +- Custom
dashboards diff --git a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/data/synthesized-knowledge/overview.md b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/data/synthesized-knowledge/overview.md new file mode 100644 index 000000000..3ed43e4ef --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/data/synthesized-knowledge/overview.md @@ -0,0 +1,49 @@ +## Bookstore Application — Hybrid Migration + +Bookstore app originally running on-prem, modernized and deployed to Azure via AppMod. Both versions may run simultaneously during migration. + +### Architecture +| Component | On-Prem | Azure | +|-----------|---------|-------| +| Compute | Docker container | Container Apps (`ca-bookstore-ixiytoaegn4xu`) | +| Database | SQLite (local file) | PostgreSQL Flexible Server (`pg-ixiytoaegn4xu`) | +| Images | — | Container Registry (`crixiytoaegn4xu`) | +| Logs | Structured JSON via API | stdout → Log Analytics (`law-bookstore-ixiytoaegn4xu`) | +| Monitoring | Health/metrics/logs endpoints | Log Analytics + health probes + alerts configured | + +### On-Prem Endpoints +- `/api/health` — DB status, latency, active failure mode +- `/api/metrics` — request/error counts, avg latency, 5-min window +- `/api/logs?last=N` — structured JSON logs +- `/api/books` — catalog listing +- `/api/orders` — order processing + +### On-Prem Log Patterns +- `"level": "error"` + `"event": "db_write_failed"` → DB corruption/lock +- `"event": "search_timeout"` → search dependency slow/down +- `"event": "health_check_failed"` → DB connectivity lost +- High `duration_ms` → resource contention + +### Incident Management +- **ServiceNow** for both on-prem and cloud issues +- Diagnosis flow: read incident → check affected system → diagnose → post findings to work notes + +### Observability (configured 2026-05-13) +- **Log Analytics**: `law-bookstore-ixiytoaegn4xu` (workspace ID: `ceff0903-3ad5-4a5b-9e08-ae84db500098`) +- 
**Health probes**: Liveness (30s) + Startup (10s) on `/api/health` (port 8000) +- **Diagnostic settings**: Container App metrics + PostgreSQL logs/metrics → LAW +- **Alerts**: no-replicas (Sev0), container-restarts (Sev1), postgres-cpu-high (Sev1) +- **Action group**: `ag-bookstore-critical` (needs email/webhook receivers added) +- **Still needed**: Application Insights, zone redundancy, PostgreSQL HA + +### Quick Links +- [Team](team.md) — team members +- [Logs & Monitoring](logs.md) — log sources, queries, alerts +- [Architecture doc](knowledge_app-architecture-md.md) — uploaded by Deepthi + +### Azure Resources (rg-bookstore-demo, Sweden Central) +- Container App: `ca-bookstore-ixiytoaegn4xu` +- Container App Environment: `cae-ixiytoaegn4xu` +- PostgreSQL: `pg-ixiytoaegn4xu` +- Container Registry: `crixiytoaegn4xu` +- Log Analytics: `law-bookstore-ixiytoaegn4xu` diff --git a/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/data/synthesized-knowledge/team.md b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/data/synthesized-knowledge/team.md new file mode 100644 index 000000000..792b48300 --- /dev/null +++ b/sreagent-templates/bookstore-agent-clone-export.gxlqqp/bookstore-agent/data/synthesized-knowledge/team.md @@ -0,0 +1,2 @@ +## Team +- **Deepthi Chelupati** — (role TBD) diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/.gitignore b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/.gitignore new file mode 100644 index 000000000..f903c363c --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/.gitignore @@ -0,0 +1,5 @@ +# Secrets — never commit +connectors.secrets.env +*.secrets.env +# Generated verification spec +expected-config.json diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/agent.json
b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/agent.json new file mode 100644 index 000000000..7eeb6e4b8 --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/agent.json @@ -0,0 +1,27 @@ +{ + "_description": "SRE Agent configuration — edit these values to clone to a new environment.", + "_exported_at": "2026-05-26T23:59:45Z", + "identity": { + "agentName": "deployment-guard-lab", + "resourceGroup": "rg-deployment-guard-lab", + "subscription": "cbf44432-7f45-4906-a85d-d2b14a1e8328", + "location": "swedencentral", + "targetResourceGroups": [ + "rg-contoso-prod", + "rg-contoso-staging", + "rg-contoso-swe" + ] + }, + "access": { + "accessLevel": "High", + "actionMode": "Review" + }, + "upgradeChannel": "Preview", + "defaultModelProvider": "Anthropic", + "monthlyAgentUnitLimit": 10000, + "tags": {}, + "toggles": { + "enableWebhookBridge": true, + "webhookBridgeTriggerUrl": "" + } +} diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/automations/http-triggers/pr-deployment-guard.yaml b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/automations/http-triggers/pr-deployment-guard.yaml new file mode 100644 index 000000000..3ed61699f --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/automations/http-triggers/pr-deployment-guard.yaml @@ -0,0 +1,7 @@ +name: pr-deployment-guard +spec: + description: Receives PR webhooks from GitHub and triggers the deployment guard + to analyze changes for production safety. 
+ prompt: '' + handlingAgent: '' + agentMode: autonomous diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/common-prompts/investigation-guidelines.md b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/common-prompts/investigation-guidelines.md new file mode 100644 index 000000000..439ccc47e --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/common-prompts/investigation-guidelines.md @@ -0,0 +1,7 @@ +## Investigation guidelines + +- Always check the last 3 deployments for correlation +- Include timestamp, affected resource, and severity in all summaries +- Never take destructive actions without explicit approval +- Prefer read-only investigation before recommending changes +- Always provide an impact assessment (users affected, blast radius) \ No newline at end of file diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/common-prompts/investigation-guidelines.yaml b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/common-prompts/investigation-guidelines.yaml new file mode 100644 index 000000000..d7c1b4b8d --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/common-prompts/investigation-guidelines.yaml @@ -0,0 +1,15 @@ +metadata: + name: investigation-guidelines +spec: + prompt: '## Investigation guidelines + + + - Always check the last 3 deployments for correlation + + - Include timestamp, affected resource, and severity in all summaries + + - Never take destructive actions without explicit approval + + - Prefer read-only investigation before recommending changes + + - Always provide an impact assessment (users affected, blast radius)' diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/common-prompts/safety-rules.md 
b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/common-prompts/safety-rules.md new file mode 100644 index 000000000..efdb1dec9 --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/common-prompts/safety-rules.md @@ -0,0 +1,7 @@ +## Safety rules + +- Never delete resources in production without explicit approval +- Always prefer read-only investigation before taking action +- Escalate to human if confidence is below 80% +- Do not modify network security groups or firewall rules +- Do not access or display secrets, keys, or connection strings \ No newline at end of file diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/common-prompts/safety-rules.yaml b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/common-prompts/safety-rules.yaml new file mode 100644 index 000000000..efa6dd631 --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/common-prompts/safety-rules.yaml @@ -0,0 +1,15 @@ +metadata: + name: safety-rules +spec: + prompt: '## Safety rules + + + - Never delete resources in production without explicit approval + + - Always prefer read-only investigation before taking action + + - Escalate to human if confidence is below 80% + + - Do not modify network security groups or firewall rules + + - Do not access or display secrets, keys, or connection strings' diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/hooks/deny-prod-deletes.yaml b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/hooks/deny-prod-deletes.yaml new file mode 100644 index 000000000..54c43c90a --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/hooks/deny-prod-deletes.yaml @@ -0,0 +1,12 @@ +name: deny-prod-deletes 
+type: GlobalHook +tags: [] +properties: + eventType: PreToolUse + activationMode: always + hook: + type: prompt + prompt: If the tool targets a production resource (name contains 'prod' or 'prd'), + deny the action. Otherwise allow. + matcher: ^(delete_|remove_).* + timeout: 30 diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/hooks/require-approval-for-restarts.yaml b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/hooks/require-approval-for-restarts.yaml new file mode 100644 index 000000000..3e18543da --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/hooks/require-approval-for-restarts.yaml @@ -0,0 +1,12 @@ +name: require-approval-for-restarts +type: GlobalHook +tags: [] +properties: + eventType: PreToolUse + activationMode: always + hook: + type: prompt + prompt: If this action will restart or scale a resource, require human approval + before proceeding. + matcher: ^(restart_|scale_).* + timeout: 30 diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/skills/deployment-guard-analysis.md b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/skills/deployment-guard-analysis.md new file mode 100644 index 000000000..6e574c581 --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/skills/deployment-guard-analysis.md @@ -0,0 +1,21 @@ +You are a deployment guard. When triggered by a PR webhook, you assess if the change is safe for production. + +Step 1: Read the PR diff from the connected GitHub repo. Identify what changed — app code, IaC, config, DB schema, dependencies. + +Step 2: Static analysis — check for breaking changes: API contract changes, removed endpoints, changed DB schemas, renamed env vars, missing error handling. + +Step 3: Capture production baseline. 
Use Dynatrace DQL to query current error rates, latency p50/p95/p99, throughput. Use az CLI to check ContainerAppConsoleLogs_CL in LAW. Also capture baseline API responses by sending test requests to production endpoints and recording the response structure, status codes, and key data fields. + +Step 4: Deploy the PR changes to the STAGING environment using az containerapp update. This is a separate environment from production — deploy the new image there. + +Step 5: Send synthetic test traffic to the staging services to exercise the code paths affected by the PR. Use ExecutePythonCode to send HTTP requests to the staging endpoints (e.g. GET /orders, POST /orders, GET /health) for 2-3 minutes. This is canary testing — you need real traffic to surface regressions like timeouts, 500s, or latency spikes. + +Step 6: Validate response correctness — compare staging API responses against the production baseline captured in Step 3. Look for any differences in response bodies, status codes, data fields, or behavior. The app may return 200 OK but serve degraded or incorrect data. + +Step 7: Monitor staging health for 5 minutes. Query Dynatrace and LAW for the staging services. Compare all metrics and response patterns against the production baseline. Use PlotAreaChartWithCorrelation to visualize. + +Step 8: Risk assessment — LOW (no functional or performance changes), MEDIUM (minor changes), HIGH (behavioral or performance regression detected), CRITICAL (staging failing or data integrity compromised). + +Step 9: Post a structured PR comment with: risk level, changes analyzed, static analysis findings, canary test results, any behavioral regressions found, health comparison table (prod baseline vs staging), and recommendation. + +Tools to use: RunAzCliReadCommands, RunAzCliWriteCommands, ExecutePythonCode, PlotAreaChartWithCorrelation, PlotBarChart, CreateGithubIssue, FindConnectedGitHubRepo, and all dynatrace MCP tools. 
\ No newline at end of file diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/skills/deployment-guard-analysis.yaml b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/skills/deployment-guard-analysis.yaml new file mode 100644 index 000000000..1eef7bac0 --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/skills/deployment-guard-analysis.yaml @@ -0,0 +1,9 @@ +metadata: + name: deployment-guard-analysis + description: Deployment guard that assesses PR safety for production by analyzing + diffs, capturing baselines, deploying to staging, running canary tests, validating + response correctness, and comparing health metrics. + spec: + tools: [] +skillContent: skills/deployment-guard-analysis.md +additionalFiles: [] diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/skills/investigate-app-errors.md b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/skills/investigate-app-errors.md new file mode 100644 index 000000000..6d2b7b6db --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/skills/investigate-app-errors.md @@ -0,0 +1,20 @@ +You are an application error investigator. When errors are reported, follow this workflow: + +1. **Identify the error**: Get the error details — HTTP status codes, exception types, affected endpoints, timestamps. + +2. **Check recent deployments**: Use az CLI to list recent Container App revisions or deployments. Correlate error start time with deployment timestamps. + +3. **Query Dynatrace**: Use DQL to query error rates, response times, and throughput for the affected services. Look for anomalies that started around the same time. + +4. 
**Query Log Analytics**: Check ContainerAppConsoleLogs_CL and ContainerAppSystemLogs_CL for exceptions, crash loops, or OOM kills. + +5. **Check dependencies**: Query Dynatrace for dependency health — databases, external APIs, message queues. An upstream failure may be the root cause. + +6. **Correlate findings**: Build a timeline of events — deployment, config change, traffic spike, dependency failure — and identify the most likely root cause. + +7. **Recommend fix**: Provide actionable recommendations — rollback, config change, scaling, or code fix with the specific file/line if the GitHub repo is connected. + +Always include: +- Impact assessment (users affected, error rate, duration) +- Root cause confidence level +- Recommended action with rollback option \ No newline at end of file diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/skills/investigate-app-errors.yaml b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/skills/investigate-app-errors.yaml new file mode 100644 index 000000000..669dddcb8 --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/skills/investigate-app-errors.yaml @@ -0,0 +1,16 @@ +metadata: + name: investigate-app-errors + description: Investigate application errors using Dynatrace DQL and Log Analytics + to correlate errors with deployments, infrastructure changes, and dependencies. 
+ spec: + tools: + - RunAzCliReadCommands + - QueryLogAnalyticsByWorkspaceId + - dynatrace_execute-dql + - dynatrace_create-dql + - dynatrace_query-problems + - dynatrace_get-entity-id + - ExecutePythonCode + - PlotAreaChartWithCorrelation +skillContent: skills/investigate-app-errors.md +additionalFiles: [] diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/subagents/deployment-guard.instructions.md b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/subagents/deployment-guard.instructions.md new file mode 100644 index 000000000..6906f6466 --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/subagents/deployment-guard.instructions.md @@ -0,0 +1 @@ +You are an expert engineer who guards production deployments, and you operate in autonomous mode. Use the deployment-guard-analysis skill to assess PRs for production safety. Follow the full 9-step workflow: analyze the PR diff, perform static analysis, capture production baselines from Dynatrace and LAW, deploy to staging, send synthetic canary traffic, validate response correctness, monitor staging health, assess risk, and post a structured PR comment with your findings.
\ No newline at end of file diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/subagents/deployment-guard.yaml b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/subagents/deployment-guard.yaml new file mode 100644 index 000000000..99ead646b --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/subagents/deployment-guard.yaml @@ -0,0 +1,31 @@ +metadata: + name: deployment-guard +spec: + instructions: subagents/deployment-guard.instructions.md + handoffDescription: Analyzes PRs by deploying to staging, comparing health against + production via Dynatrace + LAW, and posting risk assessment as a PR comment + handoffs: [] + tools: + - RunAzCliReadCommands + - RunAzCliWriteCommands + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - CreateGithubIssue + - FindConnectedGitHubRepo + - QueryLogAnalyticsByWorkspaceId + - dynatrace_adaptive-anomaly-detector + - dynatrace_create-dql + - dynatrace_execute-dql + - dynatrace_explain-dql + - dynatrace_get-entity-id + - dynatrace_get-entity-name + - dynatrace_query-problems + - dynatrace_seasonal-baseline-anomaly-detector + - dynatrace_static-threshold-analyzer + - dynatrace_timeseries-forecast + - dynatrace_timeseries-novelty-detection + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/subagents/error-investigator.instructions.md b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/subagents/error-investigator.instructions.md new file mode 100644 index 000000000..cb2eeed54 --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/subagents/error-investigator.instructions.md @@ -0,0 +1 @@ +You are an application error investigator. 
When errors are reported, use the investigate-app-errors skill to systematically diagnose the issue. Correlate Dynatrace metrics with Log Analytics data and deployment history. Always provide impact assessment and actionable recommendations with rollback options. \ No newline at end of file diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/subagents/error-investigator.yaml b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/subagents/error-investigator.yaml new file mode 100644 index 000000000..61fb2f3a4 --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/config/subagents/error-investigator.yaml @@ -0,0 +1,23 @@ +metadata: + name: error-investigator +spec: + instructions: subagents/error-investigator.instructions.md + handoffDescription: Investigates application errors by correlating Dynatrace metrics, + LAW logs, and deployment history to identify root cause + handoffs: [] + tools: + - RunAzCliReadCommands + - QueryLogAnalyticsByWorkspaceId + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - FindConnectedGitHubRepo + - dynatrace_execute-dql + - dynatrace_create-dql + - dynatrace_query-problems + - dynatrace_get-entity-id + - dynatrace_get-entity-name + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/connectors.json b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/connectors.json new file mode 100644 index 000000000..93f104448 --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/connectors.json @@ -0,0 +1,28 @@ +{ + "toggles": { + "enableAppInsightsConnector": false, + "appInsightsResourceId": "", + "appInsightsAppId": "", + "enableLogAnalyticsConnector": true, + "lawResourceId": 
"/subscriptions/cbf44432-7f45-4906-a85d-d2b14a1e8328/resourceGroups/rg-contoso-swe/providers/Microsoft.OperationalInsights/workspaces/law-7defkiyvn3r44", + "enableAzureMonitorConnector": false, + "azureMonitorLookbackDays": 7 + }, + "connectors": [ + { + "name": "dynatrace", + "properties": { + "dataConnectorType": "Mcp", + "dataSource": "placeholder", + "extendedProperties": { + "type": "http", + "endpoint": "https://dhu66396.apps.dynatrace.com/platform-reserved/mcp-gateway/v0.1/servers/dynatrace-mcp/mcp", + "authType": "BearerToken", + "bearerToken": "${DYNATRACE_BEARER_TOKEN}", + "partnerType": "DynatraceMcp" + }, + "identity": "system" + } + } + ] +} diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/data/knowledge/.gitkeep b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/data/knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/data/synthesized-knowledge.json b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/data/synthesized-knowledge.json new file mode 100644 index 000000000..5fa61cb9c --- /dev/null +++ b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/data/synthesized-knowledge.json @@ -0,0 +1,7 @@ +[ + { + "path": "synthesizedKnowledge/.gitkeep", + "size": 0, + "lastModified": "2026-05-26T23:47:08.2654997+00:00" + } +] diff --git a/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/data/synthesized-knowledge/.gitkeep b/sreagent-templates/deployment-guard-lab-clone-export.7CQglU/deployment-guard-lab/data/synthesized-knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/.gitignore b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/.gitignore new file mode 100644 index 
000000000..f903c363c --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/.gitignore @@ -0,0 +1,5 @@ +# Secrets — never commit +connectors.secrets.env +*.secrets.env +# Generated verification spec +expected-config.json diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/agent.json b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/agent.json new file mode 100644 index 000000000..40624caf5 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/agent.json @@ -0,0 +1,26 @@ +{ + "_description": "SRE Agent configuration — edit these values to clone to a new environment.", + "_exported_at": "2026-05-27T02:16:28Z", + "identity": { + "agentName": "dg-azd-bash", + "resourceGroup": "rg-dg-azd-bash", + "subscription": "cbf44432-7f45-4906-a85d-d2b14a1e8328", + "location": "swedencentral", + "targetResourceGroups": [ + "rg-contoso-prod", + "rg-contoso-staging" + ] + }, + "access": { + "accessLevel": "High", + "actionMode": "Review" + }, + "upgradeChannel": "Preview", + "defaultModelProvider": "Anthropic", + "monthlyAgentUnitLimit": 10000, + "tags": {}, + "toggles": { + "enableWebhookBridge": true, + "webhookBridgeTriggerUrl": "" + } +} diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/automations/http-triggers/pr-deployment-guard.yaml b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/automations/http-triggers/pr-deployment-guard.yaml new file mode 100644 index 000000000..3ed61699f --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/automations/http-triggers/pr-deployment-guard.yaml @@ -0,0 +1,7 @@ +name: pr-deployment-guard +spec: + description: Receives PR webhooks from GitHub and triggers the deployment guard + to analyze changes for production safety. 
+ prompt: '' + handlingAgent: '' + agentMode: autonomous diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/common-prompts/investigation-guidelines.md b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/common-prompts/investigation-guidelines.md new file mode 100644 index 000000000..439ccc47e --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/common-prompts/investigation-guidelines.md @@ -0,0 +1,7 @@ +## Investigation guidelines + +- Always check the last 3 deployments for correlation +- Include timestamp, affected resource, and severity in all summaries +- Never take destructive actions without explicit approval +- Prefer read-only investigation before recommending changes +- Always provide an impact assessment (users affected, blast radius) \ No newline at end of file diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/common-prompts/investigation-guidelines.yaml b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/common-prompts/investigation-guidelines.yaml new file mode 100644 index 000000000..d7c1b4b8d --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/common-prompts/investigation-guidelines.yaml @@ -0,0 +1,15 @@ +metadata: + name: investigation-guidelines +spec: + prompt: '## Investigation guidelines + + + - Always check the last 3 deployments for correlation + + - Include timestamp, affected resource, and severity in all summaries + + - Never take destructive actions without explicit approval + + - Prefer read-only investigation before recommending changes + + - Always provide an impact assessment (users affected, blast radius)' diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/common-prompts/safety-rules.md b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/common-prompts/safety-rules.md new file mode 100644 index 
000000000..efdb1dec9 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/common-prompts/safety-rules.md @@ -0,0 +1,7 @@ +## Safety rules + +- Never delete resources in production without explicit approval +- Always prefer read-only investigation before taking action +- Escalate to human if confidence is below 80% +- Do not modify network security groups or firewall rules +- Do not access or display secrets, keys, or connection strings \ No newline at end of file diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/common-prompts/safety-rules.yaml b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/common-prompts/safety-rules.yaml new file mode 100644 index 000000000..efa6dd631 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/common-prompts/safety-rules.yaml @@ -0,0 +1,15 @@ +metadata: + name: safety-rules +spec: + prompt: '## Safety rules + + + - Never delete resources in production without explicit approval + + - Always prefer read-only investigation before taking action + + - Escalate to human if confidence is below 80% + + - Do not modify network security groups or firewall rules + + - Do not access or display secrets, keys, or connection strings' diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/hooks/deny-prod-deletes.yaml b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/hooks/deny-prod-deletes.yaml new file mode 100644 index 000000000..54c43c90a --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/hooks/deny-prod-deletes.yaml @@ -0,0 +1,12 @@ +name: deny-prod-deletes +type: GlobalHook +tags: [] +properties: + eventType: PreToolUse + activationMode: always + hook: + type: prompt + prompt: If the tool targets a production resource (name contains 'prod' or 'prd'), + deny the action. Otherwise allow. 
+ matcher: ^(delete_|remove_).* + timeout: 30 diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/hooks/require-approval-for-restarts.yaml b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/hooks/require-approval-for-restarts.yaml new file mode 100644 index 000000000..3e18543da --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/hooks/require-approval-for-restarts.yaml @@ -0,0 +1,12 @@ +name: require-approval-for-restarts +type: GlobalHook +tags: [] +properties: + eventType: PreToolUse + activationMode: always + hook: + type: prompt + prompt: If this action will restart or scale a resource, require human approval + before proceeding. + matcher: ^(restart_|scale_).* + timeout: 30 diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/skills/deployment-guard-analysis.md b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/skills/deployment-guard-analysis.md new file mode 100644 index 000000000..a0fc912c4 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/skills/deployment-guard-analysis.md @@ -0,0 +1,23 @@ +You are a deployment guard. When triggered by a PR webhook, you assess if the change is safe for production. + +Step 1: Read the PR diff from the connected GitHub repo. Identify what changed — app code, IaC, config, DB schema, dependencies. + +Step 2: Static analysis — check for breaking changes: API contract changes, removed endpoints, changed DB schemas, renamed env vars, missing error handling. + +Step 3: Capture production baseline. Use Dynatrace DQL to query current error rates, latency p50/p95/p99, throughput. Use az CLI to check ContainerAppConsoleLogs_CL in LAW. Also capture baseline API responses by sending test requests to production endpoints and recording the response structure, status codes, and key data fields. 
+ +Step 4: Deploy the PR changes to the STAGING environment using az containerapp update. This is a separate environment from production — deploy the new image there. + +Step 5: Send synthetic test traffic to the staging services to exercise the code paths affected by the PR. Use ExecutePythonCode to send HTTP requests to the staging endpoints (e.g. GET /orders, POST /orders, GET /health) for 2-3 minutes. This is canary testing — you need real traffic to surface regressions like timeouts, 500s, or latency spikes. + +Step 6: Validate response correctness — compare staging API responses against the production baseline captured in Step 3. Look for any differences in response bodies, status codes, data fields, or behavior. The app may return 200 OK but serve degraded or incorrect data. + +Step 7: Monitor staging health for 5 minutes. Query Dynatrace and LAW for the staging services. Compare all metrics and response patterns against the production baseline. Use PlotAreaChartWithCorrelation to visualize. + +Step 8: Risk assessment — LOW (no functional or performance changes), MEDIUM (minor changes), HIGH (behavioral or performance regression detected), CRITICAL (staging failing or data integrity compromised). + +Step 9: Post a structured PR comment with: risk level, changes analyzed, static analysis findings, canary test results, any behavioral regressions found, health comparison table (prod baseline vs staging), and recommendation. + +Tools to use: RunAzCliReadCommands, RunAzCliWriteCommands, ExecutePythonCode, PlotAreaChartWithCorrelation, PlotBarChart, CreateGithubIssue, FindConnectedGitHubRepo, QueryLogAnalyticsByWorkspaceId, and all Dynatrace MCP tools. 
+ +# Updated by e2e test \ No newline at end of file diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/skills/deployment-guard-analysis.yaml b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/skills/deployment-guard-analysis.yaml new file mode 100644 index 000000000..a4a551772 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/skills/deployment-guard-analysis.yaml @@ -0,0 +1,17 @@ +metadata: + name: deployment-guard-analysis + description: Deployment guard that assesses PR safety for production by analyzing + diffs, capturing baselines, deploying to staging, running canary tests, validating + response correctness, and comparing health metrics. + spec: + tools: + - RunAzCliReadCommands + - RunAzCliWriteCommands + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - CreateGithubIssue + - FindConnectedGitHubRepo + - QueryLogAnalyticsByWorkspaceId +skillContent: skills/deployment-guard-analysis.md +additionalFiles: [] diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/skills/investigate-app-errors.md b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/skills/investigate-app-errors.md new file mode 100644 index 000000000..6d2b7b6db --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/skills/investigate-app-errors.md @@ -0,0 +1,20 @@ +You are an application error investigator. When errors are reported, follow this workflow: + +1. **Identify the error**: Get the error details — HTTP status codes, exception types, affected endpoints, timestamps. + +2. **Check recent deployments**: Use az CLI to list recent Container App revisions or deployments. Correlate error start time with deployment timestamps. + +3. **Query Dynatrace**: Use DQL to query error rates, response times, and throughput for the affected services. Look for anomalies that started around the same time. + +4. 
**Query Log Analytics**: Check ContainerAppConsoleLogs_CL and ContainerAppSystemLogs_CL for exceptions, crash loops, or OOM kills. + +5. **Check dependencies**: Query Dynatrace for dependency health — databases, external APIs, message queues. An upstream failure may be the root cause. + +6. **Correlate findings**: Build a timeline of events — deployment, config change, traffic spike, dependency failure — and identify the most likely root cause. + +7. **Recommend fix**: Provide actionable recommendations — rollback, config change, scaling, or code fix with the specific file/line if the GitHub repo is connected. + +Always include: +- Impact assessment (users affected, error rate, duration) +- Root cause confidence level +- Recommended action with rollback option \ No newline at end of file diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/skills/investigate-app-errors.yaml b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/skills/investigate-app-errors.yaml new file mode 100644 index 000000000..669dddcb8 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/skills/investigate-app-errors.yaml @@ -0,0 +1,16 @@ +metadata: + name: investigate-app-errors + description: Investigate application errors using Dynatrace DQL and Log Analytics + to correlate errors with deployments, infrastructure changes, and dependencies. 
+ spec: + tools: + - RunAzCliReadCommands + - QueryLogAnalyticsByWorkspaceId + - dynatrace_execute-dql + - dynatrace_create-dql + - dynatrace_query-problems + - dynatrace_get-entity-id + - ExecutePythonCode + - PlotAreaChartWithCorrelation +skillContent: skills/investigate-app-errors.md +additionalFiles: [] diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/subagents/deployment-guard.instructions.md b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/subagents/deployment-guard.instructions.md new file mode 100644 index 000000000..6906f6466 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/subagents/deployment-guard.instructions.md @@ -0,0 +1 @@ +You are an expert engineer who guards production deployments, and you operate in autonomous mode. Use the deployment-guard-analysis skill to assess PRs for production safety. Follow the full 9-step workflow: analyze the PR diff, perform static analysis, capture production baselines from Dynatrace and LAW, deploy to staging, send synthetic canary traffic, validate response correctness, monitor staging health, assess risk, and post a structured PR comment with your findings. 
\ No newline at end of file diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/subagents/deployment-guard.yaml b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/subagents/deployment-guard.yaml new file mode 100644 index 000000000..99ead646b --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/subagents/deployment-guard.yaml @@ -0,0 +1,31 @@ +metadata: + name: deployment-guard +spec: + instructions: subagents/deployment-guard.instructions.md + handoffDescription: Analyzes PRs by deploying to staging, comparing health against + production via Dynatrace + LAW, and posting risk assessment as a PR comment + handoffs: [] + tools: + - RunAzCliReadCommands + - RunAzCliWriteCommands + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - CreateGithubIssue + - FindConnectedGitHubRepo + - QueryLogAnalyticsByWorkspaceId + - dynatrace_adaptive-anomaly-detector + - dynatrace_create-dql + - dynatrace_execute-dql + - dynatrace_explain-dql + - dynatrace_get-entity-id + - dynatrace_get-entity-name + - dynatrace_query-problems + - dynatrace_seasonal-baseline-anomaly-detector + - dynatrace_static-threshold-analyzer + - dynatrace_timeseries-forecast + - dynatrace_timeseries-novelty-detection + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/subagents/error-investigator.instructions.md b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/subagents/error-investigator.instructions.md new file mode 100644 index 000000000..cb2eeed54 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/subagents/error-investigator.instructions.md @@ -0,0 +1 @@ +You are an application error investigator. When errors are reported, use the investigate-app-errors skill to systematically diagnose the issue. 
Correlate Dynatrace metrics with Log Analytics data and deployment history. Always provide impact assessment and actionable recommendations with rollback options. \ No newline at end of file diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/subagents/error-investigator.yaml b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/subagents/error-investigator.yaml new file mode 100644 index 000000000..61fb2f3a4 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/config/subagents/error-investigator.yaml @@ -0,0 +1,23 @@ +metadata: + name: error-investigator +spec: + instructions: subagents/error-investigator.instructions.md + handoffDescription: Investigates application errors by correlating Dynatrace metrics, + LAW logs, and deployment history to identify root cause + handoffs: [] + tools: + - RunAzCliReadCommands + - QueryLogAnalyticsByWorkspaceId + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - FindConnectedGitHubRepo + - dynatrace_execute-dql + - dynatrace_create-dql + - dynatrace_query-problems + - dynatrace_get-entity-id + - dynatrace_get-entity-name + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/connectors.json b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/connectors.json new file mode 100644 index 000000000..93f104448 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/connectors.json @@ -0,0 +1,28 @@ +{ + "toggles": { + "enableAppInsightsConnector": false, + "appInsightsResourceId": "", + "appInsightsAppId": "", + "enableLogAnalyticsConnector": true, + "lawResourceId": "/subscriptions/cbf44432-7f45-4906-a85d-d2b14a1e8328/resourceGroups/rg-contoso-swe/providers/Microsoft.OperationalInsights/workspaces/law-7defkiyvn3r44", + "enableAzureMonitorConnector": false, + 
"azureMonitorLookbackDays": 7 + }, + "connectors": [ + { + "name": "dynatrace", + "properties": { + "dataConnectorType": "Mcp", + "dataSource": "placeholder", + "extendedProperties": { + "type": "http", + "endpoint": "https://dhu66396.apps.dynatrace.com/platform-reserved/mcp-gateway/v0.1/servers/dynatrace-mcp/mcp", + "authType": "BearerToken", + "bearerToken": "${DYNATRACE_BEARER_TOKEN}", + "partnerType": "DynatraceMcp" + }, + "identity": "system" + } + } + ] +} diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/data/knowledge/.gitkeep b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/data/knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/data/synthesized-knowledge.json b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/data/synthesized-knowledge.json new file mode 100644 index 000000000..2f3a724c8 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/data/synthesized-knowledge.json @@ -0,0 +1,7 @@ +[ + { + "path": "synthesizedKnowledge/.gitkeep", + "size": 0, + "lastModified": "2026-05-27T02:13:10+00:00" + } +] diff --git a/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/data/synthesized-knowledge/.gitkeep b/sreagent-templates/dg-azd-bash-clone-export.HnS0Ao/dg-azd-bash/data/synthesized-knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/.gitignore b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/.gitignore new file mode 100644 index 000000000..f903c363c --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/.gitignore @@ -0,0 +1,5 @@ +# Secrets — never commit +connectors.secrets.env +*.secrets.env +# Generated verification spec +expected-config.json diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/agent.json 
b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/agent.json new file mode 100644 index 000000000..a3bb26027 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/agent.json @@ -0,0 +1,26 @@ +{ + "_description": "SRE Agent configuration — edit these values to clone to a new environment.", + "_exported_at": "2026-05-27T02:03:09Z", + "identity": { + "agentName": "dg-azd-bash", + "resourceGroup": "rg-dg-azd-bash", + "subscription": "cbf44432-7f45-4906-a85d-d2b14a1e8328", + "location": "swedencentral", + "targetResourceGroups": [ + "rg-contoso-prod", + "rg-contoso-staging" + ] + }, + "access": { + "accessLevel": "High", + "actionMode": "Review" + }, + "upgradeChannel": "Preview", + "defaultModelProvider": "Anthropic", + "monthlyAgentUnitLimit": 10000, + "tags": {}, + "toggles": { + "enableWebhookBridge": true, + "webhookBridgeTriggerUrl": "" + } +} diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/automations/http-triggers/pr-deployment-guard.yaml b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/automations/http-triggers/pr-deployment-guard.yaml new file mode 100644 index 000000000..3ed61699f --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/automations/http-triggers/pr-deployment-guard.yaml @@ -0,0 +1,7 @@ +name: pr-deployment-guard +spec: + description: Receives PR webhooks from GitHub and triggers the deployment guard + to analyze changes for production safety. 
+ prompt: '' + handlingAgent: '' + agentMode: autonomous diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/common-prompts/investigation-guidelines.md b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/common-prompts/investigation-guidelines.md new file mode 100644 index 000000000..439ccc47e --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/common-prompts/investigation-guidelines.md @@ -0,0 +1,7 @@ +## Investigation guidelines + +- Always check the last 3 deployments for correlation +- Include timestamp, affected resource, and severity in all summaries +- Never take destructive actions without explicit approval +- Prefer read-only investigation before recommending changes +- Always provide an impact assessment (users affected, blast radius) \ No newline at end of file diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/common-prompts/investigation-guidelines.yaml b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/common-prompts/investigation-guidelines.yaml new file mode 100644 index 000000000..d7c1b4b8d --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/common-prompts/investigation-guidelines.yaml @@ -0,0 +1,15 @@ +metadata: + name: investigation-guidelines +spec: + prompt: '## Investigation guidelines + + + - Always check the last 3 deployments for correlation + + - Include timestamp, affected resource, and severity in all summaries + + - Never take destructive actions without explicit approval + + - Prefer read-only investigation before recommending changes + + - Always provide an impact assessment (users affected, blast radius)' diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/common-prompts/safety-rules.md b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/common-prompts/safety-rules.md new file mode 100644 index 
000000000..efdb1dec9 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/common-prompts/safety-rules.md @@ -0,0 +1,7 @@ +## Safety rules + +- Never delete resources in production without explicit approval +- Always prefer read-only investigation before taking action +- Escalate to human if confidence is below 80% +- Do not modify network security groups or firewall rules +- Do not access or display secrets, keys, or connection strings \ No newline at end of file diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/common-prompts/safety-rules.yaml b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/common-prompts/safety-rules.yaml new file mode 100644 index 000000000..efa6dd631 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/common-prompts/safety-rules.yaml @@ -0,0 +1,15 @@ +metadata: + name: safety-rules +spec: + prompt: '## Safety rules + + + - Never delete resources in production without explicit approval + + - Always prefer read-only investigation before taking action + + - Escalate to human if confidence is below 80% + + - Do not modify network security groups or firewall rules + + - Do not access or display secrets, keys, or connection strings' diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/hooks/deny-prod-deletes.yaml b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/hooks/deny-prod-deletes.yaml new file mode 100644 index 000000000..54c43c90a --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/hooks/deny-prod-deletes.yaml @@ -0,0 +1,12 @@ +name: deny-prod-deletes +type: GlobalHook +tags: [] +properties: + eventType: PreToolUse + activationMode: always + hook: + type: prompt + prompt: If the tool targets a production resource (name contains 'prod' or 'prd'), + deny the action. Otherwise allow. 
+ matcher: ^(delete_|remove_).* + timeout: 30 diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/hooks/require-approval-for-restarts.yaml b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/hooks/require-approval-for-restarts.yaml new file mode 100644 index 000000000..3e18543da --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/hooks/require-approval-for-restarts.yaml @@ -0,0 +1,12 @@ +name: require-approval-for-restarts +type: GlobalHook +tags: [] +properties: + eventType: PreToolUse + activationMode: always + hook: + type: prompt + prompt: If this action will restart or scale a resource, require human approval + before proceeding. + matcher: ^(restart_|scale_).* + timeout: 30 diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/skills/deployment-guard-analysis.md b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/skills/deployment-guard-analysis.md new file mode 100644 index 000000000..a0fc912c4 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/skills/deployment-guard-analysis.md @@ -0,0 +1,23 @@ +You are a deployment guard. When triggered by a PR webhook, you assess if the change is safe for production. + +Step 1: Read the PR diff from the connected GitHub repo. Identify what changed — app code, IaC, config, DB schema, dependencies. + +Step 2: Static analysis — check for breaking changes: API contract changes, removed endpoints, changed DB schemas, renamed env vars, missing error handling. + +Step 3: Capture production baseline. Use Dynatrace DQL to query current error rates, latency p50/p95/p99, throughput. Use az CLI to check ContainerAppConsoleLogs_CL in LAW. Also capture baseline API responses by sending test requests to production endpoints and recording the response structure, status codes, and key data fields. 
+ +Step 4: Deploy the PR changes to the STAGING environment using az containerapp update. This is a separate environment from production — deploy the new image there. + +Step 5: Send synthetic test traffic to the staging services to exercise the code paths affected by the PR. Use ExecutePythonCode to send HTTP requests to the staging endpoints (e.g. GET /orders, POST /orders, GET /health) for 2-3 minutes. This is canary testing — you need real traffic to surface regressions like timeouts, 500s, or latency spikes. + +Step 6: Validate response correctness — compare staging API responses against the production baseline captured in Step 3. Look for any differences in response bodies, status codes, data fields, or behavior. The app may return 200 OK but serve degraded or incorrect data. + +Step 7: Monitor staging health for 5 minutes. Query Dynatrace and LAW for the staging services. Compare all metrics and response patterns against the production baseline. Use PlotAreaChartWithCorrelation to visualize. + +Step 8: Risk assessment — LOW (no functional or performance changes), MEDIUM (minor changes), HIGH (behavioral or performance regression detected), CRITICAL (staging failing or data integrity compromised). + +Step 9: Post a structured PR comment with: risk level, changes analyzed, static analysis findings, canary test results, any behavioral regressions found, health comparison table (prod baseline vs staging), and recommendation. + +Tools to use: RunAzCliReadCommands, RunAzCliWriteCommands, ExecutePythonCode, PlotAreaChartWithCorrelation, PlotBarChart, CreateGithubIssue, FindConnectedGitHubRepo, QueryLogAnalyticsByWorkspaceId, and all Dynatrace MCP tools. 
+ +# Updated by e2e test \ No newline at end of file diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/skills/deployment-guard-analysis.yaml b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/skills/deployment-guard-analysis.yaml new file mode 100644 index 000000000..a4a551772 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/skills/deployment-guard-analysis.yaml @@ -0,0 +1,17 @@ +metadata: + name: deployment-guard-analysis + description: Deployment guard that assesses PR safety for production by analyzing + diffs, capturing baselines, deploying to staging, running canary tests, validating + response correctness, and comparing health metrics. + spec: + tools: + - RunAzCliReadCommands + - RunAzCliWriteCommands + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - CreateGithubIssue + - FindConnectedGitHubRepo + - QueryLogAnalyticsByWorkspaceId +skillContent: skills/deployment-guard-analysis.md +additionalFiles: [] diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/skills/investigate-app-errors.md b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/skills/investigate-app-errors.md new file mode 100644 index 000000000..6d2b7b6db --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/skills/investigate-app-errors.md @@ -0,0 +1,20 @@ +You are an application error investigator. When errors are reported, follow this workflow: + +1. **Identify the error**: Get the error details — HTTP status codes, exception types, affected endpoints, timestamps. + +2. **Check recent deployments**: Use az CLI to list recent Container App revisions or deployments. Correlate error start time with deployment timestamps. + +3. **Query Dynatrace**: Use DQL to query error rates, response times, and throughput for the affected services. Look for anomalies that started around the same time. + +4. 
**Query Log Analytics**: Check ContainerAppConsoleLogs_CL and ContainerAppSystemLogs_CL for exceptions, crash loops, or OOM kills. + +5. **Check dependencies**: Query Dynatrace for dependency health — databases, external APIs, message queues. An upstream failure may be the root cause. + +6. **Correlate findings**: Build a timeline of events — deployment, config change, traffic spike, dependency failure — and identify the most likely root cause. + +7. **Recommend fix**: Provide actionable recommendations — rollback, config change, scaling, or code fix with the specific file/line if the GitHub repo is connected. + +Always include: +- Impact assessment (users affected, error rate, duration) +- Root cause confidence level +- Recommended action with rollback option \ No newline at end of file diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/skills/investigate-app-errors.yaml b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/skills/investigate-app-errors.yaml new file mode 100644 index 000000000..669dddcb8 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/skills/investigate-app-errors.yaml @@ -0,0 +1,16 @@ +metadata: + name: investigate-app-errors + description: Investigate application errors using Dynatrace DQL and Log Analytics + to correlate errors with deployments, infrastructure changes, and dependencies. 
+ spec: + tools: + - RunAzCliReadCommands + - QueryLogAnalyticsByWorkspaceId + - dynatrace_execute-dql + - dynatrace_create-dql + - dynatrace_query-problems + - dynatrace_get-entity-id + - ExecutePythonCode + - PlotAreaChartWithCorrelation +skillContent: skills/investigate-app-errors.md +additionalFiles: [] diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/subagents/deployment-guard.instructions.md b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/subagents/deployment-guard.instructions.md new file mode 100644 index 000000000..6906f6466 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/subagents/deployment-guard.instructions.md @@ -0,0 +1 @@ +You are an expert engineer who guards production deployments, and you operate in autonomous mode. Use the deployment-guard-analysis skill to assess PRs for production safety. Follow the full 9-step workflow: analyze the PR diff, perform static analysis, capture production baselines from Dynatrace and LAW, deploy to staging, send synthetic canary traffic, validate response correctness, monitor staging health, assess risk, and post a structured PR comment with your findings. 
\ No newline at end of file diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/subagents/deployment-guard.yaml b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/subagents/deployment-guard.yaml new file mode 100644 index 000000000..99ead646b --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/subagents/deployment-guard.yaml @@ -0,0 +1,31 @@ +metadata: + name: deployment-guard +spec: + instructions: subagents/deployment-guard.instructions.md + handoffDescription: Analyzes PRs by deploying to staging, comparing health against + production via Dynatrace + LAW, and posting risk assessment as a PR comment + handoffs: [] + tools: + - RunAzCliReadCommands + - RunAzCliWriteCommands + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - CreateGithubIssue + - FindConnectedGitHubRepo + - QueryLogAnalyticsByWorkspaceId + - dynatrace_adaptive-anomaly-detector + - dynatrace_create-dql + - dynatrace_execute-dql + - dynatrace_explain-dql + - dynatrace_get-entity-id + - dynatrace_get-entity-name + - dynatrace_query-problems + - dynatrace_seasonal-baseline-anomaly-detector + - dynatrace_static-threshold-analyzer + - dynatrace_timeseries-forecast + - dynatrace_timeseries-novelty-detection + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/subagents/error-investigator.instructions.md b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/subagents/error-investigator.instructions.md new file mode 100644 index 000000000..cb2eeed54 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/subagents/error-investigator.instructions.md @@ -0,0 +1 @@ +You are an application error investigator. When errors are reported, use the investigate-app-errors skill to systematically diagnose the issue. 
Correlate Dynatrace metrics with Log Analytics data and deployment history. Always provide impact assessment and actionable recommendations with rollback options. \ No newline at end of file diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/subagents/error-investigator.yaml b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/subagents/error-investigator.yaml new file mode 100644 index 000000000..61fb2f3a4 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/config/subagents/error-investigator.yaml @@ -0,0 +1,23 @@ +metadata: + name: error-investigator +spec: + instructions: subagents/error-investigator.instructions.md + handoffDescription: Investigates application errors by correlating Dynatrace metrics, + LAW logs, and deployment history to identify root cause + handoffs: [] + tools: + - RunAzCliReadCommands + - QueryLogAnalyticsByWorkspaceId + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - FindConnectedGitHubRepo + - dynatrace_execute-dql + - dynatrace_create-dql + - dynatrace_query-problems + - dynatrace_get-entity-id + - dynatrace_get-entity-name + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/connectors.json b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/connectors.json new file mode 100644 index 000000000..93f104448 --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/connectors.json @@ -0,0 +1,28 @@ +{ + "toggles": { + "enableAppInsightsConnector": false, + "appInsightsResourceId": "", + "appInsightsAppId": "", + "enableLogAnalyticsConnector": true, + "lawResourceId": "/subscriptions/cbf44432-7f45-4906-a85d-d2b14a1e8328/resourceGroups/rg-contoso-swe/providers/Microsoft.OperationalInsights/workspaces/law-7defkiyvn3r44", + "enableAzureMonitorConnector": false, + 
"azureMonitorLookbackDays": 7 + }, + "connectors": [ + { + "name": "dynatrace", + "properties": { + "dataConnectorType": "Mcp", + "dataSource": "placeholder", + "extendedProperties": { + "type": "http", + "endpoint": "https://dhu66396.apps.dynatrace.com/platform-reserved/mcp-gateway/v0.1/servers/dynatrace-mcp/mcp", + "authType": "BearerToken", + "bearerToken": "${DYNATRACE_BEARER_TOKEN}", + "partnerType": "DynatraceMcp" + }, + "identity": "system" + } + } + ] +} diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/data/knowledge/.gitkeep b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/data/knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/data/synthesized-knowledge.json b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/data/synthesized-knowledge.json new file mode 100644 index 000000000..968b1b41d --- /dev/null +++ b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/data/synthesized-knowledge.json @@ -0,0 +1,7 @@ +[ + { + "path": "synthesizedKnowledge/.gitkeep", + "size": 0, + "lastModified": "2026-05-27T01:59:16+00:00" + } +] diff --git a/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/data/synthesized-knowledge/.gitkeep b/sreagent-templates/dg-azd-bash-clone-export.nmvpCa/dg-azd-bash/data/synthesized-knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/.gitignore b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/.gitignore new file mode 100644 index 000000000..f903c363c --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/.gitignore @@ -0,0 +1,5 @@ +# Secrets — never commit +connectors.secrets.env +*.secrets.env +# Generated verification spec +expected-config.json diff --git 
a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/agent.json b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/agent.json new file mode 100644 index 000000000..cd8e58769 --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/agent.json @@ -0,0 +1,26 @@ +{ + "_description": "SRE Agent configuration — edit these values to clone to a new environment.", + "_exported_at": "2026-05-27T01:22:37Z", + "identity": { + "agentName": "dg-bicep-bash", + "resourceGroup": "rg-dg-bicep-bash", + "subscription": "cbf44432-7f45-4906-a85d-d2b14a1e8328", + "location": "swedencentral", + "targetResourceGroups": [ + "rg-contoso-prod", + "rg-contoso-staging" + ] + }, + "access": { + "accessLevel": "High", + "actionMode": "Review" + }, + "upgradeChannel": "Preview", + "defaultModelProvider": "Anthropic", + "monthlyAgentUnitLimit": 10000, + "tags": {}, + "toggles": { + "enableWebhookBridge": true, + "webhookBridgeTriggerUrl": "" + } +} diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/automations/http-triggers/pr-deployment-guard.yaml b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/automations/http-triggers/pr-deployment-guard.yaml new file mode 100644 index 000000000..3ed61699f --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/automations/http-triggers/pr-deployment-guard.yaml @@ -0,0 +1,7 @@ +name: pr-deployment-guard +spec: + description: Receives PR webhooks from GitHub and triggers the deployment guard + to analyze changes for production safety. 
+ prompt: '' + handlingAgent: '' + agentMode: autonomous diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/common-prompts/investigation-guidelines.md b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/common-prompts/investigation-guidelines.md new file mode 100644 index 000000000..439ccc47e --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/common-prompts/investigation-guidelines.md @@ -0,0 +1,7 @@ +## Investigation guidelines + +- Always check the last 3 deployments for correlation +- Include timestamp, affected resource, and severity in all summaries +- Never take destructive actions without explicit approval +- Prefer read-only investigation before recommending changes +- Always provide an impact assessment (users affected, blast radius) \ No newline at end of file diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/common-prompts/investigation-guidelines.yaml b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/common-prompts/investigation-guidelines.yaml new file mode 100644 index 000000000..d7c1b4b8d --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/common-prompts/investigation-guidelines.yaml @@ -0,0 +1,15 @@ +metadata: + name: investigation-guidelines +spec: + prompt: '## Investigation guidelines + + + - Always check the last 3 deployments for correlation + + - Include timestamp, affected resource, and severity in all summaries + + - Never take destructive actions without explicit approval + + - Prefer read-only investigation before recommending changes + + - Always provide an impact assessment (users affected, blast radius)' diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/common-prompts/safety-rules.md b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/common-prompts/safety-rules.md new 
file mode 100644 index 000000000..efdb1dec9 --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/common-prompts/safety-rules.md @@ -0,0 +1,7 @@ +## Safety rules + +- Never delete resources in production without explicit approval +- Always prefer read-only investigation before taking action +- Escalate to human if confidence is below 80% +- Do not modify network security groups or firewall rules +- Do not access or display secrets, keys, or connection strings \ No newline at end of file diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/common-prompts/safety-rules.yaml b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/common-prompts/safety-rules.yaml new file mode 100644 index 000000000..efa6dd631 --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/common-prompts/safety-rules.yaml @@ -0,0 +1,15 @@ +metadata: + name: safety-rules +spec: + prompt: '## Safety rules + + + - Never delete resources in production without explicit approval + + - Always prefer read-only investigation before taking action + + - Escalate to human if confidence is below 80% + + - Do not modify network security groups or firewall rules + + - Do not access or display secrets, keys, or connection strings' diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/hooks/deny-prod-deletes.yaml b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/hooks/deny-prod-deletes.yaml new file mode 100644 index 000000000..54c43c90a --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/hooks/deny-prod-deletes.yaml @@ -0,0 +1,12 @@ +name: deny-prod-deletes +type: GlobalHook +tags: [] +properties: + eventType: PreToolUse + activationMode: always + hook: + type: prompt + prompt: If the tool targets a production resource (name contains 'prod' or 'prd'), + deny the action. 
Otherwise allow. + matcher: ^(delete_|remove_).* + timeout: 30 diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/hooks/require-approval-for-restarts.yaml b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/hooks/require-approval-for-restarts.yaml new file mode 100644 index 000000000..3e18543da --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/hooks/require-approval-for-restarts.yaml @@ -0,0 +1,12 @@ +name: require-approval-for-restarts +type: GlobalHook +tags: [] +properties: + eventType: PreToolUse + activationMode: always + hook: + type: prompt + prompt: If this action will restart or scale a resource, require human approval + before proceeding. + matcher: ^(restart_|scale_).* + timeout: 30 diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/skills/deployment-guard-analysis.md b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/skills/deployment-guard-analysis.md new file mode 100644 index 000000000..a0fc912c4 --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/skills/deployment-guard-analysis.md @@ -0,0 +1,23 @@ +You are a deployment guard. When triggered by a PR webhook, you assess if the change is safe for production. + +Step 1: Read the PR diff from the connected GitHub repo. Identify what changed — app code, IaC, config, DB schema, dependencies. + +Step 2: Static analysis — check for breaking changes: API contract changes, removed endpoints, changed DB schemas, renamed env vars, missing error handling. + +Step 3: Capture production baseline. Use Dynatrace DQL to query current error rates, latency p50/p95/p99, throughput. Use az CLI to check ContainerAppConsoleLogs_CL in LAW. Also capture baseline API responses by sending test requests to production endpoints and recording the response structure, status codes, and key data fields. 
+ +Step 4: Deploy the PR changes to the STAGING environment using az containerapp update. This is a separate environment from production — deploy the new image there. + +Step 5: Send synthetic test traffic to the staging services to exercise the code paths affected by the PR. Use ExecutePythonCode to send HTTP requests to the staging endpoints (e.g. GET /orders, POST /orders, GET /health) for 2-3 minutes. This is canary testing — you need real traffic to surface regressions like timeouts, 500s, or latency spikes. + +Step 6: Validate response correctness — compare staging API responses against the production baseline captured in Step 3. Look for any differences in response bodies, status codes, data fields, or behavior. The app may return 200 OK but serve degraded or incorrect data. + +Step 7: Monitor staging health for 5 minutes. Query Dynatrace and LAW for the staging services. Compare all metrics and response patterns against the production baseline. Use PlotAreaChartWithCorrelation to visualize. + +Step 8: Risk assessment — LOW (no functional or performance changes), MEDIUM (minor changes), HIGH (behavioral or performance regression detected), CRITICAL (staging failing or data integrity compromised). + +Step 9: Post a structured PR comment with: risk level, changes analyzed, static analysis findings, canary test results, any behavioral regressions found, health comparison table (prod baseline vs staging), and recommendation. + +Tools to use: RunAzCliReadCommands, RunAzCliWriteCommands, ExecutePythonCode, PlotAreaChartWithCorrelation, PlotBarChart, CreateGithubIssue, FindConnectedGitHubRepo, and all dynatrace MCP tools. 
+ +# Updated by e2e test \ No newline at end of file diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/skills/deployment-guard-analysis.yaml b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/skills/deployment-guard-analysis.yaml new file mode 100644 index 000000000..a4a551772 --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/skills/deployment-guard-analysis.yaml @@ -0,0 +1,17 @@ +metadata: + name: deployment-guard-analysis + description: Deployment guard that assesses PR safety for production by analyzing + diffs, capturing baselines, deploying to staging, running canary tests, validating + response correctness, and comparing health metrics. + spec: + tools: + - RunAzCliReadCommands + - RunAzCliWriteCommands + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - CreateGithubIssue + - FindConnectedGitHubRepo + - QueryLogAnalyticsByWorkspaceId +skillContent: skills/deployment-guard-analysis.md +additionalFiles: [] diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/skills/investigate-app-errors.md b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/skills/investigate-app-errors.md new file mode 100644 index 000000000..6d2b7b6db --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/skills/investigate-app-errors.md @@ -0,0 +1,20 @@ +You are an application error investigator. When errors are reported, follow this workflow: + +1. **Identify the error**: Get the error details — HTTP status codes, exception types, affected endpoints, timestamps. + +2. **Check recent deployments**: Use az CLI to list recent Container App revisions or deployments. Correlate error start time with deployment timestamps. + +3. **Query Dynatrace**: Use DQL to query error rates, response times, and throughput for the affected services. 
Look for anomalies that started around the same time. + +4. **Query Log Analytics**: Check ContainerAppConsoleLogs_CL and ContainerAppSystemLogs_CL for exceptions, crash loops, or OOM kills. + +5. **Check dependencies**: Query Dynatrace for dependency health — databases, external APIs, message queues. An upstream failure may be the root cause. + +6. **Correlate findings**: Build a timeline of events — deployment, config change, traffic spike, dependency failure — and identify the most likely root cause. + +7. **Recommend fix**: Provide actionable recommendations — rollback, config change, scaling, or code fix with the specific file/line if the GitHub repo is connected. + +Always include: +- Impact assessment (users affected, error rate, duration) +- Root cause confidence level +- Recommended action with rollback option \ No newline at end of file diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/skills/investigate-app-errors.yaml b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/skills/investigate-app-errors.yaml new file mode 100644 index 000000000..669dddcb8 --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/skills/investigate-app-errors.yaml @@ -0,0 +1,16 @@ +metadata: + name: investigate-app-errors + description: Investigate application errors using Dynatrace DQL and Log Analytics + to correlate errors with deployments, infrastructure changes, and dependencies. 
+ spec: + tools: + - RunAzCliReadCommands + - QueryLogAnalyticsByWorkspaceId + - dynatrace_execute-dql + - dynatrace_create-dql + - dynatrace_query-problems + - dynatrace_get-entity-id + - ExecutePythonCode + - PlotAreaChartWithCorrelation +skillContent: skills/investigate-app-errors.md +additionalFiles: [] diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/subagents/deployment-guard.instructions.md b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/subagents/deployment-guard.instructions.md new file mode 100644 index 000000000..6906f6466 --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/subagents/deployment-guard.instructions.md @@ -0,0 +1 @@ +You are an expert engineer guarding production deployments, and you operate in autonomous mode. Use the deployment-guard-analysis skill to assess PRs for production safety. Follow the full 9-step workflow: analyze the PR diff, perform static analysis, capture production baselines from Dynatrace and LAW, deploy to staging, send synthetic canary traffic, validate response correctness, monitor staging health, assess risk, and post a structured PR comment with your findings. 
\ No newline at end of file diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/subagents/deployment-guard.yaml b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/subagents/deployment-guard.yaml new file mode 100644 index 000000000..99ead646b --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/subagents/deployment-guard.yaml @@ -0,0 +1,31 @@ +metadata: + name: deployment-guard +spec: + instructions: subagents/deployment-guard.instructions.md + handoffDescription: Analyzes PRs by deploying to staging, comparing health against + production via Dynatrace + LAW, and posting risk assessment as a PR comment + handoffs: [] + tools: + - RunAzCliReadCommands + - RunAzCliWriteCommands + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - CreateGithubIssue + - FindConnectedGitHubRepo + - QueryLogAnalyticsByWorkspaceId + - dynatrace_adaptive-anomaly-detector + - dynatrace_create-dql + - dynatrace_execute-dql + - dynatrace_explain-dql + - dynatrace_get-entity-id + - dynatrace_get-entity-name + - dynatrace_query-problems + - dynatrace_seasonal-baseline-anomaly-detector + - dynatrace_static-threshold-analyzer + - dynatrace_timeseries-forecast + - dynatrace_timeseries-novelty-detection + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/subagents/error-investigator.instructions.md b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/subagents/error-investigator.instructions.md new file mode 100644 index 000000000..cb2eeed54 --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/subagents/error-investigator.instructions.md @@ -0,0 +1 @@ +You are an application error investigator. 
When errors are reported, use the investigate-app-errors skill to systematically diagnose the issue. Correlate Dynatrace metrics with Log Analytics data and deployment history. Always provide impact assessment and actionable recommendations with rollback options. \ No newline at end of file diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/subagents/error-investigator.yaml b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/subagents/error-investigator.yaml new file mode 100644 index 000000000..61fb2f3a4 --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/config/subagents/error-investigator.yaml @@ -0,0 +1,23 @@ +metadata: + name: error-investigator +spec: + instructions: subagents/error-investigator.instructions.md + handoffDescription: Investigates application errors by correlating Dynatrace metrics, + LAW logs, and deployment history to identify root cause + handoffs: [] + tools: + - RunAzCliReadCommands + - QueryLogAnalyticsByWorkspaceId + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - FindConnectedGitHubRepo + - dynatrace_execute-dql + - dynatrace_create-dql + - dynatrace_query-problems + - dynatrace_get-entity-id + - dynatrace_get-entity-name + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/connectors.json b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/connectors.json new file mode 100644 index 000000000..93f104448 --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/connectors.json @@ -0,0 +1,28 @@ +{ + "toggles": { + "enableAppInsightsConnector": false, + "appInsightsResourceId": "", + "appInsightsAppId": "", + "enableLogAnalyticsConnector": true, + "lawResourceId": 
"/subscriptions/cbf44432-7f45-4906-a85d-d2b14a1e8328/resourceGroups/rg-contoso-swe/providers/Microsoft.OperationalInsights/workspaces/law-7defkiyvn3r44", + "enableAzureMonitorConnector": false, + "azureMonitorLookbackDays": 7 + }, + "connectors": [ + { + "name": "dynatrace", + "properties": { + "dataConnectorType": "Mcp", + "dataSource": "placeholder", + "extendedProperties": { + "type": "http", + "endpoint": "https://dhu66396.apps.dynatrace.com/platform-reserved/mcp-gateway/v0.1/servers/dynatrace-mcp/mcp", + "authType": "BearerToken", + "bearerToken": "${DYNATRACE_BEARER_TOKEN}", + "partnerType": "DynatraceMcp" + }, + "identity": "system" + } + } + ] +} diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/data/knowledge/.gitkeep b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/data/knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/data/synthesized-knowledge.json b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/data/synthesized-knowledge.json new file mode 100644 index 000000000..9b0ec4d42 --- /dev/null +++ b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/data/synthesized-knowledge.json @@ -0,0 +1,7 @@ +[ + { + "path": "synthesizedKnowledge/.gitkeep", + "size": 0, + "lastModified": "2026-05-27T01:13:34+00:00" + } +] diff --git a/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/data/synthesized-knowledge/.gitkeep b/sreagent-templates/dg-bicep-bash-clone-export.xzNh2V/dg-bicep-bash/data/synthesized-knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/.gitignore b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/.gitignore new file mode 100644 index 000000000..f903c363c --- /dev/null +++ b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/.gitignore @@ 
-0,0 +1,5 @@ +# Secrets — never commit +connectors.secrets.env +*.secrets.env +# Generated verification spec +expected-config.json diff --git a/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/agent.json b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/agent.json new file mode 100644 index 000000000..9a977a973 --- /dev/null +++ b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/agent.json @@ -0,0 +1,26 @@ +{ + "_description": "SRE Agent configuration — edit these values to clone to a new environment.", + "_exported_at": "2026-05-27T01:39:06Z", + "identity": { + "agentName": "dg-bicep-ps", + "resourceGroup": "rg-dg-bicep-ps", + "subscription": "cbf44432-7f45-4906-a85d-d2b14a1e8328", + "location": "swedencentral", + "targetResourceGroups": [ + "rg-contoso-prod", + "rg-contoso-staging" + ] + }, + "access": { + "accessLevel": "High", + "actionMode": "Review" + }, + "upgradeChannel": "Preview", + "defaultModelProvider": "Anthropic", + "monthlyAgentUnitLimit": 10000, + "tags": {}, + "toggles": { + "enableWebhookBridge": false, + "webhookBridgeTriggerUrl": "" + } +} diff --git a/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/automations/http-triggers/pr-deployment-guard.yaml b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/automations/http-triggers/pr-deployment-guard.yaml new file mode 100644 index 000000000..3ed61699f --- /dev/null +++ b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/automations/http-triggers/pr-deployment-guard.yaml @@ -0,0 +1,7 @@ +name: pr-deployment-guard +spec: + description: Receives PR webhooks from GitHub and triggers the deployment guard + to analyze changes for production safety. 
+ prompt: '' + handlingAgent: '' + agentMode: autonomous diff --git a/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/common-prompts/investigation-guidelines.md b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/common-prompts/investigation-guidelines.md new file mode 100644 index 000000000..439ccc47e --- /dev/null +++ b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/common-prompts/investigation-guidelines.md @@ -0,0 +1,7 @@ +## Investigation guidelines + +- Always check the last 3 deployments for correlation +- Include timestamp, affected resource, and severity in all summaries +- Never take destructive actions without explicit approval +- Prefer read-only investigation before recommending changes +- Always provide an impact assessment (users affected, blast radius) \ No newline at end of file diff --git a/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/common-prompts/investigation-guidelines.yaml b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/common-prompts/investigation-guidelines.yaml new file mode 100644 index 000000000..d7c1b4b8d --- /dev/null +++ b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/common-prompts/investigation-guidelines.yaml @@ -0,0 +1,15 @@ +metadata: + name: investigation-guidelines +spec: + prompt: '## Investigation guidelines + + + - Always check the last 3 deployments for correlation + + - Include timestamp, affected resource, and severity in all summaries + + - Never take destructive actions without explicit approval + + - Prefer read-only investigation before recommending changes + + - Always provide an impact assessment (users affected, blast radius)' diff --git a/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/common-prompts/safety-rules.md b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/common-prompts/safety-rules.md new file mode 100644 index 
000000000..efdb1dec9 --- /dev/null +++ b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/common-prompts/safety-rules.md @@ -0,0 +1,7 @@ +## Safety rules + +- Never delete resources in production without explicit approval +- Always prefer read-only investigation before taking action +- Escalate to human if confidence is below 80% +- Do not modify network security groups or firewall rules +- Do not access or display secrets, keys, or connection strings \ No newline at end of file diff --git a/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/common-prompts/safety-rules.yaml b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/common-prompts/safety-rules.yaml new file mode 100644 index 000000000..efa6dd631 --- /dev/null +++ b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/common-prompts/safety-rules.yaml @@ -0,0 +1,15 @@ +metadata: + name: safety-rules +spec: + prompt: '## Safety rules + + + - Never delete resources in production without explicit approval + + - Always prefer read-only investigation before taking action + + - Escalate to human if confidence is below 80% + + - Do not modify network security groups or firewall rules + + - Do not access or display secrets, keys, or connection strings' diff --git a/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/hooks/deny-prod-deletes.yaml b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/hooks/deny-prod-deletes.yaml new file mode 100644 index 000000000..54c43c90a --- /dev/null +++ b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/hooks/deny-prod-deletes.yaml @@ -0,0 +1,12 @@ +name: deny-prod-deletes +type: GlobalHook +tags: [] +properties: + eventType: PreToolUse + activationMode: always + hook: + type: prompt + prompt: If the tool targets a production resource (name contains 'prod' or 'prd'), + deny the action. Otherwise allow. 
+ matcher: ^(delete_|remove_).* + timeout: 30 diff --git a/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/hooks/require-approval-for-restarts.yaml b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/hooks/require-approval-for-restarts.yaml new file mode 100644 index 000000000..3e18543da --- /dev/null +++ b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/hooks/require-approval-for-restarts.yaml @@ -0,0 +1,12 @@ +name: require-approval-for-restarts +type: GlobalHook +tags: [] +properties: + eventType: PreToolUse + activationMode: always + hook: + type: prompt + prompt: If this action will restart or scale a resource, require human approval + before proceeding. + matcher: ^(restart_|scale_).* + timeout: 30 diff --git a/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/skills/deployment-guard-analysis.yaml b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/skills/deployment-guard-analysis.yaml new file mode 100644 index 000000000..8f2d3a5db --- /dev/null +++ b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/skills/deployment-guard-analysis.yaml @@ -0,0 +1,7 @@ +metadata: + name: deployment-guard-analysis + description: '' + spec: + tools: [] +skillContent: skills/deployment-guard-analysis.md +additionalFiles: [] diff --git a/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/skills/investigate-app-errors.yaml b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/skills/investigate-app-errors.yaml new file mode 100644 index 000000000..d793ddd54 --- /dev/null +++ b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/skills/investigate-app-errors.yaml @@ -0,0 +1,7 @@ +metadata: + name: investigate-app-errors + description: '' + spec: + tools: [] +skillContent: skills/investigate-app-errors.md +additionalFiles: [] diff --git 
a/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/subagents/deployment-guard.instructions.md b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/subagents/deployment-guard.instructions.md new file mode 100644 index 000000000..6906f6466 --- /dev/null +++ b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/subagents/deployment-guard.instructions.md @@ -0,0 +1 @@ +You are an expert engineer guarding production deployments, and you operate in autonomous mode. Use the deployment-guard-analysis skill to assess PRs for production safety. Follow the full 9-step workflow: analyze the PR diff, perform static analysis, capture production baselines from Dynatrace and LAW, deploy to staging, send synthetic canary traffic, validate response correctness, monitor staging health, assess risk, and post a structured PR comment with your findings. \ No newline at end of file diff --git a/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/subagents/deployment-guard.yaml b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/subagents/deployment-guard.yaml new file mode 100644 index 000000000..99ead646b --- /dev/null +++ b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/subagents/deployment-guard.yaml @@ -0,0 +1,31 @@ +metadata: + name: deployment-guard +spec: + instructions: subagents/deployment-guard.instructions.md + handoffDescription: Analyzes PRs by deploying to staging, comparing health against + production via Dynatrace + LAW, and posting risk assessment as a PR comment + handoffs: [] + tools: + - RunAzCliReadCommands + - RunAzCliWriteCommands + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - CreateGithubIssue + - FindConnectedGitHubRepo + - QueryLogAnalyticsByWorkspaceId + - dynatrace_adaptive-anomaly-detector + - dynatrace_create-dql + - dynatrace_execute-dql + - dynatrace_explain-dql + - dynatrace_get-entity-id + - 
dynatrace_get-entity-name + - dynatrace_query-problems + - dynatrace_seasonal-baseline-anomaly-detector + - dynatrace_static-threshold-analyzer + - dynatrace_timeseries-forecast + - dynatrace_timeseries-novelty-detection + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/subagents/error-investigator.instructions.md b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/subagents/error-investigator.instructions.md new file mode 100644 index 000000000..cb2eeed54 --- /dev/null +++ b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/subagents/error-investigator.instructions.md @@ -0,0 +1 @@ +You are an application error investigator. When errors are reported, use the investigate-app-errors skill to systematically diagnose the issue. Correlate Dynatrace metrics with Log Analytics data and deployment history. Always provide impact assessment and actionable recommendations with rollback options. 
\ No newline at end of file diff --git a/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/subagents/error-investigator.yaml b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/subagents/error-investigator.yaml new file mode 100644 index 000000000..61fb2f3a4 --- /dev/null +++ b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/config/subagents/error-investigator.yaml @@ -0,0 +1,23 @@ +metadata: + name: error-investigator +spec: + instructions: subagents/error-investigator.instructions.md + handoffDescription: Investigates application errors by correlating Dynatrace metrics, + LAW logs, and deployment history to identify root cause + handoffs: [] + tools: + - RunAzCliReadCommands + - QueryLogAnalyticsByWorkspaceId + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - FindConnectedGitHubRepo + - dynatrace_execute-dql + - dynatrace_create-dql + - dynatrace_query-problems + - dynatrace_get-entity-id + - dynatrace_get-entity-name + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/connectors.json b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/connectors.json new file mode 100644 index 000000000..93f104448 --- /dev/null +++ b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/connectors.json @@ -0,0 +1,28 @@ +{ + "toggles": { + "enableAppInsightsConnector": false, + "appInsightsResourceId": "", + "appInsightsAppId": "", + "enableLogAnalyticsConnector": true, + "lawResourceId": "/subscriptions/cbf44432-7f45-4906-a85d-d2b14a1e8328/resourceGroups/rg-contoso-swe/providers/Microsoft.OperationalInsights/workspaces/law-7defkiyvn3r44", + "enableAzureMonitorConnector": false, + "azureMonitorLookbackDays": 7 + }, + "connectors": [ + { + "name": "dynatrace", + "properties": { + "dataConnectorType": "Mcp", + "dataSource": "placeholder", + "extendedProperties": 
{ + "type": "http", + "endpoint": "https://dhu66396.apps.dynatrace.com/platform-reserved/mcp-gateway/v0.1/servers/dynatrace-mcp/mcp", + "authType": "BearerToken", + "bearerToken": "${DYNATRACE_BEARER_TOKEN}", + "partnerType": "DynatraceMcp" + }, + "identity": "system" + } + } + ] +} diff --git a/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/data/knowledge/.gitkeep b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/data/knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/data/synthesized-knowledge/.gitkeep b/sreagent-templates/dg-bicep-ps-clone-export.5AGo69/dg-bicep-ps/data/synthesized-knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/.gitignore b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/.gitignore new file mode 100644 index 000000000..f903c363c --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/.gitignore @@ -0,0 +1,5 @@ +# Secrets — never commit +connectors.secrets.env +*.secrets.env +# Generated verification spec +expected-config.json diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/agent.json b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/agent.json new file mode 100644 index 000000000..c4829a108 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/agent.json @@ -0,0 +1,26 @@ +{ + "_description": "SRE Agent configuration — edit these values to clone to a new environment.", + "_exported_at": "2026-05-27T01:50:02Z", + "identity": { + "agentName": "dg-tf-bash", + "resourceGroup": "rg-dg-tf-bash", + "subscription": "cbf44432-7f45-4906-a85d-d2b14a1e8328", + "location": "swedencentral", + "targetResourceGroups": [ + "rg-contoso-prod", + "rg-contoso-staging" + ] + }, + "access": { + "accessLevel": "High", + "actionMode": "Review" + }, + 
"upgradeChannel": "Preview", + "defaultModelProvider": "Anthropic", + "monthlyAgentUnitLimit": 10000, + "tags": {}, + "toggles": { + "enableWebhookBridge": true, + "webhookBridgeTriggerUrl": "" + } +} diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/automations/http-triggers/pr-deployment-guard.yaml b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/automations/http-triggers/pr-deployment-guard.yaml new file mode 100644 index 000000000..3ed61699f --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/automations/http-triggers/pr-deployment-guard.yaml @@ -0,0 +1,7 @@ +name: pr-deployment-guard +spec: + description: Receives PR webhooks from GitHub and triggers the deployment guard + to analyze changes for production safety. + prompt: '' + handlingAgent: '' + agentMode: autonomous diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/common-prompts/investigation-guidelines.md b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/common-prompts/investigation-guidelines.md new file mode 100644 index 000000000..439ccc47e --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/common-prompts/investigation-guidelines.md @@ -0,0 +1,7 @@ +## Investigation guidelines + +- Always check the last 3 deployments for correlation +- Include timestamp, affected resource, and severity in all summaries +- Never take destructive actions without explicit approval +- Prefer read-only investigation before recommending changes +- Always provide an impact assessment (users affected, blast radius) \ No newline at end of file diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/common-prompts/investigation-guidelines.yaml b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/common-prompts/investigation-guidelines.yaml new file mode 100644 index 000000000..d7c1b4b8d --- /dev/null +++ 
b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/common-prompts/investigation-guidelines.yaml @@ -0,0 +1,15 @@ +metadata: + name: investigation-guidelines +spec: + prompt: '## Investigation guidelines + + + - Always check the last 3 deployments for correlation + + - Include timestamp, affected resource, and severity in all summaries + + - Never take destructive actions without explicit approval + + - Prefer read-only investigation before recommending changes + + - Always provide an impact assessment (users affected, blast radius)' diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/common-prompts/safety-rules.md b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/common-prompts/safety-rules.md new file mode 100644 index 000000000..efdb1dec9 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/common-prompts/safety-rules.md @@ -0,0 +1,7 @@ +## Safety rules + +- Never delete resources in production without explicit approval +- Always prefer read-only investigation before taking action +- Escalate to human if confidence is below 80% +- Do not modify network security groups or firewall rules +- Do not access or display secrets, keys, or connection strings \ No newline at end of file diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/common-prompts/safety-rules.yaml b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/common-prompts/safety-rules.yaml new file mode 100644 index 000000000..efa6dd631 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/common-prompts/safety-rules.yaml @@ -0,0 +1,15 @@ +metadata: + name: safety-rules +spec: + prompt: '## Safety rules + + + - Never delete resources in production without explicit approval + + - Always prefer read-only investigation before taking action + + - Escalate to human if confidence is below 80% + + - Do not modify network security 
groups or firewall rules + + - Do not access or display secrets, keys, or connection strings' diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/hooks/deny-prod-deletes.yaml b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/hooks/deny-prod-deletes.yaml new file mode 100644 index 000000000..54c43c90a --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/hooks/deny-prod-deletes.yaml @@ -0,0 +1,12 @@ +name: deny-prod-deletes +type: GlobalHook +tags: [] +properties: + eventType: PreToolUse + activationMode: always + hook: + type: prompt + prompt: If the tool targets a production resource (name contains 'prod' or 'prd'), + deny the action. Otherwise allow. + matcher: ^(delete_|remove_).* + timeout: 30 diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/hooks/require-approval-for-restarts.yaml b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/hooks/require-approval-for-restarts.yaml new file mode 100644 index 000000000..3e18543da --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/hooks/require-approval-for-restarts.yaml @@ -0,0 +1,12 @@ +name: require-approval-for-restarts +type: GlobalHook +tags: [] +properties: + eventType: PreToolUse + activationMode: always + hook: + type: prompt + prompt: If this action will restart or scale a resource, require human approval + before proceeding. + matcher: ^(restart_|scale_).* + timeout: 30 diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/skills/deployment-guard-analysis.md b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/skills/deployment-guard-analysis.md new file mode 100644 index 000000000..a0fc912c4 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/skills/deployment-guard-analysis.md @@ -0,0 +1,23 @@ +You are a deployment guard. 
When triggered by a PR webhook, you assess if the change is safe for production. + +Step 1: Read the PR diff from the connected GitHub repo. Identify what changed — app code, IaC, config, DB schema, dependencies. + +Step 2: Static analysis — check for breaking changes: API contract changes, removed endpoints, changed DB schemas, renamed env vars, missing error handling. + +Step 3: Capture production baseline. Use Dynatrace DQL to query current error rates, latency p50/p95/p99, throughput. Use az CLI to check ContainerAppConsoleLogs_CL in LAW. Also capture baseline API responses by sending test requests to production endpoints and recording the response structure, status codes, and key data fields. + +Step 4: Deploy the PR changes to the STAGING environment using az containerapp update. This is a separate environment from production — deploy the new image there. + +Step 5: Send synthetic test traffic to the staging services to exercise the code paths affected by the PR. Use ExecutePythonCode to send HTTP requests to the staging endpoints (e.g. GET /orders, POST /orders, GET /health) for 2-3 minutes. This is canary testing — you need real traffic to surface regressions like timeouts, 500s, or latency spikes. + +Step 6: Validate response correctness — compare staging API responses against the production baseline captured in Step 3. Look for any differences in response bodies, status codes, data fields, or behavior. The app may return 200 OK but serve degraded or incorrect data. + +Step 7: Monitor staging health for 5 minutes. Query Dynatrace and LAW for the staging services. Compare all metrics and response patterns against the production baseline. Use PlotAreaChartWithCorrelation to visualize. + +Step 8: Risk assessment — LOW (no functional or performance changes), MEDIUM (minor changes), HIGH (behavioral or performance regression detected), CRITICAL (staging failing or data integrity compromised). 
+ +Step 9: Post a structured PR comment with: risk level, changes analyzed, static analysis findings, canary test results, any behavioral regressions found, health comparison table (prod baseline vs staging), and recommendation. + +Tools to use: RunAzCliReadCommands, RunAzCliWriteCommands, ExecutePythonCode, PlotAreaChartWithCorrelation, PlotBarChart, CreateGithubIssue, FindConnectedGitHubRepo, and all dynatrace MCP tools. + +# Updated by e2e test \ No newline at end of file diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/skills/deployment-guard-analysis.yaml b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/skills/deployment-guard-analysis.yaml new file mode 100644 index 000000000..a4a551772 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/skills/deployment-guard-analysis.yaml @@ -0,0 +1,17 @@ +metadata: + name: deployment-guard-analysis + description: Deployment guard that assesses PR safety for production by analyzing + diffs, capturing baselines, deploying to staging, running canary tests, validating + response correctness, and comparing health metrics. + spec: + tools: + - RunAzCliReadCommands + - RunAzCliWriteCommands + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - CreateGithubIssue + - FindConnectedGitHubRepo + - QueryLogAnalyticsByWorkspaceId +skillContent: skills/deployment-guard-analysis.md +additionalFiles: [] diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/skills/investigate-app-errors.md b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/skills/investigate-app-errors.md new file mode 100644 index 000000000..6d2b7b6db --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/skills/investigate-app-errors.md @@ -0,0 +1,20 @@ +You are an application error investigator. When errors are reported, follow this workflow: + +1. 
**Identify the error**: Get the error details — HTTP status codes, exception types, affected endpoints, timestamps. + +2. **Check recent deployments**: Use az CLI to list recent Container App revisions or deployments. Correlate error start time with deployment timestamps. + +3. **Query Dynatrace**: Use DQL to query error rates, response times, and throughput for the affected services. Look for anomalies that started around the same time. + +4. **Query Log Analytics**: Check ContainerAppConsoleLogs_CL and ContainerAppSystemLogs_CL for exceptions, crash loops, or OOM kills. + +5. **Check dependencies**: Query Dynatrace for dependency health — databases, external APIs, message queues. An upstream failure may be the root cause. + +6. **Correlate findings**: Build a timeline of events — deployment, config change, traffic spike, dependency failure — and identify the most likely root cause. + +7. **Recommend fix**: Provide actionable recommendations — rollback, config change, scaling, or code fix with the specific file/line if the GitHub repo is connected. + +Always include: +- Impact assessment (users affected, error rate, duration) +- Root cause confidence level +- Recommended action with rollback option \ No newline at end of file diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/skills/investigate-app-errors.yaml b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/skills/investigate-app-errors.yaml new file mode 100644 index 000000000..669dddcb8 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/skills/investigate-app-errors.yaml @@ -0,0 +1,16 @@ +metadata: + name: investigate-app-errors + description: Investigate application errors using Dynatrace DQL and Log Analytics + to correlate errors with deployments, infrastructure changes, and dependencies. 
+ spec: + tools: + - RunAzCliReadCommands + - QueryLogAnalyticsByWorkspaceId + - dynatrace_execute-dql + - dynatrace_create-dql + - dynatrace_query-problems + - dynatrace_get-entity-id + - ExecutePythonCode + - PlotAreaChartWithCorrelation +skillContent: skills/investigate-app-errors.md +additionalFiles: [] diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/subagents/deployment-guard.instructions.md b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/subagents/deployment-guard.instructions.md new file mode 100644 index 000000000..6906f6466 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/subagents/deployment-guard.instructions.md @@ -0,0 +1 @@ +You are an expert engineer who guards production deployments, operating in autonomous mode. Use the deployment-guard-analysis skill to assess PRs for production safety. Follow the full 9-step workflow: analyze the PR diff, perform static analysis, capture production baselines from Dynatrace and LAW, deploy to staging, send synthetic canary traffic, validate response correctness, monitor staging health, assess risk, and post a structured PR comment with your findings. 
\ No newline at end of file diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/subagents/deployment-guard.yaml b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/subagents/deployment-guard.yaml new file mode 100644 index 000000000..99ead646b --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/subagents/deployment-guard.yaml @@ -0,0 +1,31 @@ +metadata: + name: deployment-guard +spec: + instructions: subagents/deployment-guard.instructions.md + handoffDescription: Analyzes PRs by deploying to staging, comparing health against + production via Dynatrace + LAW, and posting risk assessment as a PR comment + handoffs: [] + tools: + - RunAzCliReadCommands + - RunAzCliWriteCommands + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - CreateGithubIssue + - FindConnectedGitHubRepo + - QueryLogAnalyticsByWorkspaceId + - dynatrace_adaptive-anomaly-detector + - dynatrace_create-dql + - dynatrace_execute-dql + - dynatrace_explain-dql + - dynatrace_get-entity-id + - dynatrace_get-entity-name + - dynatrace_query-problems + - dynatrace_seasonal-baseline-anomaly-detector + - dynatrace_static-threshold-analyzer + - dynatrace_timeseries-forecast + - dynatrace_timeseries-novelty-detection + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/subagents/error-investigator.instructions.md b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/subagents/error-investigator.instructions.md new file mode 100644 index 000000000..cb2eeed54 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/subagents/error-investigator.instructions.md @@ -0,0 +1 @@ +You are an application error investigator. When errors are reported, use the investigate-app-errors skill to systematically diagnose the issue. 
Correlate Dynatrace metrics with Log Analytics data and deployment history. Always provide impact assessment and actionable recommendations with rollback options. \ No newline at end of file diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/subagents/error-investigator.yaml b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/subagents/error-investigator.yaml new file mode 100644 index 000000000..61fb2f3a4 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/config/subagents/error-investigator.yaml @@ -0,0 +1,23 @@ +metadata: + name: error-investigator +spec: + instructions: subagents/error-investigator.instructions.md + handoffDescription: Investigates application errors by correlating Dynatrace metrics, + LAW logs, and deployment history to identify root cause + handoffs: [] + tools: + - RunAzCliReadCommands + - QueryLogAnalyticsByWorkspaceId + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - FindConnectedGitHubRepo + - dynatrace_execute-dql + - dynatrace_create-dql + - dynatrace_query-problems + - dynatrace_get-entity-id + - dynatrace_get-entity-name + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/connectors.json b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/connectors.json new file mode 100644 index 000000000..93f104448 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/connectors.json @@ -0,0 +1,28 @@ +{ + "toggles": { + "enableAppInsightsConnector": false, + "appInsightsResourceId": "", + "appInsightsAppId": "", + "enableLogAnalyticsConnector": true, + "lawResourceId": "/subscriptions/cbf44432-7f45-4906-a85d-d2b14a1e8328/resourceGroups/rg-contoso-swe/providers/Microsoft.OperationalInsights/workspaces/law-7defkiyvn3r44", + "enableAzureMonitorConnector": false, + "azureMonitorLookbackDays": 7 + 
}, + "connectors": [ + { + "name": "dynatrace", + "properties": { + "dataConnectorType": "Mcp", + "dataSource": "placeholder", + "extendedProperties": { + "type": "http", + "endpoint": "https://dhu66396.apps.dynatrace.com/platform-reserved/mcp-gateway/v0.1/servers/dynatrace-mcp/mcp", + "authType": "BearerToken", + "bearerToken": "${DYNATRACE_BEARER_TOKEN}", + "partnerType": "DynatraceMcp" + }, + "identity": "system" + } + } + ] +} diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/data/knowledge/.gitkeep b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/data/knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/data/synthesized-knowledge.json b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/data/synthesized-knowledge.json new file mode 100644 index 000000000..99741c7b1 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/data/synthesized-knowledge.json @@ -0,0 +1,7 @@ +[ + { + "path": "synthesizedKnowledge/.gitkeep", + "size": 0, + "lastModified": "2026-05-27T01:42:22+00:00" + } +] diff --git a/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/data/synthesized-knowledge/.gitkeep b/sreagent-templates/dg-tf-bash-clone-export.ZGMRqN/dg-tf-bash/data/synthesized-knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/.gitignore b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/.gitignore new file mode 100644 index 000000000..f903c363c --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/.gitignore @@ -0,0 +1,5 @@ +# Secrets — never commit +connectors.secrets.env +*.secrets.env +# Generated verification spec +expected-config.json diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/agent.json 
b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/agent.json new file mode 100644 index 000000000..8866af83c --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/agent.json @@ -0,0 +1,26 @@ +{ + "_description": "SRE Agent configuration — edit these values to clone to a new environment.", + "_exported_at": "2026-05-27T01:53:58Z", + "identity": { + "agentName": "dg-tf-bash", + "resourceGroup": "rg-dg-tf-bash", + "subscription": "cbf44432-7f45-4906-a85d-d2b14a1e8328", + "location": "swedencentral", + "targetResourceGroups": [ + "rg-contoso-prod", + "rg-contoso-staging" + ] + }, + "access": { + "accessLevel": "High", + "actionMode": "Review" + }, + "upgradeChannel": "Preview", + "defaultModelProvider": "Anthropic", + "monthlyAgentUnitLimit": 10000, + "tags": {}, + "toggles": { + "enableWebhookBridge": true, + "webhookBridgeTriggerUrl": "" + } +} diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/automations/http-triggers/pr-deployment-guard.yaml b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/automations/http-triggers/pr-deployment-guard.yaml new file mode 100644 index 000000000..3ed61699f --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/automations/http-triggers/pr-deployment-guard.yaml @@ -0,0 +1,7 @@ +name: pr-deployment-guard +spec: + description: Receives PR webhooks from GitHub and triggers the deployment guard + to analyze changes for production safety. 
+ prompt: '' + handlingAgent: '' + agentMode: autonomous diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/common-prompts/investigation-guidelines.md b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/common-prompts/investigation-guidelines.md new file mode 100644 index 000000000..439ccc47e --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/common-prompts/investigation-guidelines.md @@ -0,0 +1,7 @@ +## Investigation guidelines + +- Always check the last 3 deployments for correlation +- Include timestamp, affected resource, and severity in all summaries +- Never take destructive actions without explicit approval +- Prefer read-only investigation before recommending changes +- Always provide an impact assessment (users affected, blast radius) \ No newline at end of file diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/common-prompts/investigation-guidelines.yaml b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/common-prompts/investigation-guidelines.yaml new file mode 100644 index 000000000..d7c1b4b8d --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/common-prompts/investigation-guidelines.yaml @@ -0,0 +1,15 @@ +metadata: + name: investigation-guidelines +spec: + prompt: '## Investigation guidelines + + + - Always check the last 3 deployments for correlation + + - Include timestamp, affected resource, and severity in all summaries + + - Never take destructive actions without explicit approval + + - Prefer read-only investigation before recommending changes + + - Always provide an impact assessment (users affected, blast radius)' diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/common-prompts/safety-rules.md b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/common-prompts/safety-rules.md new file mode 100644 index 000000000..efdb1dec9 --- 
/dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/common-prompts/safety-rules.md @@ -0,0 +1,7 @@ +## Safety rules + +- Never delete resources in production without explicit approval +- Always prefer read-only investigation before taking action +- Escalate to human if confidence is below 80% +- Do not modify network security groups or firewall rules +- Do not access or display secrets, keys, or connection strings \ No newline at end of file diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/common-prompts/safety-rules.yaml b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/common-prompts/safety-rules.yaml new file mode 100644 index 000000000..efa6dd631 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/common-prompts/safety-rules.yaml @@ -0,0 +1,15 @@ +metadata: + name: safety-rules +spec: + prompt: '## Safety rules + + + - Never delete resources in production without explicit approval + + - Always prefer read-only investigation before taking action + + - Escalate to human if confidence is below 80% + + - Do not modify network security groups or firewall rules + + - Do not access or display secrets, keys, or connection strings' diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/hooks/deny-prod-deletes.yaml b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/hooks/deny-prod-deletes.yaml new file mode 100644 index 000000000..54c43c90a --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/hooks/deny-prod-deletes.yaml @@ -0,0 +1,12 @@ +name: deny-prod-deletes +type: GlobalHook +tags: [] +properties: + eventType: PreToolUse + activationMode: always + hook: + type: prompt + prompt: If the tool targets a production resource (name contains 'prod' or 'prd'), + deny the action. Otherwise allow. 
+ matcher: ^(delete_|remove_).* + timeout: 30 diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/hooks/require-approval-for-restarts.yaml b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/hooks/require-approval-for-restarts.yaml new file mode 100644 index 000000000..3e18543da --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/hooks/require-approval-for-restarts.yaml @@ -0,0 +1,12 @@ +name: require-approval-for-restarts +type: GlobalHook +tags: [] +properties: + eventType: PreToolUse + activationMode: always + hook: + type: prompt + prompt: If this action will restart or scale a resource, require human approval + before proceeding. + matcher: ^(restart_|scale_).* + timeout: 30 diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/skills/deployment-guard-analysis.md b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/skills/deployment-guard-analysis.md new file mode 100644 index 000000000..a0fc912c4 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/skills/deployment-guard-analysis.md @@ -0,0 +1,23 @@ +You are a deployment guard. When triggered by a PR webhook, you assess if the change is safe for production. + +Step 1: Read the PR diff from the connected GitHub repo. Identify what changed — app code, IaC, config, DB schema, dependencies. + +Step 2: Static analysis — check for breaking changes: API contract changes, removed endpoints, changed DB schemas, renamed env vars, missing error handling. + +Step 3: Capture production baseline. Use Dynatrace DQL to query current error rates, latency p50/p95/p99, throughput. Use az CLI to check ContainerAppConsoleLogs_CL in LAW. Also capture baseline API responses by sending test requests to production endpoints and recording the response structure, status codes, and key data fields. 
+ +Step 4: Deploy the PR changes to the STAGING environment using az containerapp update. This is a separate environment from production — deploy the new image there. + +Step 5: Send synthetic test traffic to the staging services to exercise the code paths affected by the PR. Use ExecutePythonCode to send HTTP requests to the staging endpoints (e.g. GET /orders, POST /orders, GET /health) for 2-3 minutes. This is canary testing — you need real traffic to surface regressions like timeouts, 500s, or latency spikes. + +Step 6: Validate response correctness — compare staging API responses against the production baseline captured in Step 3. Look for any differences in response bodies, status codes, data fields, or behavior. The app may return 200 OK but serve degraded or incorrect data. + +Step 7: Monitor staging health for 5 minutes. Query Dynatrace and LAW for the staging services. Compare all metrics and response patterns against the production baseline. Use PlotAreaChartWithCorrelation to visualize. + +Step 8: Risk assessment — LOW (no functional or performance changes), MEDIUM (minor changes), HIGH (behavioral or performance regression detected), CRITICAL (staging failing or data integrity compromised). + +Step 9: Post a structured PR comment with: risk level, changes analyzed, static analysis findings, canary test results, any behavioral regressions found, health comparison table (prod baseline vs staging), and recommendation. + +Tools to use: RunAzCliReadCommands, RunAzCliWriteCommands, ExecutePythonCode, PlotAreaChartWithCorrelation, PlotBarChart, CreateGithubIssue, FindConnectedGitHubRepo, and all dynatrace MCP tools. 
+ +# Updated by e2e test \ No newline at end of file diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/skills/deployment-guard-analysis.yaml b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/skills/deployment-guard-analysis.yaml new file mode 100644 index 000000000..a4a551772 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/skills/deployment-guard-analysis.yaml @@ -0,0 +1,17 @@ +metadata: + name: deployment-guard-analysis + description: Deployment guard that assesses PR safety for production by analyzing + diffs, capturing baselines, deploying to staging, running canary tests, validating + response correctness, and comparing health metrics. + spec: + tools: + - RunAzCliReadCommands + - RunAzCliWriteCommands + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - CreateGithubIssue + - FindConnectedGitHubRepo + - QueryLogAnalyticsByWorkspaceId +skillContent: skills/deployment-guard-analysis.md +additionalFiles: [] diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/skills/investigate-app-errors.md b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/skills/investigate-app-errors.md new file mode 100644 index 000000000..6d2b7b6db --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/skills/investigate-app-errors.md @@ -0,0 +1,20 @@ +You are an application error investigator. When errors are reported, follow this workflow: + +1. **Identify the error**: Get the error details — HTTP status codes, exception types, affected endpoints, timestamps. + +2. **Check recent deployments**: Use az CLI to list recent Container App revisions or deployments. Correlate error start time with deployment timestamps. + +3. **Query Dynatrace**: Use DQL to query error rates, response times, and throughput for the affected services. Look for anomalies that started around the same time. + +4. 
**Query Log Analytics**: Check ContainerAppConsoleLogs_CL and ContainerAppSystemLogs_CL for exceptions, crash loops, or OOM kills. + +5. **Check dependencies**: Query Dynatrace for dependency health — databases, external APIs, message queues. An upstream failure may be the root cause. + +6. **Correlate findings**: Build a timeline of events — deployment, config change, traffic spike, dependency failure — and identify the most likely root cause. + +7. **Recommend fix**: Provide actionable recommendations — rollback, config change, scaling, or code fix with the specific file/line if the GitHub repo is connected. + +Always include: +- Impact assessment (users affected, error rate, duration) +- Root cause confidence level +- Recommended action with rollback option \ No newline at end of file diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/skills/investigate-app-errors.yaml b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/skills/investigate-app-errors.yaml new file mode 100644 index 000000000..669dddcb8 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/skills/investigate-app-errors.yaml @@ -0,0 +1,16 @@ +metadata: + name: investigate-app-errors + description: Investigate application errors using Dynatrace DQL and Log Analytics + to correlate errors with deployments, infrastructure changes, and dependencies. 
+ spec: + tools: + - RunAzCliReadCommands + - QueryLogAnalyticsByWorkspaceId + - dynatrace_execute-dql + - dynatrace_create-dql + - dynatrace_query-problems + - dynatrace_get-entity-id + - ExecutePythonCode + - PlotAreaChartWithCorrelation +skillContent: skills/investigate-app-errors.md +additionalFiles: [] diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/subagents/deployment-guard.instructions.md b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/subagents/deployment-guard.instructions.md new file mode 100644 index 000000000..6906f6466 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/subagents/deployment-guard.instructions.md @@ -0,0 +1 @@ +You are the best engineer who guards production deployments operating in autonomous mode. Use the deployment-guard-analysis skill to assess PRs for production safety. Follow the full 9-step workflow: analyze the PR diff, perform static analysis, capture production baselines from Dynatrace and LAW, deploy to staging, send synthetic canary traffic, validate response correctness, monitor staging health, assess risk, and post a structured PR comment with your findings. 
\ No newline at end of file diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/subagents/deployment-guard.yaml b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/subagents/deployment-guard.yaml new file mode 100644 index 000000000..99ead646b --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/subagents/deployment-guard.yaml @@ -0,0 +1,31 @@ +metadata: + name: deployment-guard +spec: + instructions: subagents/deployment-guard.instructions.md + handoffDescription: Analyzes PRs by deploying to staging, comparing health against + production via Dynatrace + LAW, and posting risk assessment as a PR comment + handoffs: [] + tools: + - RunAzCliReadCommands + - RunAzCliWriteCommands + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - CreateGithubIssue + - FindConnectedGitHubRepo + - QueryLogAnalyticsByWorkspaceId + - dynatrace_adaptive-anomaly-detector + - dynatrace_create-dql + - dynatrace_execute-dql + - dynatrace_explain-dql + - dynatrace_get-entity-id + - dynatrace_get-entity-name + - dynatrace_query-problems + - dynatrace_seasonal-baseline-anomaly-detector + - dynatrace_static-threshold-analyzer + - dynatrace_timeseries-forecast + - dynatrace_timeseries-novelty-detection + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/subagents/error-investigator.instructions.md b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/subagents/error-investigator.instructions.md new file mode 100644 index 000000000..cb2eeed54 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/subagents/error-investigator.instructions.md @@ -0,0 +1 @@ +You are an application error investigator. When errors are reported, use the investigate-app-errors skill to systematically diagnose the issue. 
Correlate Dynatrace metrics with Log Analytics data and deployment history. Always provide impact assessment and actionable recommendations with rollback options. \ No newline at end of file diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/subagents/error-investigator.yaml b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/subagents/error-investigator.yaml new file mode 100644 index 000000000..61fb2f3a4 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/config/subagents/error-investigator.yaml @@ -0,0 +1,23 @@ +metadata: + name: error-investigator +spec: + instructions: subagents/error-investigator.instructions.md + handoffDescription: Investigates application errors by correlating Dynatrace metrics, + LAW logs, and deployment history to identify root cause + handoffs: [] + tools: + - RunAzCliReadCommands + - QueryLogAnalyticsByWorkspaceId + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - FindConnectedGitHubRepo + - dynatrace_execute-dql + - dynatrace_create-dql + - dynatrace_query-problems + - dynatrace_get-entity-id + - dynatrace_get-entity-name + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/connectors.json b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/connectors.json new file mode 100644 index 000000000..93f104448 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/connectors.json @@ -0,0 +1,28 @@ +{ + "toggles": { + "enableAppInsightsConnector": false, + "appInsightsResourceId": "", + "appInsightsAppId": "", + "enableLogAnalyticsConnector": true, + "lawResourceId": "/subscriptions/cbf44432-7f45-4906-a85d-d2b14a1e8328/resourceGroups/rg-contoso-swe/providers/Microsoft.OperationalInsights/workspaces/law-7defkiyvn3r44", + "enableAzureMonitorConnector": false, + "azureMonitorLookbackDays": 7 + 
}, + "connectors": [ + { + "name": "dynatrace", + "properties": { + "dataConnectorType": "Mcp", + "dataSource": "placeholder", + "extendedProperties": { + "type": "http", + "endpoint": "https://dhu66396.apps.dynatrace.com/platform-reserved/mcp-gateway/v0.1/servers/dynatrace-mcp/mcp", + "authType": "BearerToken", + "bearerToken": "${DYNATRACE_BEARER_TOKEN}", + "partnerType": "DynatraceMcp" + }, + "identity": "system" + } + } + ] +} diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/data/knowledge/.gitkeep b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/data/knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/data/synthesized-knowledge.json b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/data/synthesized-knowledge.json new file mode 100644 index 000000000..bd9dd5365 --- /dev/null +++ b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/data/synthesized-knowledge.json @@ -0,0 +1,7 @@ +[ + { + "path": "synthesizedKnowledge/.gitkeep", + "size": 0, + "lastModified": "2026-05-27T01:51:16+00:00" + } +] diff --git a/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/data/synthesized-knowledge/.gitkeep b/sreagent-templates/dg-tf-bash-clone-export.sXgdFX/dg-tf-bash/data/synthesized-knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/.gitignore b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/.gitignore new file mode 100644 index 000000000..f903c363c --- /dev/null +++ b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/.gitignore @@ -0,0 +1,5 @@ +# Secrets — never commit +connectors.secrets.env +*.secrets.env +# Generated verification spec +expected-config.json diff --git a/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/agent.json b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/agent.json new file 
mode 100644 index 000000000..4c8aaf9b5 --- /dev/null +++ b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/agent.json @@ -0,0 +1,26 @@ +{ + "_description": "SRE Agent configuration — edit these values to clone to a new environment.", + "_exported_at": "2026-05-27T02:04:47Z", + "identity": { + "agentName": "dg-tf-ps", + "resourceGroup": "rg-dg-tf-ps", + "subscription": "cbf44432-7f45-4906-a85d-d2b14a1e8328", + "location": "swedencentral", + "targetResourceGroups": [ + "rg-contoso-prod", + "rg-contoso-staging" + ] + }, + "access": { + "accessLevel": "High", + "actionMode": "Review" + }, + "upgradeChannel": "Preview", + "defaultModelProvider": "Anthropic", + "monthlyAgentUnitLimit": 10000, + "tags": {}, + "toggles": { + "enableWebhookBridge": false, + "webhookBridgeTriggerUrl": "" + } +} diff --git a/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/automations/http-triggers/pr-deployment-guard.yaml b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/automations/http-triggers/pr-deployment-guard.yaml new file mode 100644 index 000000000..3ed61699f --- /dev/null +++ b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/automations/http-triggers/pr-deployment-guard.yaml @@ -0,0 +1,7 @@ +name: pr-deployment-guard +spec: + description: Receives PR webhooks from GitHub and triggers the deployment guard + to analyze changes for production safety. 
+ prompt: '' + handlingAgent: '' + agentMode: autonomous diff --git a/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/common-prompts/investigation-guidelines.md b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/common-prompts/investigation-guidelines.md new file mode 100644 index 000000000..439ccc47e --- /dev/null +++ b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/common-prompts/investigation-guidelines.md @@ -0,0 +1,7 @@ +## Investigation guidelines + +- Always check the last 3 deployments for correlation +- Include timestamp, affected resource, and severity in all summaries +- Never take destructive actions without explicit approval +- Prefer read-only investigation before recommending changes +- Always provide an impact assessment (users affected, blast radius) \ No newline at end of file diff --git a/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/common-prompts/investigation-guidelines.yaml b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/common-prompts/investigation-guidelines.yaml new file mode 100644 index 000000000..d7c1b4b8d --- /dev/null +++ b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/common-prompts/investigation-guidelines.yaml @@ -0,0 +1,15 @@ +metadata: + name: investigation-guidelines +spec: + prompt: '## Investigation guidelines + + + - Always check the last 3 deployments for correlation + + - Include timestamp, affected resource, and severity in all summaries + + - Never take destructive actions without explicit approval + + - Prefer read-only investigation before recommending changes + + - Always provide an impact assessment (users affected, blast radius)' diff --git a/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/common-prompts/safety-rules.md b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/common-prompts/safety-rules.md new file mode 100644 index 000000000..efdb1dec9 --- /dev/null +++ 
b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/common-prompts/safety-rules.md @@ -0,0 +1,7 @@ +## Safety rules + +- Never delete resources in production without explicit approval +- Always prefer read-only investigation before taking action +- Escalate to human if confidence is below 80% +- Do not modify network security groups or firewall rules +- Do not access or display secrets, keys, or connection strings \ No newline at end of file diff --git a/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/common-prompts/safety-rules.yaml b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/common-prompts/safety-rules.yaml new file mode 100644 index 000000000..efa6dd631 --- /dev/null +++ b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/common-prompts/safety-rules.yaml @@ -0,0 +1,15 @@ +metadata: + name: safety-rules +spec: + prompt: '## Safety rules + + + - Never delete resources in production without explicit approval + + - Always prefer read-only investigation before taking action + + - Escalate to human if confidence is below 80% + + - Do not modify network security groups or firewall rules + + - Do not access or display secrets, keys, or connection strings' diff --git a/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/hooks/deny-prod-deletes.yaml b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/hooks/deny-prod-deletes.yaml new file mode 100644 index 000000000..54c43c90a --- /dev/null +++ b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/hooks/deny-prod-deletes.yaml @@ -0,0 +1,12 @@ +name: deny-prod-deletes +type: GlobalHook +tags: [] +properties: + eventType: PreToolUse + activationMode: always + hook: + type: prompt + prompt: If the tool targets a production resource (name contains 'prod' or 'prd'), + deny the action. Otherwise allow. 
+ matcher: ^(delete_|remove_).* + timeout: 30 diff --git a/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/hooks/require-approval-for-restarts.yaml b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/hooks/require-approval-for-restarts.yaml new file mode 100644 index 000000000..3e18543da --- /dev/null +++ b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/hooks/require-approval-for-restarts.yaml @@ -0,0 +1,12 @@ +name: require-approval-for-restarts +type: GlobalHook +tags: [] +properties: + eventType: PreToolUse + activationMode: always + hook: + type: prompt + prompt: If this action will restart or scale a resource, require human approval + before proceeding. + matcher: ^(restart_|scale_).* + timeout: 30 diff --git a/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/skills/deployment-guard-analysis.yaml b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/skills/deployment-guard-analysis.yaml new file mode 100644 index 000000000..8f2d3a5db --- /dev/null +++ b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/skills/deployment-guard-analysis.yaml @@ -0,0 +1,7 @@ +metadata: + name: deployment-guard-analysis + description: '' + spec: + tools: [] +skillContent: skills/deployment-guard-analysis.md +additionalFiles: [] diff --git a/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/skills/investigate-app-errors.yaml b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/skills/investigate-app-errors.yaml new file mode 100644 index 000000000..d793ddd54 --- /dev/null +++ b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/skills/investigate-app-errors.yaml @@ -0,0 +1,7 @@ +metadata: + name: investigate-app-errors + description: '' + spec: + tools: [] +skillContent: skills/investigate-app-errors.md +additionalFiles: [] diff --git a/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/subagents/deployment-guard.instructions.md 
b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/subagents/deployment-guard.instructions.md new file mode 100644 index 000000000..6906f6466 --- /dev/null +++ b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/subagents/deployment-guard.instructions.md @@ -0,0 +1 @@ +You are the best engineer who guards production deployments operating in autonomous mode. Use the deployment-guard-analysis skill to assess PRs for production safety. Follow the full 9-step workflow: analyze the PR diff, perform static analysis, capture production baselines from Dynatrace and LAW, deploy to staging, send synthetic canary traffic, validate response correctness, monitor staging health, assess risk, and post a structured PR comment with your findings. \ No newline at end of file diff --git a/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/subagents/deployment-guard.yaml b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/subagents/deployment-guard.yaml new file mode 100644 index 000000000..99ead646b --- /dev/null +++ b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/subagents/deployment-guard.yaml @@ -0,0 +1,31 @@ +metadata: + name: deployment-guard +spec: + instructions: subagents/deployment-guard.instructions.md + handoffDescription: Analyzes PRs by deploying to staging, comparing health against + production via Dynatrace + LAW, and posting risk assessment as a PR comment + handoffs: [] + tools: + - RunAzCliReadCommands + - RunAzCliWriteCommands + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - CreateGithubIssue + - FindConnectedGitHubRepo + - QueryLogAnalyticsByWorkspaceId + - dynatrace_adaptive-anomaly-detector + - dynatrace_create-dql + - dynatrace_execute-dql + - dynatrace_explain-dql + - dynatrace_get-entity-id + - dynatrace_get-entity-name + - dynatrace_query-problems + - dynatrace_seasonal-baseline-anomaly-detector + - dynatrace_static-threshold-analyzer + - 
dynatrace_timeseries-forecast + - dynatrace_timeseries-novelty-detection + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/subagents/error-investigator.instructions.md b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/subagents/error-investigator.instructions.md new file mode 100644 index 000000000..cb2eeed54 --- /dev/null +++ b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/subagents/error-investigator.instructions.md @@ -0,0 +1 @@ +You are an application error investigator. When errors are reported, use the investigate-app-errors skill to systematically diagnose the issue. Correlate Dynatrace metrics with Log Analytics data and deployment history. Always provide impact assessment and actionable recommendations with rollback options. \ No newline at end of file diff --git a/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/subagents/error-investigator.yaml b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/subagents/error-investigator.yaml new file mode 100644 index 000000000..61fb2f3a4 --- /dev/null +++ b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/config/subagents/error-investigator.yaml @@ -0,0 +1,23 @@ +metadata: + name: error-investigator +spec: + instructions: subagents/error-investigator.instructions.md + handoffDescription: Investigates application errors by correlating Dynatrace metrics, + LAW logs, and deployment history to identify root cause + handoffs: [] + tools: + - RunAzCliReadCommands + - QueryLogAnalyticsByWorkspaceId + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - FindConnectedGitHubRepo + - dynatrace_execute-dql + - dynatrace_create-dql + - dynatrace_query-problems + - dynatrace_get-entity-id + - dynatrace_get-entity-name + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false 
diff --git a/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/connectors.json b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/connectors.json new file mode 100644 index 000000000..93f104448 --- /dev/null +++ b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/connectors.json @@ -0,0 +1,28 @@ +{ + "toggles": { + "enableAppInsightsConnector": false, + "appInsightsResourceId": "", + "appInsightsAppId": "", + "enableLogAnalyticsConnector": true, + "lawResourceId": "/subscriptions/cbf44432-7f45-4906-a85d-d2b14a1e8328/resourceGroups/rg-contoso-swe/providers/Microsoft.OperationalInsights/workspaces/law-7defkiyvn3r44", + "enableAzureMonitorConnector": false, + "azureMonitorLookbackDays": 7 + }, + "connectors": [ + { + "name": "dynatrace", + "properties": { + "dataConnectorType": "Mcp", + "dataSource": "placeholder", + "extendedProperties": { + "type": "http", + "endpoint": "https://dhu66396.apps.dynatrace.com/platform-reserved/mcp-gateway/v0.1/servers/dynatrace-mcp/mcp", + "authType": "BearerToken", + "bearerToken": "${DYNATRACE_BEARER_TOKEN}", + "partnerType": "DynatraceMcp" + }, + "identity": "system" + } + } + ] +} diff --git a/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/data/knowledge/.gitkeep b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/data/knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/data/synthesized-knowledge/.gitkeep b/sreagent-templates/dg-tf-ps-clone-export.c80sL5/dg-tf-ps/data/synthesized-knowledge/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/sreagent-templates/recipes/dynatrace-servicenow/.gitignore b/sreagent-templates/recipes/dynatrace-servicenow/.gitignore new file mode 100644 index 000000000..c6d45e9af --- /dev/null +++ b/sreagent-templates/recipes/dynatrace-servicenow/.gitignore @@ -0,0 +1 @@ +connectors.secrets.env diff --git a/sreagent-templates/recipes/dynatrace-servicenow/agent.json 
b/sreagent-templates/recipes/dynatrace-servicenow/agent.json new file mode 100644 index 000000000..262902a5a --- /dev/null +++ b/sreagent-templates/recipes/dynatrace-servicenow/agent.json @@ -0,0 +1,95 @@ +{ + "_scenario": "dynatrace-servicenow", + "_description": "SRE Agent with Dynatrace MCP, LAW, GitHub source code, ServiceNow incident response, enterprise knowledge, and tool governance.", + "_prerequisites": [ + "Azure subscription with SRE Agent RP access", + "Dynatrace environment with MCP gateway access and API token", + "ServiceNow instance with API access", + "Log Analytics workspace connected to your app", + "GitHub repo with app source code", + "(Optional) Existing UAMI and App Insights for the agent" + ], + "_prompts": { + "agentName": { + "ask": "Agent name", + "default": "my-sre-agent" + }, + "resourceGroup": { + "ask": "Resource group for the agent", + "default": "sre-agent-rg" + }, + "location": { + "ask": "Region", + "options": ["eastus2", "swedencentral", "uksouth", "australiaeast"], + "required": true + }, + "targetRGs": { + "ask": "Resource groups to monitor (comma-separated)", + "required": true + }, + "dtTenant": { + "ask": "Dynatrace tenant ID (e.g. dhu66396)", + "required": true + }, + "dtToken": { + "ask": "Dynatrace API token (bearer token for MCP gateway)", + "required": true, + "secret": true + }, + "lawId": { + "ask": "Log Analytics workspace resource ID", + "required": true + }, + "githubRepo": { + "ask": "GitHub repo (org/repo format)", + "required": true + }, + "snowInstance": { + "ask": "ServiceNow instance URL (e.g. 
https://dev181595.service-now.com)", + "required": true + }, + "snowUser": { + "ask": "ServiceNow username", + "required": true + }, + "snowPassword": { + "ask": "ServiceNow password", + "required": true, + "secret": true + }, + "existingUamiId": { + "ask": "Existing UAMI resource ID (leave blank to create new)", + "default": "" + }, + "existingAgentAppInsightsId": { + "ask": "Existing App Insights resource ID for agent telemetry (leave blank to create new)", + "default": "" + }, + "modelProvider": { + "ask": "AI model provider", + "options": ["Anthropic", "MicrosoftFoundry"], + "default": "Anthropic" + } + }, + "identity": { + "agentName": "{{agentName}}", + "resourceGroup": "{{resourceGroup}}", + "subscription": "", + "location": "{{location}}", + "targetResourceGroups": "{{targetRGs}}" + }, + "access": { + "accessLevel": "High", + "actionMode": "Review" + }, + "upgradeChannel": "Preview", + "defaultModelProvider": "{{modelProvider}}", + "monthlyAgentUnitLimit": 10000, + "tags": {}, + "toggles": { + "enableWebhookBridge": false, + "webhookBridgeTriggerUrl": "" + }, + "existingUamiId": "{{existingUamiId}}", + "existingAgentAppInsightsId": "{{existingAgentAppInsightsId}}" +} diff --git a/sreagent-templates/recipes/dynatrace-servicenow/automations/incident-filters/snow-p1p2.yaml b/sreagent-templates/recipes/dynatrace-servicenow/automations/incident-filters/snow-p1p2.yaml new file mode 100644 index 000000000..c8589afc6 --- /dev/null +++ b/sreagent-templates/recipes/dynatrace-servicenow/automations/incident-filters/snow-p1p2.yaml @@ -0,0 +1,14 @@ +metadata: + name: snow-p1p2 +spec: + incidentPlatform: ServiceNow + isEnabled: true + priorities: + - "1" + - "2" + handlingAgent: dynatrace-investigator + agentMode: autonomous + deepInvestigationEnabled: false + maxAutomatedInvestigationAttempts: 3 + mergeEnabled: true + mergeWindowHours: 1 diff --git a/sreagent-templates/recipes/dynatrace-servicenow/automations/incident-platforms/servicenow.yaml 
b/sreagent-templates/recipes/dynatrace-servicenow/automations/incident-platforms/servicenow.yaml new file mode 100644 index 000000000..c852b7fb9 --- /dev/null +++ b/sreagent-templates/recipes/dynatrace-servicenow/automations/incident-platforms/servicenow.yaml @@ -0,0 +1,5 @@ +name: servicenow +spec: + platformType: ServiceNow + connectionUrl: "{{snowInstance}}" + connectionKey: '{"endpoint":"{{snowInstance}}","username":"{{snowUser}}","password":"{{snowPassword}}"}' diff --git a/sreagent-templates/recipes/dynatrace-servicenow/config/common-prompts/safety-rules.md b/sreagent-templates/recipes/dynatrace-servicenow/config/common-prompts/safety-rules.md new file mode 100644 index 000000000..dcaa7af5e --- /dev/null +++ b/sreagent-templates/recipes/dynatrace-servicenow/config/common-prompts/safety-rules.md @@ -0,0 +1,5 @@ +## Safety rules + +- Never restart services without paging the on-call. +- Always confirm subscription before destructive ops. +- For any High accessLevel action, require human review even if actionMode=Automatic. diff --git a/sreagent-templates/recipes/dynatrace-servicenow/config/common-prompts/safety-rules.yaml b/sreagent-templates/recipes/dynatrace-servicenow/config/common-prompts/safety-rules.yaml new file mode 100644 index 000000000..76b4fef17 --- /dev/null +++ b/sreagent-templates/recipes/dynatrace-servicenow/config/common-prompts/safety-rules.yaml @@ -0,0 +1,11 @@ +metadata: + name: safety-rules +spec: + prompt: '## Safety rules + + + - Never restart services without paging the on-call. + + - Always confirm subscription before destructive ops. + + - For any High accessLevel action, require human review even if actionMode=Automatic.' 
diff --git a/sreagent-templates/recipes/dynatrace-servicenow/config/hooks/deny-prod-deletes.yaml b/sreagent-templates/recipes/dynatrace-servicenow/config/hooks/deny-prod-deletes.yaml new file mode 100644 index 000000000..00303686a --- /dev/null +++ b/sreagent-templates/recipes/dynatrace-servicenow/config/hooks/deny-prod-deletes.yaml @@ -0,0 +1,11 @@ +metadata: + name: deny-prod-deletes +spec: + eventType: PreToolUse + hook: + type: prompt + prompt: If the tool targets a production resource (name contains 'prod' or 'prd'), + deny. + matcher: ^(delete_|remove_).* + permissionDecision: deny + enabled: true diff --git a/sreagent-templates/recipes/dynatrace-servicenow/config/hooks/require-approval-for-restarts.yaml b/sreagent-templates/recipes/dynatrace-servicenow/config/hooks/require-approval-for-restarts.yaml new file mode 100644 index 000000000..3eae406c9 --- /dev/null +++ b/sreagent-templates/recipes/dynatrace-servicenow/config/hooks/require-approval-for-restarts.yaml @@ -0,0 +1,11 @@ +metadata: + name: require-approval-for-restarts +spec: + eventType: PreToolUse + hook: + type: prompt + prompt: If this action will restart or scale a resource, require human approval + before proceeding. 
+ matcher: ^(restart_|scale_).* + permissionDecision: allow + enabled: true diff --git a/sreagent-templates/recipes/dynatrace-servicenow/config/repos/github-repo.yaml b/sreagent-templates/recipes/dynatrace-servicenow/config/repos/github-repo.yaml new file mode 100644 index 000000000..b29c262d3 --- /dev/null +++ b/sreagent-templates/recipes/dynatrace-servicenow/config/repos/github-repo.yaml @@ -0,0 +1,5 @@ +name: github-repo +spec: + url: "{{githubRepo}}" + branch: main + description: Connected GitHub repository diff --git a/sreagent-templates/recipes/dynatrace-servicenow/config/skills/investigate-app-errors.md b/sreagent-templates/recipes/dynatrace-servicenow/config/skills/investigate-app-errors.md new file mode 100644 index 000000000..1954d2041 --- /dev/null +++ b/sreagent-templates/recipes/dynatrace-servicenow/config/skills/investigate-app-errors.md @@ -0,0 +1,43 @@ +# Investigate Application Errors + +You are investigating an alert triggered by Dynatrace. Follow these steps: + +## Step 1: Understand the alert +Read the alert payload from the HTTP trigger. Identify the affected service, error type (4xx vs 5xx), and time window. + +## Step 2: Confirm the issue is real +Use Dynatrace MCP tools to query traces and error rates for the affected service. Also query LAW (ContainerAppConsoleLogs_CL) and tail Container Apps logs via az CLI. +Check if: +- Error rate is sustained (not a single blip) +- Multiple users/endpoints are affected +- The error started at a specific time (correlates with a deployment or config change) +Plot error rate over time to visualize the pattern. 
+ +## Step 3: Gather evidence from Dynatrace +- Query distributed traces showing the error path (which service fails, what upstream calls it) +- Check service metrics: response time, throughput, error rate +- Look at logs around the error timestamp for stack traces or error messages + +## Step 4: Check for recent changes +- Use az CLI to check recent deployments and activity logs in the target resource group across all dependencies over the last 24 hours +- If a GitHub repo is connected, check recent commits/PRs around the error start time + +## Step 5: Identify root cause from source code +If the error traces point to a specific endpoint or service: +- Look at the source code for that endpoint +- Check for common issues: null references, timeout configs, missing error handling, database query issues + +## Step 6: Suggest mitigation +Based on findings, suggest concrete actions: +- If deployment-related: rollback command +- If config-related: specific setting to change +- If code bug: describe the fix and affected file/line + +## Step 7: Create incident report +Create a GitHub issue with: +- **Summary**: One-line description +- **Impact**: Services affected, error rate, user impact +- **Timeline**: When it started, when detected +- **Evidence**: Charts, trace IDs, log excerpts +- **Root Cause**: What went wrong +- **Mitigation**: Steps taken or recommended diff --git a/sreagent-templates/recipes/dynatrace-servicenow/config/skills/investigate-app-errors.yaml b/sreagent-templates/recipes/dynatrace-servicenow/config/skills/investigate-app-errors.yaml new file mode 100644 index 000000000..d5f6be94c --- /dev/null +++ b/sreagent-templates/recipes/dynatrace-servicenow/config/skills/investigate-app-errors.yaml @@ -0,0 +1,25 @@ +metadata: + name: investigate-app-errors + description: Investigate application errors using Dynatrace telemetry and source + code. 
+ spec: + tools: + - SearchMemory + - RunAzCliReadCommands + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - dynatrace_create-dql + - dynatrace_execute-dql + - dynatrace_explain-dql + - dynatrace_query-problems + - dynatrace_get-problem-by-id + - dynatrace_get-entity-id + - dynatrace_get-entity-name + - dynatrace_find-troubleshooting-guides + - dynatrace_adaptive-anomaly-detector + - dynatrace_seasonal-baseline-anomaly-detector + - dynatrace_static-threshold-analyzer + - dynatrace_timeseries-forecast +skillContent: skills/investigate-app-errors.md +additionalFiles: [] diff --git a/sreagent-templates/recipes/dynatrace-servicenow/config/subagents/dynatrace-investigator.instructions.md b/sreagent-templates/recipes/dynatrace-servicenow/config/subagents/dynatrace-investigator.instructions.md new file mode 100644 index 000000000..dd2c483a5 --- /dev/null +++ b/sreagent-templates/recipes/dynatrace-servicenow/config/subagents/dynatrace-investigator.instructions.md @@ -0,0 +1,26 @@ +You are an expert in triaging and diagnosing incidents. When triggered, search the knowledge base for the relevant runbook, execute the diagnostic steps, collect evidence, and create a GitHub issue with your findings including root cause, evidence, and remediation actions. + +INVESTIGATION STRATEGY: +1. Always search memory first for similar incidents or relevant runbooks +2. Use Dynatrace MCP tools, Azure CLI, and Log Analytics workspace tools to collect telemetry evidence: + - Traces for detailed request flows and error spans + - Logs for error messages and exceptions + - Metrics for performance trends and anomalies + - Service dependencies to identify impacted components +3. Use Azure CLI tools to investigate infrastructure and dependencies over the last 24 hours +4. 
Examine source code for error handling, recent changes, and dependency configurations + +ANALYSIS APPROACH: +- Do a deep, thorough analysis to find the root cause backed by data +- Investigate if anything changed in dependencies (Azure resources, source code, deployments, configuration) +- Correlate error start times with change timestamps +- Use ExecutePythonCode to plot metrics charts when presenting evidence +- Prove root cause with concrete evidence, not speculation + +OUTPUT: +Create a GitHub issue with: +- Summary: What is failing and the impact +- Timeline: When it started and key events +- Evidence: Data from Dynatrace, Azure, logs, metrics with charts where helpful +- Root Cause: The proven cause backed by data +- Remediation: Specific steps to resolve the issue diff --git a/sreagent-templates/recipes/dynatrace-servicenow/config/subagents/dynatrace-investigator.yaml b/sreagent-templates/recipes/dynatrace-servicenow/config/subagents/dynatrace-investigator.yaml new file mode 100644 index 000000000..f0f34b988 --- /dev/null +++ b/sreagent-templates/recipes/dynatrace-servicenow/config/subagents/dynatrace-investigator.yaml @@ -0,0 +1,35 @@ +metadata: + name: dynatrace-investigator +spec: + instructions: subagents/dynatrace-investigator.instructions.md + handoffDescription: Investigates Dynatrace alerts using distributed traces, metrics, + logs, and source code analysis. 
+ tools: + - SearchMemory + - RunAzCliReadCommands + - RunAzCliWriteCommands + - ExecutePythonCode + - CreateGithubIssue + - FetchGithubIssues + - FindConnectedGitHubRepo + - PlotAreaChartWithCorrelation + - PlotBarChart + - PlotHeatmap + - dynatrace_create-dql + - dynatrace_execute-dql + - dynatrace_explain-dql + - dynatrace_query-problems + - dynatrace_get-problem-by-id + - dynatrace_get-entity-id + - dynatrace_get-entity-name + - dynatrace_find-troubleshooting-guides + - dynatrace_adaptive-anomaly-detector + - dynatrace_seasonal-baseline-anomaly-detector + - dynatrace_static-threshold-analyzer + - dynatrace_timeseries-forecast + agentType: Autonomous + temperature: 0.2 + handoffs: [] + enableSkills: true + allowedSkills: + - investigate-app-errors diff --git a/sreagent-templates/recipes/dynatrace-servicenow/connectors.json b/sreagent-templates/recipes/dynatrace-servicenow/connectors.json new file mode 100644 index 000000000..0818d8bed --- /dev/null +++ b/sreagent-templates/recipes/dynatrace-servicenow/connectors.json @@ -0,0 +1,30 @@ +{ + "toggles": { + "enableAppInsightsConnector": false, + "appInsightsResourceId": "", + "appInsightsAppId": "", + "enableLogAnalyticsConnector": "{{lawId:bool}}", + "lawResourceId": "{{lawId}}", + "enableAzureMonitorConnector": false, + "azureMonitorLookbackDays": 7, + "grafanaUrl": "", + "grafanaApiKey": "" + }, + "connectors": [ + { + "name": "dynatrace", + "properties": { + "dataConnectorType": "Mcp", + "dataSource": "placeholder", + "extendedProperties": { + "type": "http", + "endpoint": "https://{{dtTenant}}.apps.dynatrace.com/platform-reserved/mcp-gateway/v0.1/servers/dynatrace-mcp/mcp", + "authType": "BearerToken", + "bearerToken": "${DYNATRACE_BEARER_TOKEN}", + "partnerType": "DynatraceMcp" + }, + "identity": "system" + } + } + ] +} diff --git a/sreagent-templates/recipes/dynatrace-servicenow/data/enterprise-architecture.md b/sreagent-templates/recipes/dynatrace-servicenow/data/enterprise-architecture.md new file mode 
100644 index 000000000..27600b4e8 --- /dev/null +++ b/sreagent-templates/recipes/dynatrace-servicenow/data/enterprise-architecture.md @@ -0,0 +1,76 @@ +# Enterprise App Architecture Reference + +## Overview + +This document describes the enterprise demo application: a Node.js API backed by PostgreSQL, running on a private AKS cluster with all monitoring routed through Azure Monitor Private Link Scope (AMPLS). + +## Components + +### Application Tier — AKS (Private Cluster) +- Private AKS cluster — API server has no public endpoint +- Node.js Express API serving /api/orders, /api/products, /api/health +- Application Insights SDK integrated for request telemetry +- All kubectl operations require `az aks command invoke` (ARM proxy) +- Namespace: default + +### Data Tier — PostgreSQL Flexible Server +- PostgreSQL 16, VNet-delegated subnet (no public access) +- Database: enterprise_db +- Tables: products, orders +- Entra auth or password auth depending on deployment +- Connection from app pods goes through VNet-internal routing + +### Networking +- **VNet**: Single VNet with three subnets + - `aks-subnet` (10.0.0.0/16) — AKS nodes and pods + - `db-subnet` (10.1.0.0/24) — PostgreSQL (delegated) + - `ampls-subnet` (10.2.0.0/24) — Private endpoint for AMPLS +- **NSG**: On AKS subnet, allow-all-outbound baseline +- **Private DNS Zones**: PostgreSQL + Azure Monitor (monitor, oms, ods, agentsvc) + +### Monitoring — AMPLS (Azure Monitor Private Link Scope) +- **App Insights**: Ingestion and query both set to PrivateOnly +- **Log Analytics**: Ingestion and query both set to PrivateOnly +- **AMPLS**: Links both AI and LAW, with a private endpoint in ampls-subnet +- **Private DNS Zones**: Four zones (privatelink.monitor.azure.com, privatelink.oms.opinsights.azure.com, privatelink.ods.opinsights.azure.com, privatelink.agentsvc.azure-automation.net) linked to the VNet +- All telemetry from AKS pods flows through the private endpoint — no public ingestion +- Alert rule: 
`app-5xx-errors` fires on requests/failed > 5 (auto-mitigate enabled) + +### Incident Management — ServiceNow +- P1/P2 incidents from ServiceNow route to the azure-monitor-investigator subagent +- Agent operates in Autonomous mode with merge window of 1 hour +- Hooks require approval for writes and block deletes + +## Request Flow + +``` +Client → AKS Ingress → enterprise-api pod → PostgreSQL (private) + ↓ + App Insights SDK + ↓ + AMPLS Private Endpoint + ↓ + Log Analytics Workspace +``` + +## Failure Modes + +| Failure | Symptom | How to detect | How to fix | +|---|---|---|---| +| PostgreSQL stopped | 500 on /api/orders, /api/health returns unhealthy | `az postgres flexible-server show --query state` | `az postgres flexible-server start` | +| PostgreSQL connection refused | 500 with ECONNREFUSED | App Insights dependency failures | Check NSG rules, VNet delegation | +| Pod crash loop | 502 from ingress | `az aks command invoke -- kubectl get pods` | Check logs, restart deployment | +| Slow queries | High latency on /api/products | App Insights request duration | Check missing indexes, ANALYZE | +| AMPLS misconfigured | No telemetry flowing | Check AI ingestion in portal | Verify private endpoint + DNS zones | + +## Access Patterns for the SRE Agent + +The agent is outside the VNet. 
All access is through the Azure control plane: + +| Resource | Read | Remediate | +|---|---|---| +| AKS | `az aks command invoke -- kubectl get/logs/describe` | `az aks command invoke -- kubectl delete/rollout restart` | +| PostgreSQL | `az postgres flexible-server show/list` | `az postgres flexible-server start/restart` | +| App Insights | Built-in connector (private query via AMPLS) | N/A (read-only) | +| Log Analytics | Built-in connector (private query via AMPLS) | N/A (read-only) | +| Activity Logs | KQL via LAW | N/A (read-only) | diff --git a/sreagent-templates/recipes/dynatrace-servicenow/data/enterprise-runbook.md b/sreagent-templates/recipes/dynatrace-servicenow/data/enterprise-runbook.md new file mode 100644 index 000000000..9c8f46df5 --- /dev/null +++ b/sreagent-templates/recipes/dynatrace-servicenow/data/enterprise-runbook.md @@ -0,0 +1,50 @@ +# Enterprise Runbook — Permitted Mitigations + +## Autonomous Actions (no human approval needed) + +These actions are pre-approved per organization policy. 
The agent can execute them immediately during an incident: + +### Database +- `az postgres flexible-server start` — restart a stopped PostgreSQL server +- `az postgres flexible-server restart` — restart a running PostgreSQL server +- `az postgres flexible-server parameter set` — change server parameters + +### Kubernetes (via az aks command invoke) +- `kubectl rollout restart deployment/` — restart pods +- `kubectl describe pod/` — diagnose pod issues +- `kubectl logs ` — read pod logs +- `kubectl get events` — check cluster events + +### Container Apps +- `az containerapp update` — update configuration (env vars, scaling) +- `az containerapp revision copy` — create new revision +- `az containerapp revision activate` — activate a revision + +## Require Human Approval (Ask hook) + +These actions are blocked until a human approves: + +### All Write Operations +- Any `RunAzCliWriteCommands` call triggers the approval hook +- The agent shows what it wants to do and waits for "yes" +- Reads (list, show, get, describe, logs, query) pass through without approval + +## Denied (blocked globally) + +These actions are never allowed: + +- `kubectl delete` anything +- Delete any Azure resource +- Scale infrastructure (node pools, VM sizes) +- IAM / role assignment changes +- Schema migrations or data modifications on PostgreSQL +- Network security group or firewall rule changes +- Access or display secrets, keys, or connection strings + +## Verification After Remediation + +After every remediation action: +1. Wait 2-3 minutes for the change to take effect +2. Re-check the affected endpoint (health check, Azure Monitor query) +3. Confirm error rate is back to baseline +4. 
Mark the incident as resolved if confirmed healthy diff --git a/sreagent-templates/recipes/dynatrace-servicenow/data/incident-report-template.md b/sreagent-templates/recipes/dynatrace-servicenow/data/incident-report-template.md new file mode 100644 index 000000000..4554f1dfe --- /dev/null +++ b/sreagent-templates/recipes/dynatrace-servicenow/data/incident-report-template.md @@ -0,0 +1,79 @@ +# Incident Report Template + +## Summary + + +**Incident ID**: [ServiceNow / Azure Monitor Alert ID] +**Severity**: [P1 / P2 / Sev0-4] +**Duration**: [Start time] - [End time] ([total duration]) +**Status**: [Resolved / Mitigated / Ongoing] + +--- + +## Impact + +- **Users Affected**: [Number or percentage] +- **Services Affected**: [List endpoints/services] +- **Data Loss**: [Yes/No] +- **SLA Impact**: [Uptime SLA breached?] + +--- + +## Timeline + +| Time (UTC) | Event | +|---|---| +| HH:MM | Alert triggered | +| HH:MM | SRE Agent investigation started | +| HH:MM | Root cause identified | +| HH:MM | Remediation executed (with approval) | +| HH:MM | Service restored | +| HH:MM | Verification confirmed recovery | +| HH:MM | Incident closed | + +--- + +## Evidence + +### Azure Monitor + + +### App Insights / Log Analytics + + +### Activity Log + + +### GitHub + + +--- + +## Root Cause + + + +**Category**: [Database / Network / Deployment / Configuration / Code Bug / Capacity] + +--- + +## Remediation + +### Actions Taken by SRE Agent +1. [Automated action with hook approval reference] +2. 
[Verification step] + +### Follow-Up Actions +| Action | Owner | Due Date | Status | +|---|---|---|---| +| [Fix description] | [Team] | [Date] | [Open/Done] | + +--- + +## Compliance Check + + +- Deployment method: [CI/CD Pipeline / Portal / CLI] +- Compliance status: [COMPLIANT / NON-COMPLIANT] +- Caller identity: [Service Principal / User Principal] +- Image labels verified: [Yes/No] diff --git a/sreagent-templates/recipes/dynatrace-servicenow/tool-permissions.json b/sreagent-templates/recipes/dynatrace-servicenow/tool-permissions.json new file mode 100644 index 000000000..e9cd3114c --- /dev/null +++ b/sreagent-templates/recipes/dynatrace-servicenow/tool-permissions.json @@ -0,0 +1,13 @@ +{ + "allow": [ + "RunInTerminal", + "ExecutePythonCode" + ], + "ask": [ + "RunAzCliWriteCommands" + ], + "deny": [ + "FetchWebpage", + "SearchWebpage" + ] +} diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/.gitignore b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/.gitignore new file mode 100644 index 000000000..ba950ef32 --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/.gitignore @@ -0,0 +1,3 @@ +connectors.secrets.env +*.secrets.env +data/ diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/README.md b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/README.md new file mode 100644 index 000000000..bfbb8283c --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/README.md @@ -0,0 +1,124 @@ +# law-dynatrace-github-httptrigger-prvalidation + +PR Deployment Guard: an SRE Agent that reviews every pull request by deploying changes to staging, running canary tests against production baselines via Log Analytics + Dynatrace, and posting a risk assessment as a PR comment — before the code is merged. 
+ +The agent receives PR events from GitHub via an HTTP trigger (Logic App webhook bridge), analyzes the diff, deploys to staging, sends synthetic traffic, compares health metrics, and comments on the PR with a LOW / MEDIUM / HIGH / CRITICAL risk rating. + +## Prerequisites + +- Azure subscription with **production** and **staging** resource groups +- Log Analytics workspace connected to your Container Apps / App Services +- Dynatrace environment with MCP gateway access and API token +- GitHub repo with app source code +- All [CLI tools](../../README.md#prerequisites) installed (`./bin/install-prerequisites.sh --check`) + +## Quick Start + +### Step 1 — Generate agent config + +```bash +./bin/new-agent.sh --recipe law-dynatrace-github-httptrigger-prvalidation --non-interactive \ + --set agentName=contoso-sre \ + --set resourceGroup=rg-sre-contoso \ + --set location=eastus2 \ + --set lawId=/subscriptions//resourceGroups//providers/Microsoft.OperationalInsights/workspaces/ \ + --set dtTenant=abc12345 \ + --set dtToken=dt0c01.xxx \ + --set githubRepo=contoso/trading-app \ + --set targetRGs=rg-contoso-prod,rg-contoso-staging \ + -o contoso-sre/ +``` + +### Step 2 — Deploy + +| Backend | Command | +|---|---| +| Bicep | `./bin/deploy.sh contoso-sre/` | +| Terraform | `./bin/deploy-tf.sh contoso-sre/` | +| PowerShell | `./bin/ps/Deploy-Agent.ps1 -InputPath contoso-sre/` | + +### Step 3 — Set up the Dynatrace secret + +```bash +echo "DYNATRACE_BEARER_TOKEN=dt0c01.your-token-here" > contoso-sre/connectors.secrets.env +``` + +Then redeploy or run `./bin/deploy.sh contoso-sre/` to apply. 
+ +### Step 4 — Wire up GitHub PR workflow + +Copy the sample workflow to your app repo: + +```bash +cp contoso-sre/sample-github-workflow.yml \ + /path/to/your-app/.github/workflows/sre-agent-pr-guard.yml +``` + +Add the webhook URL as a GitHub secret: + +```bash +# Get the Logic App trigger URL from the agent's webhook bridge +WEBHOOK_URL=$(az resource show \ + --resource-group rg-sre-contoso \ + --resource-type Microsoft.Logic/workflows \ + --name \ + --query "properties.accessEndpoint" -o tsv) + +gh secret set SRE_AGENT_WEBHOOK_URL --repo contoso/trading-app --body "$WEBHOOK_URL" +``` + +### Step 5 — Test it + +Open a PR on your app repo — the GitHub workflow sends the PR event to the agent, which triggers the deployment guard. The agent will: + +1. Read the PR diff +2. Capture production baseline metrics from Dynatrace + LAW +3. Deploy changes to staging +4. Send synthetic canary traffic +5. Compare staging health against production +6. Post a risk assessment comment on the PR + +## Parameters + +| Param | Required | Example | Description | +|---|---|---|---| +| agentName | ✅ | `contoso-sre` | Agent name (lowercase, hyphens) | +| resourceGroup | ✅ | `rg-sre-contoso` | Resource group for the agent | +| location | ✅ | `eastus2` | Azure region | +| targetRGs | ✅ | `rg-contoso-prod,rg-contoso-staging` | Resource groups the agent monitors | +| lawId | ✅ | `/subscriptions/.../workspaces/...` | Log Analytics workspace resource ID | +| dtTenant | ✅ | `abc12345` | Dynatrace tenant ID | +| dtToken | ✅ | `dt0c01.xxx` | Dynatrace API token (stored as secret) | +| githubRepo | ✅ | `contoso/trading-app` | GitHub org/repo | +| modelProvider | | `Anthropic` | AI model provider (Anthropic or Azure OpenAI) | + +## What You Get + +| Category | Items | +|---|---| +| **Connectors** | Log Analytics, Dynatrace MCP | +| **Skills** | deployment-guard-analysis, investigate-app-errors | +| **Subagents** | deployment-guard, error-investigator | +| **HTTP Trigger** | pr-deployment-guard 
(receives GitHub PR webhooks) | +| **Hooks** | deny-prod-deletes, require-approval-for-restarts | +| **Common Prompts** | investigation-guidelines, safety-rules | +| **GitHub Repo** | Connected for diff analysis and PR comments | + +## Architecture + +``` +GitHub PR → GitHub Actions workflow → Logic App webhook bridge → SRE Agent HTTP trigger + ↓ + deployment-guard subagent + ↓ + ┌───────────────┼───────────────┐ + ↓ ↓ ↓ + Read PR diff Deploy to staging Query Dynatrace + ↓ + LAW baselines + Canary traffic + ↓ + Compare health + ↓ + Post PR comment with + risk assessment +``` diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/agent.json b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/agent.json new file mode 100644 index 000000000..8ac03742e --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/agent.json @@ -0,0 +1,89 @@ +{ + "_scenario": "law-dynatrace-github-httptrigger-prvalidation", + "_description": "PR Deployment Guard: SRE Agent that reviews every PR by deploying to staging, comparing health against production via LAW + Dynatrace, and posting risk assessments as PR comments.", + "_prerequisites": [ + "Azure subscription with production and staging resource groups", + "Log Analytics workspace connected to your Container Apps or App Services", + "Dynatrace environment with MCP gateway access and API token", + "GitHub repo with app source code", + "GitHub Actions or equivalent CI/CD to send PR webhooks" + ], + "_prompts": { + "agentName": { + "ask": "Agent name (lowercase, hyphens ok)", + "default": "my-sre-agent" + }, + "resourceGroup": { + "ask": "Resource group for the agent", + "default": "sre-agent-rg" + }, + "location": { + "ask": "Region", + "options": [ + "eastus2", + "swedencentral", + "uksouth", + "australiaeast" + ], + "required": true + }, + "targetRGs": { + "ask": "Resource groups the agent can access (comma-separated). 
Include your app's prod and staging RGs \u2014 the agent needs these to deploy to staging and read container app config. The LAW/AppInsights RG is NOT needed here if you provided the full resource ID above.", + "required": true + }, + "lawId": { + "ask": "Log Analytics workspace resource ID", + "required": true + }, + "dtTenant": { + "ask": "Dynatrace tenant ID (e.g. abc12345)", + "required": true + }, + "dtToken": { + "ask": "Dynatrace API token", + "required": true, + "secret": true + }, + "githubRepo": { + "ask": "GitHub repo (org/repo format, e.g. contoso/trading-app)", + "required": true + }, + "existingUamiId": { + "ask": "Existing UAMI resource ID (leave blank to create new)", + "default": "" + }, + "modelProvider": { + "ask": "AI model provider", + "options": [ + "Anthropic", + "Azure OpenAI" + ], + "default": "Anthropic" + }, + "existingAgentAppInsightsId": { + "ask": "Existing App Insights resource ID for agent telemetry (leave blank to create new)", + "default": "" + } + }, + "identity": { + "agentName": "{{agentName}}", + "resourceGroup": "{{resourceGroup}}", + "subscription": "", + "location": "{{location}}", + "targetResourceGroups": "{{targetRGs}}" + }, + "access": { + "accessLevel": "High", + "actionMode": "Review" + }, + "upgradeChannel": "Preview", + "defaultModelProvider": "{{modelProvider}}", + "monthlyAgentUnitLimit": 10000, + "tags": {}, + "toggles": { + "enableWebhookBridge": true, + "webhookBridgeTriggerUrl": "" + }, + "existingUamiId": "{{existingUamiId}}", + "existingAgentAppInsightsId": "{{existingAgentAppInsightsId}}" +} \ No newline at end of file diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/automations/http-triggers/pr-deployment-guard.yaml b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/automations/http-triggers/pr-deployment-guard.yaml new file mode 100644 index 000000000..0e652635d --- /dev/null +++ 
b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/automations/http-triggers/pr-deployment-guard.yaml @@ -0,0 +1,6 @@ +name: pr-deployment-guard +spec: + description: Receives PR webhooks from GitHub and triggers the deployment guard to analyze changes for production safety. + prompt: A PR webhook has been received from the connected GitHub repo. Follow the deployment-guard-analysis skill. + handlingAgent: deployment-guard + agentMode: autonomous diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/common-prompts/investigation-guidelines.yaml b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/common-prompts/investigation-guidelines.yaml new file mode 100644 index 000000000..d7c1b4b8d --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/common-prompts/investigation-guidelines.yaml @@ -0,0 +1,15 @@ +metadata: + name: investigation-guidelines +spec: + prompt: '## Investigation guidelines + + + - Always check the last 3 deployments for correlation + + - Include timestamp, affected resource, and severity in all summaries + + - Never take destructive actions without explicit approval + + - Prefer read-only investigation before recommending changes + + - Always provide an impact assessment (users affected, blast radius)' diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/common-prompts/safety-rules.yaml b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/common-prompts/safety-rules.yaml new file mode 100644 index 000000000..efa6dd631 --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/common-prompts/safety-rules.yaml @@ -0,0 +1,15 @@ +metadata: + name: safety-rules +spec: + prompt: '## Safety rules + + + - Never delete resources in production without explicit approval + + - Always prefer read-only investigation 
before taking action + + - Escalate to human if confidence is below 80% + + - Do not modify network security groups or firewall rules + + - Do not access or display secrets, keys, or connection strings' diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/hooks/deny-prod-deletes.yaml b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/hooks/deny-prod-deletes.yaml new file mode 100644 index 000000000..4545f0aae --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/hooks/deny-prod-deletes.yaml @@ -0,0 +1,11 @@ +metadata: + name: deny-prod-deletes +spec: + eventType: PreToolUse + hook: + type: prompt + prompt: If the tool targets a production resource (name contains 'prod' or 'prd'), + deny the action. Otherwise allow. + matcher: ^(delete_|remove_).* + permissionDecision: deny + enabled: true diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/hooks/require-approval-for-restarts.yaml b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/hooks/require-approval-for-restarts.yaml new file mode 100644 index 000000000..3eae406c9 --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/hooks/require-approval-for-restarts.yaml @@ -0,0 +1,11 @@ +metadata: + name: require-approval-for-restarts +spec: + eventType: PreToolUse + hook: + type: prompt + prompt: If this action will restart or scale a resource, require human approval + before proceeding. 
+ matcher: ^(restart_|scale_).* + permissionDecision: allow + enabled: true diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/repos/github-repo.yaml b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/repos/github-repo.yaml new file mode 100644 index 000000000..b29c262d3 --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/repos/github-repo.yaml @@ -0,0 +1,5 @@ +name: github-repo +spec: + url: "{{githubRepo}}" + branch: main + description: Connected GitHub repository diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/skills/deployment-guard-analysis.md b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/skills/deployment-guard-analysis.md new file mode 100644 index 000000000..9fd80e96a --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/skills/deployment-guard-analysis.md @@ -0,0 +1,21 @@ +You are a deployment guard. When triggered by a PR webhook, you assess if the change is safe for production. + +Step 1: Read the PR diff from the connected GitHub repo. Identify what changed — app code, IaC, config, DB schema, dependencies. + +Step 2: Static analysis — check for breaking changes: API contract changes, removed endpoints, changed DB schemas, renamed env vars, missing error handling. + +Step 3: Capture production baseline. Use Dynatrace DQL to query current error rates, latency p50/p95/p99, throughput. Use az CLI to check ContainerAppConsoleLogs_CL in LAW. Also capture baseline API responses by sending test requests to production endpoints and recording the response structure, status codes, and key data fields. + +Step 4: Deploy the PR changes to the STAGING environment using az containerapp update. This is a separate environment from production — deploy the new image there. 
+ +Step 5: Send synthetic test traffic to the staging services to exercise the code paths affected by the PR. Use ExecutePythonCode to send HTTP requests to the staging endpoints (e.g. GET /orders, POST /orders, GET /health) for 2-3 minutes. This is canary testing — you need real traffic to surface regressions like timeouts, 500s, or latency spikes. + +Step 6: Validate response correctness — compare staging API responses against the production baseline captured in Step 3. Look for any differences in response bodies, status codes, data fields, or behavior. The app may return 200 OK but serve degraded or incorrect data. + +Step 7: Monitor staging health for 5 minutes. Query Dynatrace and LAW for the staging services. Compare all metrics and response patterns against the production baseline. Use PlotAreaChartWithCorrelation to visualize. + +Step 8: Risk assessment — LOW (no functional or performance changes), MEDIUM (minor changes), HIGH (behavioral or performance regression detected), CRITICAL (staging failing or data integrity compromised). + +Step 9: Post a structured PR comment with: risk level, changes analyzed, static analysis findings, canary test results, any behavioral regressions found, health comparison table (prod baseline vs staging), and recommendation. + +Tools to use: RunAzCliReadCommands, RunAzCliWriteCommands, ExecutePythonCode, PlotAreaChartWithCorrelation, PlotBarChart, CreateGithubIssue, FindConnectedGitHubRepo, and all dynatrace MCP tools. 
diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/skills/deployment-guard-analysis.yaml b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/skills/deployment-guard-analysis.yaml new file mode 100644 index 000000000..c2db2ab13 --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/skills/deployment-guard-analysis.yaml @@ -0,0 +1,28 @@ +metadata: + name: deployment-guard-analysis + description: Deployment guard that assesses PR safety for production by analyzing + diffs, capturing baselines, deploying to staging, running canary tests, validating + response correctness, and comparing health metrics. + spec: + tools: + - RunAzCliReadCommands + - RunAzCliWriteCommands + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - CreateGithubIssue + - FindConnectedGitHubRepo + - QueryLogAnalyticsByWorkspaceId + - dynatrace_adaptive-anomaly-detector + - dynatrace_create-dql + - dynatrace_execute-dql + - dynatrace_explain-dql + - dynatrace_get-entity-id + - dynatrace_get-entity-name + - dynatrace_query-problems + - dynatrace_seasonal-baseline-anomaly-detector + - dynatrace_static-threshold-analyzer + - dynatrace_timeseries-forecast + - dynatrace_timeseries-novelty-detection +skillContent: skills/deployment-guard-analysis.md +additionalFiles: [] diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/skills/investigate-app-errors.md b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/skills/investigate-app-errors.md new file mode 100644 index 000000000..508a81608 --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/skills/investigate-app-errors.md @@ -0,0 +1,20 @@ +You are an application error investigator. When errors are reported, follow this workflow: + +1. 
**Identify the error**: Get the error details — HTTP status codes, exception types, affected endpoints, timestamps. + +2. **Check recent deployments**: Use az CLI to list recent Container App revisions or deployments. Correlate error start time with deployment timestamps. + +3. **Query Dynatrace**: Use DQL to query error rates, response times, and throughput for the affected services. Look for anomalies that started around the same time. + +4. **Query Log Analytics**: Check ContainerAppConsoleLogs_CL and ContainerAppSystemLogs_CL for exceptions, crash loops, or OOM kills. + +5. **Check dependencies**: Query Dynatrace for dependency health — databases, external APIs, message queues. An upstream failure may be the root cause. + +6. **Correlate findings**: Build a timeline of events — deployment, config change, traffic spike, dependency failure — and identify the most likely root cause. + +7. **Recommend fix**: Provide actionable recommendations — rollback, config change, scaling, or code fix with the specific file/line if the GitHub repo is connected. + +Always include: +- Impact assessment (users affected, error rate, duration) +- Root cause confidence level +- Recommended action with rollback option diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/skills/investigate-app-errors.yaml b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/skills/investigate-app-errors.yaml new file mode 100644 index 000000000..669dddcb8 --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/skills/investigate-app-errors.yaml @@ -0,0 +1,16 @@ +metadata: + name: investigate-app-errors + description: Investigate application errors using Dynatrace DQL and Log Analytics + to correlate errors with deployments, infrastructure changes, and dependencies. 
+ spec: + tools: + - RunAzCliReadCommands + - QueryLogAnalyticsByWorkspaceId + - dynatrace_execute-dql + - dynatrace_create-dql + - dynatrace_query-problems + - dynatrace_get-entity-id + - ExecutePythonCode + - PlotAreaChartWithCorrelation +skillContent: skills/investigate-app-errors.md +additionalFiles: [] diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/subagents/deployment-guard.instructions.md b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/subagents/deployment-guard.instructions.md new file mode 100644 index 000000000..28019290a --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/subagents/deployment-guard.instructions.md @@ -0,0 +1 @@ +You are the best engineer who guards production deployments operating in autonomous mode. Use the deployment-guard-analysis skill to assess PRs for production safety. Follow the full 9-step workflow: analyze the PR diff, perform static analysis, capture production baselines from Dynatrace and LAW, deploy to staging, send synthetic canary traffic, validate response correctness, monitor staging health, assess risk, and post a structured PR comment with your findings. 
diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/subagents/deployment-guard.yaml b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/subagents/deployment-guard.yaml new file mode 100644 index 000000000..b5189a30f --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/subagents/deployment-guard.yaml @@ -0,0 +1,34 @@ +metadata: + name: deployment-guard +spec: + instructions: subagents/deployment-guard.instructions.md + handoffDescription: Analyzes PRs by deploying to staging, comparing health against + production via Dynatrace + LAW, and posting risk assessment as a PR comment + handoffs: [] + allowedSkills: + - deployment-guard-analysis + - investigate-app-errors + tools: + - RunAzCliReadCommands + - RunAzCliWriteCommands + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - CreateGithubIssue + - FindConnectedGitHubRepo + - QueryLogAnalyticsByWorkspaceId + - dynatrace_adaptive-anomaly-detector + - dynatrace_create-dql + - dynatrace_execute-dql + - dynatrace_explain-dql + - dynatrace_get-entity-id + - dynatrace_get-entity-name + - dynatrace_query-problems + - dynatrace_seasonal-baseline-anomaly-detector + - dynatrace_static-threshold-analyzer + - dynatrace_timeseries-forecast + - dynatrace_timeseries-novelty-detection + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/subagents/error-investigator.instructions.md b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/subagents/error-investigator.instructions.md new file mode 100644 index 000000000..41b3c7a46 --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/subagents/error-investigator.instructions.md @@ -0,0 +1 @@ +You are an application error investigator. 
When errors are reported, use the investigate-app-errors skill to systematically diagnose the issue. Correlate Dynatrace metrics with Log Analytics data and deployment history. Always provide impact assessment and actionable recommendations with rollback options. diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/subagents/error-investigator.yaml b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/subagents/error-investigator.yaml new file mode 100644 index 000000000..ce2bb1300 --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/config/subagents/error-investigator.yaml @@ -0,0 +1,25 @@ +metadata: + name: error-investigator +spec: + instructions: subagents/error-investigator.instructions.md + handoffDescription: Investigates application errors by correlating Dynatrace metrics, + LAW logs, and deployment history to identify root cause + handoffs: [] + allowedSkills: + - investigate-app-errors + tools: + - RunAzCliReadCommands + - QueryLogAnalyticsByWorkspaceId + - ExecutePythonCode + - PlotAreaChartWithCorrelation + - PlotBarChart + - FindConnectedGitHubRepo + - dynatrace_execute-dql + - dynatrace_create-dql + - dynatrace_query-problems + - dynatrace_get-entity-id + - dynatrace_get-entity-name + maxReflectionCount: 0 + customReflectionNote: '' + commonPrompts: [] + enableVanillaMode: false diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/connectors.json b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/connectors.json new file mode 100644 index 000000000..0818d8bed --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/connectors.json @@ -0,0 +1,30 @@ +{ + "toggles": { + "enableAppInsightsConnector": false, + "appInsightsResourceId": "", + "appInsightsAppId": "", + "enableLogAnalyticsConnector": "{{lawId:bool}}", + "lawResourceId": "{{lawId}}", + 
"enableAzureMonitorConnector": false, + "azureMonitorLookbackDays": 7, + "grafanaUrl": "", + "grafanaApiKey": "" + }, + "connectors": [ + { + "name": "dynatrace", + "properties": { + "dataConnectorType": "Mcp", + "dataSource": "placeholder", + "extendedProperties": { + "type": "http", + "endpoint": "https://{{dtTenant}}.apps.dynatrace.com/platform-reserved/mcp-gateway/v0.1/servers/dynatrace-mcp/mcp", + "authType": "BearerToken", + "bearerToken": "${DYNATRACE_BEARER_TOKEN}", + "partnerType": "DynatraceMcp" + }, + "identity": "system" + } + } + ] +} diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/expected-config.json b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/expected-config.json new file mode 100644 index 000000000..62c56fa86 --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/expected-config.json @@ -0,0 +1,46 @@ +{ + "_description": "Expected configuration for law-dynatrace-httptrigger recipe. 
Used by verify-agent.sh to validate deployments.", + "_scenario": "law-dynatrace-github-httptrigger-prvalidation", + "agent": { + "accessLevel": "High", + "actionMode": "Review", + "upgradeChannel": "Preview", + "defaultModelProvider": "Anthropic", + "incidentPlatform": "None" + }, + "connectors": [ + { + "name": "log-analytics", + "type": "LogAnalytics" + }, + { + "name": "dynatrace", + "type": "Mcp" + } + ], + "skills": [ + "deployment-guard-analysis", + "investigate-app-errors" + ], + "subagents": [ + "deployment-guard", + "error-investigator" + ], + "hooks": [ + "deny-prod-deletes", + "require-approval-for-restarts" + ], + "commonPrompts": [ + "investigation-guidelines", + "safety-rules" + ], + "scheduledTasks": [], + "responsePlans": [], + "httpTriggers": [ + { + "name": "pr-deployment-guard", + "handlingAgent": "deployment-guard" + } + ], + "repos": [] +} \ No newline at end of file diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/roles.yaml b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/roles.yaml new file mode 100644 index 000000000..9ec1aa266 --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/roles.yaml @@ -0,0 +1,19 @@ +# Required roles/credentials for the law-dynatrace-httptrigger recipe. +# deploy.sh processes this after the UAMI is created. + +roles: + # GitHub repos — prints OAuth URL or uses GITHUB_PAT env var + - name: GitHub OAuth + type: manual + instructions: | + To connect GitHub repos, either: + 1. Set GITHUB_PAT env var before deploy: export GITHUB_PAT=ghp_xxx + 2. 
Or after deploy, open the OAuth URL printed by apply-extras.sh + + # Dynatrace MCP — requires bearer token in connectors.secrets.env + - name: Dynatrace MCP + type: manual + instructions: | + Create a Dynatrace API token with scopes: entities.read, events.read, metrics.read, problems.read + Save it in connectors.secrets.env: + DYNATRACE_BEARER_TOKEN=dt0c01.xxx diff --git a/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/sample-github-workflow.yml b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/sample-github-workflow.yml new file mode 100644 index 000000000..82883626a --- /dev/null +++ b/sreagent-templates/recipes/law-dynatrace-github-httptrigger-prvalidation/sample-github-workflow.yml @@ -0,0 +1,40 @@ +# Sample GitHub Actions workflow for your application repo. +# This sends PR events to the SRE Agent via the Logic App webhook bridge, +# which triggers the deployment-guard-analysis skill. +# +# Setup: +# 1. Copy this file to your app repo: .github/workflows/sre-agent-pr-guard.yml +# 2. Add a repo secret SRE_AGENT_WEBHOOK_URL with the Logic App trigger URL +# (find it in the Azure portal under the Logic App's trigger settings, +# or run: az resource show ... 
to get the callback URL) + +name: SRE Agent — PR Deployment Guard + +on: + pull_request: + types: [opened, synchronize, reopened] + +jobs: + notify-sre-agent: + runs-on: ubuntu-latest + steps: + - name: Trigger SRE Agent via webhook bridge + env: + WEBHOOK_URL: ${{ secrets.SRE_AGENT_WEBHOOK_URL }} + run: | + # Build the payload with jq from the event file: PR-controlled text (title, branch names) must never be expanded into the script text by workflow expressions, or a quote in a PR title becomes shell injection. + jq -c '{ + event: "pull_request", + action: .action, + pr_number: .pull_request.number, + pr_title: .pull_request.title, + pr_url: .pull_request.html_url, + pr_diff_url: .pull_request.diff_url, + pr_author: .pull_request.user.login, + repo: .repository.full_name, + head_ref: .pull_request.head.ref, + base_ref: .pull_request.base.ref, + head_sha: .pull_request.head.sha + }' "$GITHUB_EVENT_PATH" | + curl -sS --fail -X POST "$WEBHOOK_URL" -H "Content-Type: application/json" --data-binary @- + echo "Webhook sent to SRE Agent" diff --git a/sreagent-templates/terraform/main.tf b/sreagent-templates/terraform/main.tf index 12617f929..6564df339 100644 --- a/sreagent-templates/terraform/main.tf +++ b/sreagent-templates/terraform/main.tf @@ -171,6 +171,21 @@ resource "azapi_resource" "sre_agent" { EnableHttpTriggers = true EnableV2AgentLoop = true } + vnetConfiguration = var.vnet_subnet_id != "" ? { + subnetResourceId = var.vnet_subnet_id + } : null + sandboxConfiguration = var.egress_mode != "Unrestricted" ? { + egress = { + mode = var.egress_mode + allowedHosts = var.allowed_hosts + allowedRegistries = var.allowed_registries + allowedCodeRepositories = var.allowed_code_repositories + allowHttpMcpServerNetworkAccess = var.allow_http_mcp_server_network_access + vnetConfiguration = var.egress_mode == "AzureVNet" ?
{ + usePrivateDnsResolution = var.use_private_dns_resolution + } : null + } + } : null } } @@ -217,6 +232,7 @@ resource "azapi_resource" "connector" { # ── Monitoring Reader on agent RG ── resource "azurerm_role_assignment" "monitoring_reader" { + count = var.skip_role_assignments || !local.create_identity ? 0 : 1 scope = azurerm_resource_group.agent.id role_definition_name = "Monitoring Reader" principal_id = local.effective_principal_id @@ -226,6 +242,7 @@ resource "azurerm_role_assignment" "monitoring_reader" { # ── SRE Agent Administrator — deployer on the agent ── resource "azurerm_role_assignment" "deployer_admin" { + count = var.skip_role_assignments ? 0 : 1 scope = azapi_resource.sre_agent.id role_definition_id = "/subscriptions/${data.azurerm_subscription.current.subscription_id}/providers/Microsoft.Authorization/roleDefinitions/${local.sre_agent_admin_role_id}" principal_id = data.azurerm_client_config.current.object_id @@ -235,6 +252,7 @@ resource "azurerm_role_assignment" "deployer_admin" { # ── SRE Agent Administrator — UAMI on the agent ── resource "azurerm_role_assignment" "uami_admin" { + count = var.skip_role_assignments ? 0 : 1 scope = azapi_resource.sre_agent.id role_definition_id = "/subscriptions/${data.azurerm_subscription.current.subscription_id}/providers/Microsoft.Authorization/roleDefinitions/${local.sre_agent_admin_role_id}" principal_id = local.effective_principal_id @@ -244,7 +262,7 @@ resource "azurerm_role_assignment" "uami_admin" { # ── Target RG: Reader ── resource "azurerm_role_assignment" "target_reader" { - for_each = toset(var.target_resource_groups) + for_each = var.skip_role_assignments || !local.create_identity ? 
toset([]) : toset(var.target_resource_groups) scope = "/subscriptions/${data.azurerm_subscription.current.subscription_id}/resourceGroups/${each.value}" role_definition_name = "Reader" principal_id = local.effective_principal_id @@ -254,7 +272,7 @@ resource "azurerm_role_assignment" "target_reader" { # ── Target RG: Log Analytics Reader ── resource "azurerm_role_assignment" "target_log_reader" { - for_each = toset(var.target_resource_groups) + for_each = var.skip_role_assignments || !local.create_identity ? toset([]) : toset(var.target_resource_groups) scope = "/subscriptions/${data.azurerm_subscription.current.subscription_id}/resourceGroups/${each.value}" role_definition_name = "Log Analytics Reader" principal_id = local.effective_principal_id @@ -264,7 +282,7 @@ resource "azurerm_role_assignment" "target_log_reader" { # ── Target RG: Contributor (High access only) ── resource "azurerm_role_assignment" "target_contributor" { - for_each = var.access_level == "High" ? toset(var.target_resource_groups) : toset([]) + for_each = !var.skip_role_assignments && local.create_identity && var.access_level == "High" ? toset(var.target_resource_groups) : toset([]) scope = "/subscriptions/${data.azurerm_subscription.current.subscription_id}/resourceGroups/${each.value}" role_definition_name = "Contributor" principal_id = local.effective_principal_id @@ -276,7 +294,7 @@ resource "azurerm_role_assignment" "target_contributor" { # Same roles as UAMI: Reader + Log Analytics Reader + Contributor (if High). resource "azurerm_role_assignment" "smi_target_reader" { - for_each = toset(var.target_resource_groups) + for_each = var.skip_role_assignments ? 
toset([]) : toset(var.target_resource_groups) scope = "/subscriptions/${data.azurerm_subscription.current.subscription_id}/resourceGroups/${each.value}" role_definition_name = "Reader" principal_id = azapi_resource.sre_agent.identity[0].principal_id @@ -284,7 +302,7 @@ resource "azurerm_role_assignment" "smi_target_reader" { } resource "azurerm_role_assignment" "smi_target_log_reader" { - for_each = toset(var.target_resource_groups) + for_each = var.skip_role_assignments ? toset([]) : toset(var.target_resource_groups) scope = "/subscriptions/${data.azurerm_subscription.current.subscription_id}/resourceGroups/${each.value}" role_definition_name = "Log Analytics Reader" principal_id = azapi_resource.sre_agent.identity[0].principal_id @@ -292,7 +310,7 @@ resource "azurerm_role_assignment" "smi_target_log_reader" { } resource "azurerm_role_assignment" "smi_target_contributor" { - for_each = var.access_level == "High" ? toset(var.target_resource_groups) : toset([]) + for_each = !var.skip_role_assignments && var.access_level == "High" ? toset(var.target_resource_groups) : toset([]) scope = "/subscriptions/${data.azurerm_subscription.current.subscription_id}/resourceGroups/${each.value}" role_definition_name = "Contributor" principal_id = azapi_resource.sre_agent.identity[0].principal_id diff --git a/sreagent-templates/terraform/variables.tf b/sreagent-templates/terraform/variables.tf index 665041990..ba0c78dac 100644 --- a/sreagent-templates/terraform/variables.tf +++ b/sreagent-templates/terraform/variables.tf @@ -109,6 +109,63 @@ variable "existing_agent_app_insights_id" { default = "" } +# ── RBAC ── + +variable "skip_role_assignments" { + description = "Skip all role assignments. Set to true when RBAC is pre-configured or on redeploy to avoid RoleAssignmentExists errors." 
+ type = bool + default = false +} + +# ── Network / VNet ── + +variable "vnet_subnet_id" { + description = "Full ARM resource ID of a delegated subnet (Microsoft.App/environments) for VNet integration. Leave empty for no VNet." + type = string + default = "" +} + +variable "egress_mode" { + description = "Sandbox egress mode: Unrestricted (default), Limited, or AzureVNet." + type = string + default = "Unrestricted" + + validation { + condition = contains(["Unrestricted", "Limited", "AzureVNet"], var.egress_mode) + error_message = "egress_mode must be Unrestricted, Limited, or AzureVNet." + } +} + +variable "allowed_hosts" { + description = "Additional hosts the sandbox may reach (e.g. *.contoso.com). Only used in Limited/AzureVNet modes." + type = list(string) + default = [] +} + +variable "allowed_registries" { + description = "Registry catalog IDs (pypi, npmjs, nuget-org) whose hosts are allowed. Only used in Limited/AzureVNet modes." + type = list(string) + default = [] +} + +variable "allowed_code_repositories" { + description = "Code-repo providers (Github, AzureDevOps) whose hosts are allowed. Only used in Limited/AzureVNet modes." + type = list(string) + default = [] +} + +variable "allow_http_mcp_server_network_access" { + description = "Allow remote HTTP MCP server endpoints in sandbox egress." + type = bool + default = true +} + +variable "use_private_dns_resolution" { + description = "Use VNet private DNS resolver instead of platform default. Only for AzureVNet mode." 
+ type = bool + default = false +} + # ── Connector toggles ── variable "enable_app_insights_connector" { diff --git a/sreagent-templates/tests/e2e/E2E-RESULTS-law-dt-httptrigger.md b/sreagent-templates/tests/e2e/E2E-RESULTS-law-dt-httptrigger.md new file mode 100644 index 000000000..2f605d37e --- /dev/null +++ b/sreagent-templates/tests/e2e/E2E-RESULTS-law-dt-httptrigger.md @@ -0,0 +1,81 @@ +# E2E Test Results — law-dynatrace-httptrigger Recipe + +**Date:** 2026-05-26 +**Subscription:** cbf44432-7f45-4906-a85d-d2b14a1e8328 +**Region:** swedencentral + +## Dry-Run Test (all backends) + +| Check | Result | +|---|---| +| new-agent.sh | ✅ | +| skills count (2) | ✅ | +| skill tools (8 each) | ✅ | +| skill .md content | ✅ | +| subagents count (2) | ✅ | +| hooks count (2) | ✅ | +| prompts count (2) | ✅ | +| http-triggers count (1) | ✅ | +| connectors.json | ✅ | +| deploy.sh --dry-run (Bicep) | ✅ | +| az bicep build | ✅ | +| deploy-tf.sh --dry-run (Terraform) | ✅ | +| terraform validate | ✅ | +| PS New-Agent.ps1 | ✅ | +| azd assemble | ✅ | +| **no {{placeholders}}** | ❌ (false positive: GitHub Actions `${{ }}` in sample workflow) | +| **tfvars skills/subagents/prompts** | ❌ (pre-existing terraform dry-run gap, same as dynatrace-mcp) | + +**Result: 27/31 passed** (4 failures are pre-existing/false-positive, not recipe-specific) + +## E2E Full Matrix (5 backends × 7 steps) + +| Backend | new-agent | deploy | verify | re-deploy | verify-update | clone | verify-clone | +|---|---|---|---|---|---|---|---| +| **bicep-bash** | ✅ | ✅* | ✅ (20/20) | ✅* | ✅ | ✅ | ✅ (16/16) | +| **bicep-ps** | ✅** | ✅ | ✅ (20/20) | ✅ | ✅ | ✅ | ✅ (16/16) | +| **tf-bash** | ✅ | ✅ | ❌→✅ | ✅* | ✅ | ✅ | ✅ (16/16) | +| **tf-ps** | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **azd-bash** | ✅ | ✅*** | ❌→✅ | ✅*** | ✅ | ✅*** | ✅ (16/16) | + +**Legend:** +- `*` = rc=5 from jq parse error on GitHub OAuth URL print (pre-existing deploy.sh issue, agent deploys correctly) +- `**` = PS New-Agent.ps1 Count property warning 
(cosmetic, config generated correctly) +- `***` = azd exit codes unreliable, but agent created and verified +- `❌→✅` = verify fails on first deploy (timing), passes after re-deploy +- tf-ps = Known P0 bug: Deploy-Tf.ps1 broken for all recipes (not recipe-specific) + +## Agents Created (10 total) + +| Agent | RG | Backend | Type | +|---|---|---|---| +| dg-bicep-bash | rg-dg-bicep-bash | bicep-bash | original | +| dg-bicep-bash-cl | rg-dg-bicep-bash-cl | bicep-bash | clone | +| dg-bicep-ps | rg-dg-bicep-ps | bicep-ps | original | +| dg-bicep-ps-cl | rg-dg-bicep-ps-cl | bicep-ps | clone | +| dg-tf-bash | rg-dg-tf-bash | tf-bash | original | +| dg-tf-bash-cl | rg-dg-tf-bash-cl | tf-bash | clone | +| dg-azd-bash | rg-dg-azd-bash | azd-bash | original | +| dg-azd-bash-cl | rg-dg-azd-bash-cl | azd-bash | clone | +| deployment-guard-lab | rg-deployment-guard-lab | bicep-bash | manual test | +| deployment-guard-lab (clone export) | — | bicep-bash | export only | + +## Verify Output (representative — bicep-ps) + +``` +Skills: 2 (deployment-guard-analysis, investigate-app-errors) +Subagents: 2 (deployment-guard, error-investigator) +Hooks: 2 (deny-prod-deletes, require-approval-for-restarts) +Common Prompts: 2 (investigation-guidelines, safety-rules) +Connectors: 2 (log-analytics: LogAnalytics, dynatrace: Mcp) +HTTP Triggers: 1 (pr-deployment-guard) +Results: 20 passed, 0 failed +``` + +## Known Issues (pre-existing, not recipe-specific) + +1. **deploy.sh exit code 5**: jq parse error when printing GitHub OAuth URL. Deploy succeeds — verify passes. +2. **tf-ps Deploy-Tf.ps1**: P0 bug — fails for all recipes, not just this one. +3. **tfvars dry-run**: terraform `deploy-tf.sh --dry-run` doesn't populate skills/subagents/prompts in tfvars. +4. **`${{ }}` false positive**: sample GitHub workflow uses `${{ github.event.* }}` which grep matches as `{{`. +5. **azd exit codes**: `azd up` returns non-zero even when deploy succeeds. Verify confirms agents work. 
diff --git a/sreagent-templates/tests/e2e/test-dg-azd-bash.sh b/sreagent-templates/tests/e2e/test-dg-azd-bash.sh new file mode 100755 index 000000000..d883e7d53 --- /dev/null +++ b/sreagent-templates/tests/e2e/test-dg-azd-bash.sh @@ -0,0 +1,106 @@ +#!/usr/bin/env bash +set -o pipefail + +# ─── E2E Test: law-dynatrace-httptrigger × azd-bash ─── + +SUB="cbf44432-7f45-4906-a85d-d2b14a1e8328" +LAW_CONTOSO="/subscriptions/$SUB/resourceGroups/rg-contoso-swe/providers/Microsoft.OperationalInsights/workspaces/law-7defkiyvn3r44" +DT_TENANT="dhu66396" +DT_TOKEN="${DT_TOKEN:?Set DT_TOKEN}" +GITHUB_REPO="dm-chelupati/contoso-trading" +REGION="swedencentral" + +AGENT="dg-azd-bash" +RG="rg-dg-azd-bash" +DIR="/tmp/e2e-dg-azd-bash" +CLONE_AGENT="dg-azd-bash-cl" +CLONE_RG="rg-dg-azd-bash-cl" +LOG="/tmp/e2e-dg-azd-bash.log" + +PASS=0; FAIL=0; RESULTS=() +record() { + local name="$1" rc="$2" + if [[ $rc -eq 0 ]]; then RESULTS+=("PASS: $name"); ((PASS++)) + else RESULTS+=("FAIL: $name (rc=$rc)"); ((FAIL++)); fi +} + +exec > >(tee "$LOG") 2>&1 +echo "Starting E2E: law-dynatrace-httptrigger × azd-bash at $(date)" +cd "$(dirname "$0")/../.." || exit 1 + +echo "" +echo "=== STEP 1: new-agent ===" +rm -rf "$DIR" +./bin/new-agent.sh \ + --recipe law-dynatrace-github-httptrigger-prvalidation \ + --non-interactive \ + --set agentName="$AGENT" \ + --set resourceGroup="$RG" \ + --set location="$REGION" \ + --set targetRGs=rg-contoso-prod,rg-contoso-staging \ + --set lawId="$LAW_CONTOSO" \ + --set dtTenant="$DT_TENANT" \ + --set dtToken="$DT_TOKEN" \ + --set githubRepo="$GITHUB_REPO" \ + -o "$DIR/" +record "new-agent" $? 
+ +echo "" +echo "=== STEP 2: deploy (azd) ===" +mkdir -p "./agents/$AGENT" +cp -r "$DIR/"* "./agents/$AGENT/" 2>/dev/null || true +azd env select "$AGENT" --no-prompt 2>/dev/null || azd env new "$AGENT" --no-prompt +azd env set AZURE_AGENT_NAME "$AGENT" --no-prompt +azd env set AZURE_RESOURCE_GROUP "$RG" --no-prompt +azd env set AZURE_LOCATION "$REGION" --no-prompt +azd env set AZURE_SUBSCRIPTION_ID "$SUB" --no-prompt +azd env set AZURE_LAW_ID "$LAW_CONTOSO" --no-prompt +azd env set DT_TENANT "$DT_TENANT" --no-prompt +azd env set DT_TOKEN "$DT_TOKEN" --no-prompt +azd up --no-prompt +record "deploy-azd" $? + +echo "" +echo "=== STEP 3: verify (waiting 15s for data-plane) ===" +sleep 15 +./bin/verify-agent.sh "$SUB" "$RG" "$AGENT" --expected "$DIR/" +record "verify" $? + +echo "" +echo "=== STEP 4: re-deploy / update (azd) ===" +echo -e "\n# Updated by e2e test" >> "$DIR/config/skills/deployment-guard-analysis.md" +cp -r "$DIR/"* "./agents/$AGENT/" 2>/dev/null || true +azd env select "$AGENT" +azd up --no-prompt +record "re-deploy-azd" $? + +echo "" +echo "=== STEP 5: verify after update ===" +./bin/verify-agent.sh "$SUB" "$RG" "$AGENT" --expected "$DIR/" +record "verify-update" $? + +echo "" +echo "=== STEP 6: clone ===" +echo y | ./bin/clone-agent.sh \ + --from-agent "$AGENT" \ + --from-rg "$RG" \ + --from-sub "$SUB" \ + --agent-name "$CLONE_AGENT" \ + --resource-group "$CLONE_RG" \ + --location "$REGION" +record "clone" $? + +echo "" +echo "=== STEP 7: verify clone ===" +./bin/verify-agent.sh "$SUB" "$CLONE_RG" "$CLONE_AGENT" +record "verify-clone" $? 
+ +echo "" +echo "════════════════════════════════════════" +echo " E2E RESULTS: law-dynatrace-httptrigger × azd-bash" +echo "════════════════════════════════════════" +for r in "${RESULTS[@]}"; do echo " $r"; done +echo "────────────────────────────────────────" +echo " TOTAL: $PASS passed, $FAIL failed" +echo "════════════════════════════════════════" +[[ $FAIL -eq 0 ]] && exit 0 || exit 1 diff --git a/sreagent-templates/tests/e2e/test-dg-bicep-bash.sh b/sreagent-templates/tests/e2e/test-dg-bicep-bash.sh new file mode 100755 index 000000000..f6df745f8 --- /dev/null +++ b/sreagent-templates/tests/e2e/test-dg-bicep-bash.sh @@ -0,0 +1,128 @@ +#!/usr/bin/env bash +set -o pipefail + +# ─── E2E Test: law-dynatrace-httptrigger × bicep-bash ─── + +SUB="cbf44432-7f45-4906-a85d-d2b14a1e8328" +LAW_CONTOSO="/subscriptions/$SUB/resourceGroups/rg-contoso-swe/providers/Microsoft.OperationalInsights/workspaces/law-7defkiyvn3r44" +DT_TENANT="dhu66396" +DT_TOKEN="${DT_TOKEN:?Set DT_TOKEN}" +GITHUB_REPO="dm-chelupati/contoso-trading" +REGION="swedencentral" + +AGENT="dg-bicep-bash" +RG="rg-dg-bicep-bash" +DIR="/tmp/e2e-dg-bicep-bash" +CLONE_AGENT="dg-bicep-bash-cl" +CLONE_RG="rg-dg-bicep-bash-cl" +LOG="/tmp/e2e-dg-bicep-bash.log" + +PASS=0; FAIL=0; RESULTS=() +record() { + local name="$1" rc="$2" + if [[ $rc -eq 0 ]]; then RESULTS+=("PASS: $name"); ((PASS++)) + else RESULTS+=("FAIL: $name (rc=$rc)"); ((FAIL++)); fi +} + +exec > >(tee "$LOG") 2>&1 +echo "Starting E2E: law-dynatrace-httptrigger × bicep-bash at $(date)" +cd "$(dirname "$0")/../.." 
|| exit 1 + +echo "" +echo "=== STEP 1: new-agent ===" +rm -rf "$DIR" +./bin/new-agent.sh \ + --recipe law-dynatrace-github-httptrigger-prvalidation \ + --non-interactive \ + --set agentName="$AGENT" \ + --set resourceGroup="$RG" \ + --set location="$REGION" \ + --set targetRGs=rg-contoso-prod,rg-contoso-staging \ + --set lawId="$LAW_CONTOSO" \ + --set dtTenant="$DT_TENANT" \ + --set dtToken="$DT_TOKEN" \ + --set githubRepo="$GITHUB_REPO" \ + -o "$DIR/" +record "new-agent" $? + +echo "" +echo "=== STEP 2: deploy ===" +./bin/deploy.sh "$DIR/" --force +record "deploy" $? + +echo "" +echo "=== STEP 3: verify (waiting 15s for data-plane) ===" +sleep 15 +./bin/verify-agent.sh "$SUB" "$RG" "$AGENT" --expected "$DIR/" +record "verify" $? + +echo "" +echo "=== STEP 4: re-deploy (update — tweak skill) ===" +# Add a line to the deployment-guard skill to test update path +echo -e "\n# Updated by e2e test" >> "$DIR/config/skills/deployment-guard-analysis.md" +./bin/deploy.sh "$DIR/" --force +record "re-deploy" $? + +echo "" +echo "=== STEP 5: verify after update ===" +./bin/verify-agent.sh "$SUB" "$RG" "$AGENT" --expected "$DIR/" +record "verify-update" $? + +echo "" +echo "=== STEP 5b: create memory via chat ===" +AGENT_EP=$(az resource show --resource-group "$RG" --resource-type Microsoft.App/agents --name "$AGENT" --query "properties.agentEndpoint" -o tsv) +TOKEN=$(az account get-access-token --resource "https://azuresre.dev" --query accessToken -o tsv) +# Create a thread with a message that triggers memory save +HTTP_CODE=$(curl -s -o /tmp/e2e-thread.json -w "%{http_code}" -X POST "$AGENT_EP/api/v1/threads" \ + -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"StartMessage":"Save these as my user preferences: For contoso-trading, prod RG is rg-contoso-prod, staging is rg-contoso-staging. Always check DATABASE_URL env var consistency. 
Flag >2x latency as HIGH."}') +if [[ "$HTTP_CODE" == "200" || "$HTTP_CODE" == "201" ]]; then + echo "Memory thread created (HTTP $HTTP_CODE). Waiting 60s for agent to process..." + sleep 60 + # Check if memory was created + MEM_COUNT=$(curl -s "$AGENT_EP/api/v1/WorkspaceMemory/list" -H "Authorization: Bearer $TOKEN" | python3 -c "import sys,json; d=json.load(sys.stdin); print(len([f for f in d.get('files',[]) if f['size']>0]))" 2>/dev/null || echo "0") + echo "Synthesized knowledge files with content: $MEM_COUNT" + if [[ "$MEM_COUNT" -gt 0 ]]; then record "create-memory" 0; else record "create-memory" 1; fi # if/else, not '&& ||': record returns the status of ((PASS++)), which is 1 when PASS was 0, so the failure arm would run too +else + echo "Thread creation returned HTTP $HTTP_CODE" + record "create-memory" 1 +fi + +echo "" +echo "=== STEP 6: clone ===" +echo y | ./bin/clone-agent.sh \ + --from-agent "$AGENT" \ + --from-rg "$RG" \ + --from-sub "$SUB" \ + --agent-name "$CLONE_AGENT" \ + --resource-group "$CLONE_RG" \ + --location "$REGION" +record "clone" $? + +echo "" +echo "=== STEP 7: verify clone ===" +./bin/verify-agent.sh "$SUB" "$CLONE_RG" "$CLONE_AGENT" +record "verify-clone" $?
+ +echo "" +echo "=== STEP 7b: verify clone has memory ===" +CLONE_EP=$(az resource show --resource-group "$CLONE_RG" --resource-type Microsoft.App/agents --name "$CLONE_AGENT" --query "properties.agentEndpoint" -o tsv 2>/dev/null) +if [[ -n "$CLONE_EP" ]]; then + TOKEN=$(az account get-access-token --resource "https://azuresre.dev" --query accessToken -o tsv) + CLONE_MEM=$(curl -s "$CLONE_EP/api/v1/WorkspaceMemory/list" -H "Authorization: Bearer $TOKEN" | python3 -c "import sys,json; d=json.load(sys.stdin); print(len([f for f in d.get('files',[]) if f['size']>0]))" 2>/dev/null || echo "0") + echo "Clone synthesized knowledge files with content: $CLONE_MEM" + if [[ "$CLONE_MEM" -gt 0 ]]; then record "clone-has-memory" 0; else record "clone-has-memory" 1; fi # if/else, not '&& ||': record returns the status of ((PASS++)), which is 1 when PASS was 0, so the failure arm would run too +else + record "clone-has-memory" 1 +fi + +echo "" +echo "════════════════════════════════════════" +echo " E2E RESULTS: law-dynatrace-httptrigger × bicep-bash" +echo "════════════════════════════════════════" +for r in "${RESULTS[@]}"; do echo " $r"; done +echo "────────────────────────────────────────" +echo " TOTAL: $PASS passed, $FAIL failed" +echo "════════════════════════════════════════" +[[ $FAIL -eq 0 ]] && exit 0 || exit 1 diff --git a/sreagent-templates/tests/e2e/test-dg-bicep-ps.sh b/sreagent-templates/tests/e2e/test-dg-bicep-ps.sh new file mode 100755 index 000000000..703c88a77 --- /dev/null +++ b/sreagent-templates/tests/e2e/test-dg-bicep-ps.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash +set -o pipefail + +# ─── E2E Test: law-dynatrace-httptrigger × bicep-ps ─── + +SUB="cbf44432-7f45-4906-a85d-d2b14a1e8328" +LAW_CONTOSO="/subscriptions/$SUB/resourceGroups/rg-contoso-swe/providers/Microsoft.OperationalInsights/workspaces/law-7defkiyvn3r44" +DT_TENANT="dhu66396" +DT_TOKEN="${DT_TOKEN:?Set DT_TOKEN}" +GITHUB_REPO="dm-chelupati/contoso-trading" +REGION="swedencentral" + +AGENT="dg-bicep-ps" +RG="rg-dg-bicep-ps" +DIR="/tmp/e2e-dg-bicep-ps" +CLONE_AGENT="dg-bicep-ps-cl" +CLONE_RG="rg-dg-bicep-ps-cl"
+LOG="/tmp/e2e-dg-bicep-ps.log" + +PASS=0; FAIL=0; RESULTS=() +record() { + local name="$1" rc="$2" + if [[ $rc -eq 0 ]]; then RESULTS+=("PASS: $name"); ((PASS++)) + else RESULTS+=("FAIL: $name (rc=$rc)"); ((FAIL++)); fi +} + +exec > >(tee "$LOG") 2>&1 +echo "Starting E2E: law-dynatrace-httptrigger × bicep-ps at $(date)" +cd "$(dirname "$0")/../.." || exit 1 + +echo "" +echo "=== STEP 1: new-agent (PS) ===" +rm -rf "$DIR" +pwsh -NoProfile -Command "& './bin/ps/New-Agent.ps1' \ + -Recipe 'law-dynatrace-github-httptrigger-prvalidation' \ + -NonInteractive \ + -Set @{ \ + agentName='$AGENT'; \ + resourceGroup='$RG'; \ + location='$REGION'; \ + targetRGs='rg-contoso-prod,rg-contoso-staging'; \ + lawId='$LAW_CONTOSO'; \ + dtTenant='$DT_TENANT'; \ + dtToken='$DT_TOKEN'; \ + githubRepo='$GITHUB_REPO' \ + } \ + -Output '$DIR/'" +record "new-agent-ps" $? + +echo "" +echo "=== STEP 2: deploy (PS) ===" +pwsh -NoProfile -Command "& './bin/ps/Deploy-Agent.ps1' -InputPath '$DIR' -Force" +record "deploy-ps" $? + +echo "" +echo "=== STEP 3: verify (PS) ===" +pwsh -NoProfile -Command "& './bin/ps/Verify-Agent.ps1' \ + -Subscription '$SUB' \ + -ResourceGroup '$RG' \ + -AgentName '$AGENT' \ + -Expected '$DIR/'" +record "verify-ps" $? + +echo "" +echo "=== STEP 4: re-deploy / update (PS) ===" +echo -e "\n# Updated by e2e test" >> "$DIR/config/skills/deployment-guard-analysis.md" +pwsh -NoProfile -Command "& './bin/ps/Deploy-Agent.ps1' -InputPath '$DIR' -Force" +record "re-deploy-ps" $? + +echo "" +echo "=== STEP 5: verify after update (PS) ===" +pwsh -NoProfile -Command "& './bin/ps/Verify-Agent.ps1' \ + -Subscription '$SUB' \ + -ResourceGroup '$RG' \ + -AgentName '$AGENT' \ + -Expected '$DIR/'" +record "verify-update-ps" $? 
+
+echo ""
+echo "=== STEP 6: clone (PS) ==="
+pwsh -NoProfile -Command "& './bin/ps/Clone-Agent.ps1' \
+  -FromAgent '$AGENT' \
+  -FromResourceGroup '$RG' \
+  -FromSubscription '$SUB' \
+  -AgentName '$CLONE_AGENT' \
+  -ResourceGroup '$CLONE_RG' \
+  -Location '$REGION' \
+  -Force"
+record "clone-ps" "$?"
+
+echo ""
+echo "=== STEP 7: verify clone (PS) ==="
+pwsh -NoProfile -Command "& './bin/ps/Verify-Agent.ps1' \
+  -Subscription '$SUB' \
+  -ResourceGroup '$CLONE_RG' \
+  -AgentName '$CLONE_AGENT'"
+record "verify-clone-ps" "$?"
+
+echo ""
+echo "════════════════════════════════════════"
+echo " E2E RESULTS: law-dynatrace-httptrigger × bicep-ps"
+echo "════════════════════════════════════════"
+for r in "${RESULTS[@]}"; do echo " $r"; done
+echo "────────────────────────────────────────"
+echo " TOTAL: $PASS passed, $FAIL failed"
+echo "════════════════════════════════════════"
+exit "$(( FAIL > 0 ))"  # exit status 1 when any step failed, 0 otherwise
diff --git a/sreagent-templates/tests/e2e/test-dg-tf-bash.sh b/sreagent-templates/tests/e2e/test-dg-tf-bash.sh
new file mode 100755
index 000000000..c04e61f88
--- /dev/null
+++ b/sreagent-templates/tests/e2e/test-dg-tf-bash.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+set -o pipefail
+
+# ─── E2E Test: law-dynatrace-httptrigger × tf-bash ───
+# Requires DT_TOKEN in the environment. Steps run in order and never abort early; each result is tallied by record().
+SUB="cbf44432-7f45-4906-a85d-d2b14a1e8328"
+LAW_CONTOSO="/subscriptions/$SUB/resourceGroups/rg-contoso-swe/providers/Microsoft.OperationalInsights/workspaces/law-7defkiyvn3r44"
+DT_TENANT="dhu66396"
+DT_TOKEN="${DT_TOKEN:?Set DT_TOKEN}"
+GITHUB_REPO="dm-chelupati/contoso-trading"
+REGION="swedencentral"
+
+AGENT="dg-tf-bash"
+RG="rg-dg-tf-bash"
+DIR="/tmp/e2e-dg-tf-bash"
+CLONE_AGENT="dg-tf-bash-cl"
+CLONE_RG="rg-dg-tf-bash-cl"
+LOG="/tmp/e2e-dg-tf-bash.log"
+
+PASS=0; FAIL=0; RESULTS=()
+record() {  # record NAME RC: append a PASS/FAIL line to RESULTS and bump the matching counter
+  local name="$1" rc="$2"
+  if (( rc == 0 )); then RESULTS+=("PASS: $name"); PASS=$((PASS + 1))
+  else RESULTS+=("FAIL: $name (rc=$rc)"); FAIL=$((FAIL + 1)); fi
+}
+
+exec > >(tee "$LOG") 2>&1  # mirror all further stdout and stderr into $LOG
+echo "Starting E2E: law-dynatrace-httptrigger × tf-bash at $(date)"
+cd "$(dirname "$0")/../.." || exit 1
+
+echo ""
+echo "=== STEP 1: new-agent ==="
+rm -rf -- "${DIR:?}"  # :? refuses to run rm with an empty path
+./bin/new-agent.sh \
+  --recipe law-dynatrace-github-httptrigger-prvalidation \
+  --non-interactive \
+  --set agentName="$AGENT" \
+  --set resourceGroup="$RG" \
+  --set location="$REGION" \
+  --set targetRGs=rg-contoso-prod,rg-contoso-staging \
+  --set lawId="$LAW_CONTOSO" \
+  --set dtTenant="$DT_TENANT" \
+  --set dtToken="$DT_TOKEN" \
+  --set githubRepo="$GITHUB_REPO" \
+  -o "$DIR/"
+record "new-agent" "$?"
+
+echo ""
+echo "=== STEP 2: deploy (terraform) ==="
+./bin/deploy-tf.sh "$DIR/"
+record "deploy-tf" "$?"
+
+echo ""
+echo "=== STEP 3: verify (waiting 15s for data-plane) ==="
+sleep 15
+./bin/verify-agent.sh "$SUB" "$RG" "$AGENT" --expected "$DIR/"
+record "verify" "$?"
+
+echo ""
+echo "=== STEP 4: re-deploy (update) ==="
+printf '\n# Updated by e2e test\n' >> "$DIR/config/skills/deployment-guard-analysis.md"  # touch a skill file so the re-deploy has a change
+./bin/deploy-tf.sh "$DIR/"
+record "re-deploy-tf" "$?"
+
+echo ""
+echo "=== STEP 5: verify after update ==="
+./bin/verify-agent.sh "$SUB" "$RG" "$AGENT" --expected "$DIR/"
+record "verify-update" "$?"
+
+echo ""
+echo "=== STEP 6: clone ==="
+./bin/clone-agent.sh \
+  --from-agent "$AGENT" \
+  --from-rg "$RG" \
+  --from-sub "$SUB" \
+  --agent-name "$CLONE_AGENT" \
+  --resource-group "$CLONE_RG" \
+  --location "$REGION" \
+  --backend terraform <<< "y"  # feed "y" on stdin without a pipeline, so $? is clone-agent's own status
+record "clone-tf" "$?"
+
+echo ""
+echo "=== STEP 7: verify clone ==="
+./bin/verify-agent.sh "$SUB" "$CLONE_RG" "$CLONE_AGENT"
+record "verify-clone" "$?"
+
+echo ""
+echo "════════════════════════════════════════"
+echo " E2E RESULTS: law-dynatrace-httptrigger × tf-bash"
+echo "════════════════════════════════════════"
+for r in "${RESULTS[@]}"; do echo " $r"; done
+echo "────────────────────────────────────────"
+echo " TOTAL: $PASS passed, $FAIL failed"
+echo "════════════════════════════════════════"
+exit "$(( FAIL > 0 ))"  # exit status 1 when any step failed, 0 otherwise
diff --git a/sreagent-templates/tests/e2e/test-dg-tf-ps.sh b/sreagent-templates/tests/e2e/test-dg-tf-ps.sh
new file mode 100755
index 000000000..5dc5c4358
--- /dev/null
+++ b/sreagent-templates/tests/e2e/test-dg-tf-ps.sh
@@ -0,0 +1,96 @@
+#!/usr/bin/env bash
+set -o pipefail
+
+# ─── E2E Test: law-dynatrace-httptrigger × tf-ps ───
+# Requires DT_TOKEN in the environment. Steps run in order and never abort early; each result is tallied by record().
+SUB="cbf44432-7f45-4906-a85d-d2b14a1e8328"
+LAW_CONTOSO="/subscriptions/$SUB/resourceGroups/rg-contoso-swe/providers/Microsoft.OperationalInsights/workspaces/law-7defkiyvn3r44"
+DT_TENANT="dhu66396"
+export DT_TOKEN="${DT_TOKEN:?Set DT_TOKEN}"  # exported so pwsh reads it as $env:DT_TOKEN rather than from the command text
+GITHUB_REPO="dm-chelupati/contoso-trading"
+REGION="swedencentral"
+
+AGENT="dg-tf-ps"
+RG="rg-dg-tf-ps"
+DIR="/tmp/e2e-dg-tf-ps"
+CLONE_AGENT="dg-tf-ps-cl"
+CLONE_RG="rg-dg-tf-ps-cl"
+LOG="/tmp/e2e-dg-tf-ps.log"
+
+PASS=0; FAIL=0; RESULTS=()
+record() {  # record NAME RC: append a PASS/FAIL line to RESULTS and bump the matching counter
+  local name="$1" rc="$2"
+  if (( rc == 0 )); then RESULTS+=("PASS: $name"); PASS=$((PASS + 1))
+  else RESULTS+=("FAIL: $name (rc=$rc)"); FAIL=$((FAIL + 1)); fi
+}
+
+exec > >(tee "$LOG") 2>&1  # mirror all further stdout and stderr into $LOG
+echo "Starting E2E: law-dynatrace-httptrigger × tf-ps at $(date)"
+cd "$(dirname "$0")/../.." || exit 1
+
+echo ""
+echo "=== STEP 1: new-agent (PS) ==="
+rm -rf -- "${DIR:?}"  # :? refuses to run rm with an empty path
+pwsh -NoProfile -Command "& './bin/ps/New-Agent.ps1' \
+  -Recipe 'law-dynatrace-github-httptrigger-prvalidation' \
+  -NonInteractive \
+  -Set @{ \
+    agentName='$AGENT'; \
+    resourceGroup='$RG'; \
+    location='$REGION'; \
+    targetRGs='rg-contoso-prod,rg-contoso-staging'; \
+    lawId='$LAW_CONTOSO'; \
+    dtTenant='$DT_TENANT'; \
+    dtToken=\$env:DT_TOKEN; \
+    githubRepo='$GITHUB_REPO' \
+  } \
+  -Output '$DIR/'"
+record "new-agent-ps" "$?"
+
+echo ""
+echo "=== STEP 2: deploy (terraform via PS) ==="
+pwsh -NoProfile -Command "& './bin/ps/Deploy-Tf.ps1' -InputPath '$DIR'"
+record "deploy-tf-ps" "$?"
+
+echo ""
+echo "=== STEP 3: verify ==="
+./bin/verify-agent.sh "$SUB" "$RG" "$AGENT" --expected "$DIR/"
+record "verify" "$?"
+
+echo ""
+echo "=== STEP 4: re-deploy / update ==="
+printf '\n# Updated by e2e test\n' >> "$DIR/config/skills/deployment-guard-analysis.md"  # touch a skill file so the re-deploy has a change
+pwsh -NoProfile -Command "& './bin/ps/Deploy-Tf.ps1' -InputPath '$DIR'"
+record "re-deploy-tf-ps" "$?"
+
+echo ""
+echo "=== STEP 5: verify after update ==="
+./bin/verify-agent.sh "$SUB" "$RG" "$AGENT" --expected "$DIR/"
+record "verify-update" "$?"
+
+echo ""
+echo "=== STEP 6: clone ==="
+./bin/clone-agent.sh \
+  --from-agent "$AGENT" \
+  --from-rg "$RG" \
+  --from-sub "$SUB" \
+  --agent-name "$CLONE_AGENT" \
+  --resource-group "$CLONE_RG" \
+  --location "$REGION" \
+  --backend terraform <<< "y"  # feed "y" on stdin without a pipeline, so $? is clone-agent's own status
+record "clone-tf" "$?"
+
+echo ""
+echo "=== STEP 7: verify clone ==="
+./bin/verify-agent.sh "$SUB" "$CLONE_RG" "$CLONE_AGENT"
+record "verify-clone" "$?"
+
+echo ""
+echo "════════════════════════════════════════"
+echo " E2E RESULTS: law-dynatrace-httptrigger × tf-ps"
+echo "════════════════════════════════════════"
+for r in "${RESULTS[@]}"; do echo " $r"; done
+echo "────────────────────────────────────────"
+echo " TOTAL: $PASS passed, $FAIL failed"
+echo "════════════════════════════════════════"
+exit "$(( FAIL > 0 ))"  # exit status 1 when any step failed, 0 otherwise
diff --git a/sreagent-templates/tests/lib/test-helpers.sh b/sreagent-templates/tests/lib/test-helpers.sh
index 5922c6d90..fe3940675 100755
--- a/sreagent-templates/tests/lib/test-helpers.sh
+++ b/sreagent-templates/tests/lib/test-helpers.sh
@@ -59,8 +59,8 @@ validate_config_dir() {
   assert_eq "incident-platforms" "$(count_yaml "$OUT/automations/incident-platforms")" "$exp_platforms"
   assert_eq "http-triggers" "$(count_yaml "$OUT/automations/http-triggers")" "$exp_httptrig"
 
-  # No unreplaced placeholders
-  local leftover=$(grep -rc '{{' "$OUT/" 2>/dev/null | awk -F: '{s+=$2}END{print s+0}')
+  # No unreplaced placeholders (exclude ${{ which is GitHub Actions syntax); grep -c prints 0 itself, so no "|| echo 0" fallback (it would yield "0\n0")
+  local leftover=$(grep -r '{{' "$OUT/" 2>/dev/null | grep -vc '\${{')
   assert_eq "no {{placeholders}}" "$leftover" "0"
 
   # connectors.json exists
diff --git a/sreagent-templates/tests/test-dry-run-law-dt-httptrigger.sh b/sreagent-templates/tests/test-dry-run-law-dt-httptrigger.sh
new file mode 100755
index 000000000..4c9748efd
--- /dev/null
+++ b/sreagent-templates/tests/test-dry-run-law-dt-httptrigger.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# tests/test-dry-run-law-dt-httptrigger.sh — law-dynatrace-httptrigger: 4 backends × dry-run
+set -uo pipefail
+cd "$(dirname "$0")/.." || exit 1  # the relative paths used afterwards depend on this directory
+REPORT="/tmp/test-dry-run-law-dt-httptrigger.txt"; : > "$REPORT"  # truncate the report file at the start of each run
+source tests/lib/test-helpers.sh
+
+RECIPE="law-dynatrace-github-httptrigger-prvalidation"
+EXTRA_SETS="lawId=/sub/fake;dtTenant=fake;dtToken=fake;githubRepo=https://github.com/fake/repo"  # ';'-separated key=value pairs
+EXP_SKILLS=2 EXP_SA=2 EXP_HOOKS=2 EXP_PROMPTS=2 EXP_SCHED=0 EXP_FILTERS=0 EXP_PLAT=0 EXP_HT=1
+OUT="/tmp/dryrun-${RECIPE}"
+
+log "═══ $RECIPE ═══"
+log "── bash new-agent ──"
+rm -rf -- "${OUT:?}"  # :? refuses to run rm with an empty path
+SET_ARGS=(--set "agentName=dry-${RECIPE}" --set resourceGroup=rg-dry --set location=swedencentral --set targetRGs=rg-fake-prod,rg-fake-staging)
+IFS=';' read -r -a EXTRA_SET_LIST <<< "$EXTRA_SETS"; for s in "${EXTRA_SET_LIST[@]}"; do [[ -n "$s" ]] && SET_ARGS+=(--set "$s"); done
+./bin/new-agent.sh --recipe "$RECIPE" --non-interactive "${SET_ARGS[@]}" -o "$OUT" > /tmp/dryrun-new.log 2>&1  # argument array instead of eval: no re-parsing of values
+if [[ -f "$OUT/agent.json" ]]; then pass "new-agent"; else fail "new-agent"; print_summary "$RECIPE"; exit 1; fi
+
+validate_config_dir "$OUT" "$EXP_SKILLS" "$EXP_SA" "$EXP_HOOKS" "$EXP_PROMPTS" "$EXP_SCHED" "$EXP_FILTERS" "$EXP_PLAT" "$EXP_HT"
+validate_assembled_content "$OUT"
+validate_bicep_dryrun "$OUT"
+validate_tf_dryrun "$OUT" "$EXP_SKILLS" "$EXP_SA" "$EXP_PROMPTS"
+validate_ps_newagent "$RECIPE" "$EXTRA_SETS"
+validate_azd_dryrun "$OUT"
+
+print_summary "$RECIPE"
+exit $?