From 45697b9ebdf0e67edf55ea603eb6cc9f0b2422b0 Mon Sep 17 00:00:00 2001
From: stxkxs <stxkxs@users.noreply.github.com>
Date: Sun, 7 Jun 2026 17:46:25 -0700
Subject: [PATCH 1/2] feat(operator): enable eval-runtime + SLO per-env and
 inject the eval-runner IRSA
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Turns on the operator's folded eval-runtime + SLO (charts/operator 0.2.0) at the gitops layer and
wires the per-cluster eval IRSA, replacing what the retiring eks-agent-platform/gitops overlay's
ApplicationSet used to patch.

- addons-agent-operator.yaml: the valuesObject gains evalRuntime.serviceAccount.roleArn +
  evalReportsBucket, read from the eks-agent-platform/eval-runner-role-arn and eval-reports-bucket
  cluster-Secret annotations (published by cluster-bootstrap). The role ARN embeds the account id,
  so it stays out of the public values file — same pattern as the operator role. Both render empty
  until cluster-bootstrap publishes the annotations, and the chart skips the SA annotation / leaves
  the bucket blank when empty, so this is forward-compatible.
- addons/ai-platform/operator/values.yaml: evalRuntime.enabled + slo.enabled set explicitly (they
  match the chart defaults; stated so the deployment intent is visible at the gitops layer).
  rollouts (needs the Argo Rollouts CRD) and slo.alerting (needs the pager/slack Secrets) stay off.

Pairs with the landing-zone change that publishes the two eval annotations from the eval-runtime
terraform outputs.

Refs nanohype/eks-agent-platform#33
---
 addons/ai-platform/operator/values.yaml    | 14 ++++++++++++++
 applicationsets/addons-agent-operator.yaml |  9 +++++++++
 2 files changed, 23 insertions(+)

diff --git a/addons/ai-platform/operator/values.yaml b/addons/ai-platform/operator/values.yaml
index a091539..b633dfd 100644
--- a/addons/ai-platform/operator/values.yaml
+++ b/addons/ai-platform/operator/values.yaml
@@ -17,3 +17,17 @@ webhooks:
 # networkPolicy.engine defaults to cilium (the cluster CNI). The operator's
 # egress to the kube-apiserver uses a Cilium reserved identity that a vanilla
 # Kubernetes NetworkPolicy cannot match, so leave it on cilium.
+
+# The operator's own runtime, folded into the chart (0.2.0+). Enabled on every
+# agent-platform cluster (these match the chart defaults; set explicitly so the
+# deployment intent is visible at the gitops layer).
+#   - eval-runtime: the Argo WorkflowTemplate + SA/RBAC. The eval-runner role ARN
+#     and report bucket are injected per-cluster by the ApplicationSet's valuesObject.
+#   - slo: the operator's PrometheusRule + CR-state metrics ConfigMap.
+# rollouts.enabled (needs the Argo Rollouts CRD) and slo.alerting.enabled (needs
+# the pagerduty/slack Secrets) stay off here; production flips alerting on in
+# values-production.yaml once those Secrets are provisioned.
+evalRuntime:
+  enabled: true
+slo:
+  enabled: true
diff --git a/applicationsets/addons-agent-operator.yaml b/applicationsets/addons-agent-operator.yaml
index 6dae9b1..e2cd416 100644
--- a/applicationsets/addons-agent-operator.yaml
+++ b/applicationsets/addons-agent-operator.yaml
@@ -62,6 +62,15 @@ spec:
               serviceAccount:
                 annotations:
                   eks.amazonaws.com/role-arn: '{{ index .metadata.annotations "eks-agent-platform/operator-role-arn" }}'
+              # eval-runtime IRSA + bucket. The role ARN embeds the account id, so
+              # like the operator role it is read from a cluster-Secret annotation
+              # published by cluster-bootstrap, never committed here. Both render
+              # empty until cluster-bootstrap publishes the annotations (the chart
+              # skips the SA annotation / leaves the bucket empty when blank).
+              evalRuntime:
+                serviceAccount:
+                  roleArn: '{{ index .metadata.annotations "eks-agent-platform/eval-runner-role-arn" }}'
+                evalReportsBucket: '{{ index .metadata.annotations "eks-agent-platform/eval-reports-bucket" }}'
             valueFiles:
               - $values/addons/ai-platform/operator/values.yaml
               - $values/addons/ai-platform/operator/values-{{ index .metadata.labels "environment" }}.yaml

From ba905678bf2b8aac748d44489597e7fbb5464176 Mon Sep 17 00:00:00 2001
From: stxkxs <stxkxs@users.noreply.github.com>
Date: Sun, 7 Jun 2026 17:50:40 -0700
Subject: [PATCH 2/2] feat(operator): port the per-env operator sizing into
 eks-gitops

The retiring eks-agent-platform/gitops overlay carried deliberate per-env operator tuning that the
live eks-gitops values lacked (they were bare, so every env ran chart defaults). Carry it across so
it isn't lost when gitops/ is deleted, and so production is sized as intended rather than as dev:

- dev: replicaCount 1, leader election off, budget reconcile 5m, lower requests.
- staging: replicaCount 2, budget reconcile 30m, modest requests/limits.
- production: replicaCount 3, higher per-reconciler concurrency (platform/gateway/runtime/budget/eval
  = 5/5/10/2/4), budget reconcile 1h, larger requests/limits, PDB minAvailable 2.

config.environment/region/oidc + the SA role-arn stay ApplicationSet-injected (account-specific),
so they remain absent from these files. Values that already match the chart defaults
(priorityClassName) are left to the chart.

Refs nanohype/eks-agent-platform#33
---
 addons/ai-platform/operator/values-dev.yaml   | 14 +++++++++
 .../operator/values-production.yaml           | 29 +++++++++++++++++++
 .../ai-platform/operator/values-staging.yaml  | 14 +++++++++
 3 files changed, 57 insertions(+)

diff --git a/addons/ai-platform/operator/values-dev.yaml b/addons/ai-platform/operator/values-dev.yaml
index b3df1e7..b2ff1c8 100644
--- a/addons/ai-platform/operator/values-dev.yaml
+++ b/addons/ai-platform/operator/values-dev.yaml
@@ -1,3 +1,17 @@
 # eks-agent-platform operator — dev deltas only (base is values.yaml).
 # config.environment is injected from the cluster Secret label by the
 # ApplicationSet, so it is intentionally not set here.
+
+# Single replica + no leader election — dev needs no HA, and skipping the lease
+# removes a startup dependency.
+replicaCount: 1
+leaderElection:
+  enabled: false
+reconcilers:
+  budget:
+    # Faster budget reconcile in dev for a tighter feedback loop.
+    requeueInterval: 5m
+resources:
+  requests:
+    cpu: 100m
+    memory: 128Mi
diff --git a/addons/ai-platform/operator/values-production.yaml b/addons/ai-platform/operator/values-production.yaml
index 07b9569..711cd3a 100644
--- a/addons/ai-platform/operator/values-production.yaml
+++ b/addons/ai-platform/operator/values-production.yaml
@@ -1,3 +1,32 @@
 # eks-agent-platform operator — production deltas only (base is values.yaml).
 # config.environment is injected from the cluster Secret label by the
 # ApplicationSet, so it is intentionally not set here.
+
+# Three replicas with leader election for HA: one active leader, two warm standbys.
+replicaCount: 3
+leaderElection:
+  enabled: true
+# Higher per-reconciler concurrency for production fleet sizes.
+reconcilers:
+  platform:
+    concurrent: 5
+  gateway:
+    concurrent: 5
+  runtime:
+    concurrent: 10
+  budget:
+    concurrent: 2
+    requeueInterval: 1h
+  eval:
+    concurrent: 4
+resources:
+  requests:
+    cpu: 500m
+    memory: 512Mi
+  limits:
+    cpu: 2000m
+    memory: 2Gi
+# Keep two replicas available through voluntary disruptions (node drains).
+podDisruptionBudget:
+  enabled: true
+  minAvailable: 2
diff --git a/addons/ai-platform/operator/values-staging.yaml b/addons/ai-platform/operator/values-staging.yaml
index e9f7372..016c0db 100644
--- a/addons/ai-platform/operator/values-staging.yaml
+++ b/addons/ai-platform/operator/values-staging.yaml
@@ -1,3 +1,17 @@
 # eks-agent-platform operator — staging deltas only (base is values.yaml).
 # config.environment is injected from the cluster Secret label by the
 # ApplicationSet, so it is intentionally not set here.
+
+replicaCount: 2
+leaderElection:
+  enabled: true
+reconcilers:
+  budget:
+    requeueInterval: 30m
+resources:
+  requests:
+    cpu: 200m
+    memory: 256Mi
+  limits:
+    cpu: 1000m
+    memory: 1Gi