From 45697b9ebdf0e67edf55ea603eb6cc9f0b2422b0 Mon Sep 17 00:00:00 2001 From: stxkxs Date: Sun, 7 Jun 2026 17:46:25 -0700 Subject: [PATCH 1/2] feat(operator): enable eval-runtime + SLO per-env and inject the eval-runner IRSA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Turns on the operator's folded eval-runtime + SLO (charts/operator 0.2.0) at the gitops layer and wires the per-cluster eval IRSA, replacing what the retiring eks-agent-platform/gitops overlay's ApplicationSet used to patch. - addons-agent-operator.yaml: the valuesObject gains evalRuntime.serviceAccount.roleArn + evalReportsBucket, read from the eks-agent-platform/eval-runner-role-arn and eval-reports-bucket cluster-Secret annotations (published by cluster-bootstrap). The role ARN embeds the account id, so it stays out of the public values file — same pattern as the operator role. Both render empty until cluster-bootstrap publishes the annotations, and the chart skips the SA annotation / leaves the bucket blank when empty, so this is forward-compatible. - addons/ai-platform/operator/values.yaml: evalRuntime.enabled + slo.enabled set explicitly (they match the chart defaults; stated so the deployment intent is visible at the gitops layer). rollouts (needs the Argo Rollouts CRD) and slo.alerting (needs the pager/slack Secrets) stay off. Pairs with the landing-zone change that publishes the two eval annotations from the eval-runtime terraform outputs. Refs nanohype/eks-agent-platform#33 --- addons/ai-platform/operator/values.yaml | 14 ++++++++++++++ applicationsets/addons-agent-operator.yaml | 9 +++++++++ 2 files changed, 23 insertions(+) diff --git a/addons/ai-platform/operator/values.yaml b/addons/ai-platform/operator/values.yaml index a091539..b633dfd 100644 --- a/addons/ai-platform/operator/values.yaml +++ b/addons/ai-platform/operator/values.yaml @@ -17,3 +17,17 @@ webhooks: # networkPolicy.engine defaults to cilium (the cluster CNI). 
The operator's # egress to the kube-apiserver uses a Cilium reserved identity that a vanilla # Kubernetes NetworkPolicy cannot match, so leave it on cilium. + +# The operator's own runtime, folded into the chart (0.2.0+). Enabled on every +# agent-platform cluster (these match the chart defaults; set explicitly so the +# deployment intent is visible at the gitops layer). +# - eval-runtime: the Argo WorkflowTemplate + SA/RBAC. The eval-runner role ARN +# and report bucket are injected per-cluster by the ApplicationSet above. +# - slo: the operator's PrometheusRule + CR-state metrics ConfigMap. +# rollouts.enabled (needs the Argo Rollouts CRD) and slo.alerting.enabled (needs +# the pagerduty/slack Secrets) stay off here; production flips alerting on in +# values-production.yaml once those Secrets are provisioned. +evalRuntime: + enabled: true +slo: + enabled: true diff --git a/applicationsets/addons-agent-operator.yaml b/applicationsets/addons-agent-operator.yaml index 6dae9b1..e2cd416 100644 --- a/applicationsets/addons-agent-operator.yaml +++ b/applicationsets/addons-agent-operator.yaml @@ -62,6 +62,15 @@ spec: serviceAccount: annotations: eks.amazonaws.com/role-arn: '{{ index .metadata.annotations "eks-agent-platform/operator-role-arn" }}' + # eval-runtime IRSA + bucket. The role ARN embeds the account id, so + # like the operator role it is read from a cluster-Secret annotation + # published by cluster-bootstrap, never committed here. Both render + # empty until cluster-bootstrap publishes the annotations (the chart + # skips the SA annotation / leaves the bucket empty when blank). 
+ evalRuntime: + serviceAccount: + roleArn: '{{ index .metadata.annotations "eks-agent-platform/eval-runner-role-arn" }}' + evalReportsBucket: '{{ index .metadata.annotations "eks-agent-platform/eval-reports-bucket" }}' valueFiles: - $values/addons/ai-platform/operator/values.yaml - $values/addons/ai-platform/operator/values-{{ index .metadata.labels "environment" }}.yaml From ba905678bf2b8aac748d44489597e7fbb5464176 Mon Sep 17 00:00:00 2001 From: stxkxs Date: Sun, 7 Jun 2026 17:50:40 -0700 Subject: [PATCH 2/2] feat(operator): port the per-env operator sizing into eks-gitops The retiring eks-agent-platform/gitops overlay carried deliberate per-env operator tuning that the live eks-gitops values lacked (they were bare, so every env ran chart defaults). Carry it across so it isn't lost when gitops/ is deleted, and so production is sized as intended rather than as dev: - dev: replicaCount 1, leader election off, budget reconcile 5m, lower requests. - staging: replicaCount 2, budget reconcile 30m, modest requests/limits. - production: replicaCount 3, higher per-reconciler concurrency (platform/gateway/runtime/budget/eval = 5/5/10/2/4), budget reconcile 1h, larger requests/limits, PDB minAvailable 2. config.environment/region/oidc + the SA role-arn stay ApplicationSet-injected (account-specific), so they remain absent from these files. Values that already match the chart defaults (priorityClassName) are left to the chart. Refs nanohype/eks-agent-platform#33 --- addons/ai-platform/operator/values-dev.yaml | 14 +++++++++ .../operator/values-production.yaml | 29 +++++++++++++++++++ .../ai-platform/operator/values-staging.yaml | 14 +++++++++ 3 files changed, 57 insertions(+) diff --git a/addons/ai-platform/operator/values-dev.yaml b/addons/ai-platform/operator/values-dev.yaml index b3df1e7..b2ff1c8 100644 --- a/addons/ai-platform/operator/values-dev.yaml +++ b/addons/ai-platform/operator/values-dev.yaml @@ -1,3 +1,17 @@ # eks-agent-platform operator — dev deltas only (base is values.yaml). 
# config.environment is injected from the cluster Secret label by the # ApplicationSet, so it is intentionally not set here. + +# Single replica + no leader election — dev needs no HA, and skipping the lease +# removes a startup dependency. +replicaCount: 1 +leaderElection: + enabled: false +reconcilers: + budget: + # Faster budget reconcile in dev for a tighter feedback loop. + requeueInterval: 5m +resources: + requests: + cpu: 100m + memory: 128Mi diff --git a/addons/ai-platform/operator/values-production.yaml b/addons/ai-platform/operator/values-production.yaml index 07b9569..711cd3a 100644 --- a/addons/ai-platform/operator/values-production.yaml +++ b/addons/ai-platform/operator/values-production.yaml @@ -1,3 +1,32 @@ # eks-agent-platform operator — production deltas only (base is values.yaml). # config.environment is injected from the cluster Secret label by the # ApplicationSet, so it is intentionally not set here. + +# Three replicas with leader election for HA; one active reconciler, two warm. +replicaCount: 3 +leaderElection: + enabled: true +# Higher per-reconciler concurrency for production fleet sizes. +reconcilers: + platform: + concurrent: 5 + gateway: + concurrent: 5 + runtime: + concurrent: 10 + budget: + concurrent: 2 + requeueInterval: 1h + eval: + concurrent: 4 +resources: + requests: + cpu: 500m + memory: 512Mi + limits: + cpu: 2000m + memory: 2Gi +# Keep two replicas available through voluntary disruptions (node drains). +podDisruptionBudget: + enabled: true + minAvailable: 2 diff --git a/addons/ai-platform/operator/values-staging.yaml b/addons/ai-platform/operator/values-staging.yaml index e9f7372..016c0db 100644 --- a/addons/ai-platform/operator/values-staging.yaml +++ b/addons/ai-platform/operator/values-staging.yaml @@ -1,3 +1,17 @@ # eks-agent-platform operator — staging deltas only (base is values.yaml). # config.environment is injected from the cluster Secret label by the # ApplicationSet, so it is intentionally not set here. 
+ +replicaCount: 2 +leaderElection: + enabled: true +reconcilers: + budget: + requeueInterval: 30m +resources: + requests: + cpu: 200m + memory: 256Mi + limits: + cpu: 1000m + memory: 1Gi