diff --git a/addons/ai-platform/operator/values-dev.yaml b/addons/ai-platform/operator/values-dev.yaml index b3df1e7..b2ff1c8 100644 --- a/addons/ai-platform/operator/values-dev.yaml +++ b/addons/ai-platform/operator/values-dev.yaml @@ -1,3 +1,17 @@ # eks-agent-platform operator — dev deltas only (base is values.yaml). # config.environment is injected from the cluster Secret label by the # ApplicationSet, so it is intentionally not set here. + +# Single replica + no leader election — dev needs no HA, and skipping the lease +# removes a startup dependency. +replicaCount: 1 +leaderElection: + enabled: false +reconcilers: + budget: + # Faster budget reconcile in dev for a tighter feedback loop. + requeueInterval: 5m +resources: + requests: + cpu: 100m + memory: 128Mi diff --git a/addons/ai-platform/operator/values-production.yaml b/addons/ai-platform/operator/values-production.yaml index 07b9569..711cd3a 100644 --- a/addons/ai-platform/operator/values-production.yaml +++ b/addons/ai-platform/operator/values-production.yaml @@ -1,3 +1,32 @@ # eks-agent-platform operator — production deltas only (base is values.yaml). # config.environment is injected from the cluster Secret label by the # ApplicationSet, so it is intentionally not set here. + +# Three replicas with leader election for HA; one active reconciler, two warm. +replicaCount: 3 +leaderElection: + enabled: true +# Higher per-reconciler concurrency for production fleet sizes. +reconcilers: + platform: + concurrent: 5 + gateway: + concurrent: 5 + runtime: + concurrent: 10 + budget: + concurrent: 2 + requeueInterval: 1h + eval: + concurrent: 4 +resources: + requests: + cpu: 500m + memory: 512Mi + limits: + cpu: 2000m + memory: 2Gi +# Keep two replicas available through voluntary disruptions (node drains). 
+podDisruptionBudget: + enabled: true + minAvailable: 2 diff --git a/addons/ai-platform/operator/values-staging.yaml b/addons/ai-platform/operator/values-staging.yaml index e9f7372..016c0db 100644 --- a/addons/ai-platform/operator/values-staging.yaml +++ b/addons/ai-platform/operator/values-staging.yaml @@ -1,3 +1,17 @@ # eks-agent-platform operator — staging deltas only (base is values.yaml). # config.environment is injected from the cluster Secret label by the # ApplicationSet, so it is intentionally not set here. + +replicaCount: 2 +leaderElection: + enabled: true +reconcilers: + budget: + requeueInterval: 30m +resources: + requests: + cpu: 200m + memory: 256Mi + limits: + cpu: 1000m + memory: 1Gi diff --git a/addons/ai-platform/operator/values.yaml b/addons/ai-platform/operator/values.yaml index a091539..b633dfd 100644 --- a/addons/ai-platform/operator/values.yaml +++ b/addons/ai-platform/operator/values.yaml @@ -17,3 +17,17 @@ webhooks: # networkPolicy.engine defaults to cilium (the cluster CNI). The operator's # egress to the kube-apiserver uses a Cilium reserved identity that a vanilla # Kubernetes NetworkPolicy cannot match, so leave it on cilium. + +# The operator's own runtime, folded into the chart (0.2.0+). Enabled on every +# agent-platform cluster (these match the chart defaults; set explicitly so the +# deployment intent is visible at the gitops layer). +# - eval-runtime: the Argo WorkflowTemplate + SA/RBAC. The eval-runner role ARN +# and report bucket are injected per-cluster by the addons-agent-operator ApplicationSet. +# - slo: the operator's PrometheusRule + CR-state metrics ConfigMap. +# rollouts.enabled (needs the Argo Rollouts CRD) and slo.alerting.enabled (needs +# the pagerduty/slack Secrets) stay off here; production flips alerting on in +# values-production.yaml once those Secrets are provisioned. 
+evalRuntime: + enabled: true +slo: + enabled: true diff --git a/applicationsets/addons-agent-operator.yaml b/applicationsets/addons-agent-operator.yaml index 6dae9b1..e2cd416 100644 --- a/applicationsets/addons-agent-operator.yaml +++ b/applicationsets/addons-agent-operator.yaml @@ -62,6 +62,15 @@ spec: serviceAccount: annotations: eks.amazonaws.com/role-arn: '{{ index .metadata.annotations "eks-agent-platform/operator-role-arn" }}' + # eval-runtime IRSA + bucket. The role ARN embeds the account id, so + # like the operator role it is read from a cluster-Secret annotation + # published by cluster-bootstrap, never committed here. Both render + # empty until cluster-bootstrap publishes the annotations (the chart + # skips the SA annotation / leaves the bucket empty when blank). + evalRuntime: + serviceAccount: + roleArn: '{{ index .metadata.annotations "eks-agent-platform/eval-runner-role-arn" }}' + evalReportsBucket: '{{ index .metadata.annotations "eks-agent-platform/eval-reports-bucket" }}' valueFiles: - $values/addons/ai-platform/operator/values.yaml - $values/addons/ai-platform/operator/values-{{ index .metadata.labels "environment" }}.yaml