Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions addons/ai-platform/operator/values-dev.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# eks-agent-platform operator — dev deltas only (base is values.yaml).
# config.environment is injected from the cluster Secret label by the
# ApplicationSet, so it is intentionally not set here.

# Single replica + no leader election — dev needs no HA, and skipping the lease
# removes a startup dependency.
replicaCount: 1
leaderElection:
  enabled: false

reconcilers:
  budget:
    # Faster budget reconcile in dev for a tighter feedback loop.
    requeueInterval: 5m

# Requests only: this file sets no limits, so any limits come from the base
# values.yaml or the chart defaults.
resources:
  requests:
    cpu: 100m
    memory: 128Mi
29 changes: 29 additions & 0 deletions addons/ai-platform/operator/values-production.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,32 @@
# eks-agent-platform operator — production deltas only (base is values.yaml).
# config.environment is injected from the cluster Secret label by the
# ApplicationSet, so it is intentionally not set here.

# Three replicas with leader election for HA: one active leader reconciles,
# the other two stay warm as standbys.
replicaCount: 3
leaderElection:
  enabled: true

# Higher per-reconciler concurrency for production fleet sizes.
reconcilers:
  platform:
    concurrent: 5
  gateway:
    concurrent: 5
  runtime:
    concurrent: 10
  budget:
    concurrent: 2
    requeueInterval: 1h
  eval:
    concurrent: 4

# Resource requests and limits for the operator in production.
resources:
  requests:
    cpu: 500m
    memory: 512Mi
  limits:
    cpu: 2000m
    memory: 2Gi

# Keep two replicas available through voluntary disruptions (node drains).
# With replicaCount: 3 this allows one pod to be evicted at a time.
podDisruptionBudget:
  enabled: true
  minAvailable: 2
14 changes: 14 additions & 0 deletions addons/ai-platform/operator/values-staging.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# eks-agent-platform operator — staging deltas only (base is values.yaml).
# config.environment is injected from the cluster Secret label by the
# ApplicationSet, so it is intentionally not set here.

# Two replicas with leader election enabled.
replicaCount: 2
leaderElection:
  enabled: true

reconcilers:
  budget:
    # Budget reconcile requeue interval for staging.
    requeueInterval: 30m

# Resource requests and limits for the operator in staging.
resources:
  requests:
    cpu: 200m
    memory: 256Mi
  limits:
    cpu: 1000m
    memory: 1Gi
14 changes: 14 additions & 0 deletions addons/ai-platform/operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,17 @@ webhooks:
# networkPolicy.engine defaults to cilium (the cluster CNI). The operator's
# egress to the kube-apiserver uses a Cilium reserved identity that a vanilla
# Kubernetes NetworkPolicy cannot match, so leave it on cilium.

# The operator's own runtime, folded into the chart (0.2.0+). Enabled on every
# agent-platform cluster (these match the chart defaults; set explicitly so the
# deployment intent is visible at the gitops layer).
# - eval-runtime: the Argo WorkflowTemplate + SA/RBAC. The eval-runner role ARN
# and report bucket are injected per-cluster by the ApplicationSet
# (applicationsets/addons-agent-operator.yaml).
# - slo: the operator's PrometheusRule + CR-state metrics ConfigMap.
# rollouts.enabled (needs the Argo Rollouts CRD) and slo.alerting.enabled (needs
# the pagerduty/slack Secrets) stay off here; production flips alerting on in
# values-production.yaml once those Secrets are provisioned.
# eval-runtime (Argo WorkflowTemplate + SA/RBAC): on for every cluster.
evalRuntime:
  enabled: true
# slo (PrometheusRule + CR-state metrics ConfigMap): on for every cluster.
slo:
  enabled: true
9 changes: 9 additions & 0 deletions applicationsets/addons-agent-operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,15 @@ spec:
serviceAccount:
annotations:
eks.amazonaws.com/role-arn: '{{ index .metadata.annotations "eks-agent-platform/operator-role-arn" }}'
# eval-runtime IRSA + bucket. The role ARN embeds the account id, so
# like the operator role it is read from a cluster-Secret annotation
# published by cluster-bootstrap, never committed here. Both render
# empty until cluster-bootstrap publishes the annotations (the chart
# skips the SA annotation / leaves the bucket empty when blank).
evalRuntime:
serviceAccount:
roleArn: '{{ index .metadata.annotations "eks-agent-platform/eval-runner-role-arn" }}'
evalReportsBucket: '{{ index .metadata.annotations "eks-agent-platform/eval-reports-bucket" }}'
valueFiles:
- $values/addons/ai-platform/operator/values.yaml
- $values/addons/ai-platform/operator/values-{{ index .metadata.labels "environment" }}.yaml
Expand Down