diff --git a/CHANGELOG.md b/CHANGELOG.md index f4ce5ea5..ec32da9a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file. ## [Unreleased] +### Changed + +- Set `maxSurge=1` and `maxUnavailable=0` on the OPA DaemonSet rolling update strategy to eliminate + availability gaps during rolling updates ([#819]). + +[#819]: https://github.com/stackabletech/opa-operator/pull/819 + ## [26.3.0] - 2026-03-16 ## [26.3.0-rc1] - 2026-03-16 diff --git a/rust/operator-binary/src/controller.rs b/rust/operator-binary/src/controller.rs index f3d1567c..41f9edbe 100644 --- a/rust/operator-binary/src/controller.rs +++ b/rust/operator-binary/src/controller.rs @@ -34,7 +34,7 @@ use stackable_operator::{ k8s_openapi::{ DeepMerge, api::{ - apps::v1::{DaemonSet, DaemonSetSpec}, + apps::v1::{DaemonSet, DaemonSetSpec, DaemonSetUpdateStrategy, RollingUpdateDaemonSet}, core::v1::{ ConfigMap, EmptyDirVolumeSource, EnvVar, EnvVarSource, HTTPGetAction, ObjectFieldSelector, Probe, SecretVolumeSource, ServiceAccount, @@ -1153,6 +1153,13 @@ fn build_server_rolegroup_daemonset( ..LabelSelector::default() }, template: pod_template, + update_strategy: Some(DaemonSetUpdateStrategy { + type_: Some("RollingUpdate".to_string()), + rolling_update: Some(RollingUpdateDaemonSet { + max_surge: Some(IntOrString::Int(1)), + max_unavailable: Some(IntOrString::Int(0)), + }), + }), ..DaemonSetSpec::default() }; diff --git a/rust/operator-binary/src/service.rs b/rust/operator-binary/src/service.rs index 5cfbbd4f..2b23ed99 100644 --- a/rust/operator-binary/src/service.rs +++ b/rust/operator-binary/src/service.rs @@ -63,6 +63,12 @@ pub(crate) fn build_server_role_service( type_: Some(opa.spec.cluster_config.listener_class.k8s_service_type()), ports: Some(data_service_ports(opa.spec.cluster_config.tls_enabled())), selector: Some(service_selector_labels.into()), + // This ensures that products (e.g. Trino) on a node always talk to the OPA pod on the + // same node, avoiding cross-node latency. The downside is that if the local OPA pod is + // unavailable, requests fail instead of falling back to another node. + // TODO: Once our minimum supported Kubernetes version is 1.35, use + // `trafficDistribution: PreferSameNode` instead, which prefers the local node but + // gracefully falls back to other nodes if the local pod is unavailable. internal_traffic_policy: Some("Local".to_string()), ..ServiceSpec::default() }; diff --git a/tests/templates/kuttl/smoke/10-assert.yaml.j2 b/tests/templates/kuttl/smoke/10-assert.yaml.j2 index 6b86f13f..76703fb2 100644 --- a/tests/templates/kuttl/smoke/10-assert.yaml.j2 +++ b/tests/templates/kuttl/smoke/10-assert.yaml.j2 @@ -9,6 +9,11 @@ kind: DaemonSet metadata: name: test-opa-server-default spec: + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 template: spec: containers: