diff --git a/.gitignore b/.gitignore
index 92b40475..8faae3a7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
-*.sh
 *.lock
 dist/
 .vscode
-temp/
\ No newline at end of file
+temp/
+*.tgz
diff --git a/Justfile b/Justfile
index 7cd0c6b3..d741f615 100644
--- a/Justfile
+++ b/Justfile
@@ -28,3 +28,30 @@ package:
 validate: lint test
     @echo "=== All validations passed ==="
 
+# Create the kind cluster and shared fixtures used by the functional suite
+functional-setup:
+    ./functional-tests/setup.sh
+
+# Tear down fixtures (pass --cluster to also delete the kind cluster)
+functional-teardown *ARGS:
+    ./functional-tests/teardown.sh {{ARGS}}
+
+# Run one scenario against the already-set-up kind cluster, e.g.
+#   just functional-scenario off off on on sidecar
+# tls/auth/shard/rep are on|off; istio is off|sidecar|ambient.
+functional-scenario tls auth shard rep istio:
+    ./functional-tests/run-scenario.sh {{tls}} {{auth}} {{shard}} {{rep}} {{istio}}
+
+# Run the 48-scenario matrix, then the extras (FILTER='tls=on istio=ambient' narrows the matrix and skips the extras)
+functional-run:
+    ./functional-tests/run-all.sh
+
+# Run the extra (non-matrix) regression scenarios on their own
+functional-extras:
+    ./functional-tests/run-extra-scenarios.sh
+
+# Full functional suite: setup + matrix + extras + teardown including cluster (a failing step stops before teardown)
+functional-test:
+    ./functional-tests/setup.sh
+    ./functional-tests/run-all.sh
+    ./functional-tests/teardown.sh --cluster
diff --git a/functional-tests/kind-config.yaml b/functional-tests/kind-config.yaml
new file mode 100644
index 00000000..3c58a0b5
--- /dev/null
+++ b/functional-tests/kind-config.yaml
@@ -0,0 +1,5 @@
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+name: valkey-functional
+nodes:
+  - role: control-plane
diff --git a/functional-tests/lib.sh b/functional-tests/lib.sh
new file mode 100755
index 00000000..f4adda7c
--- /dev/null
+++ b/functional-tests/lib.sh
@@ -0,0 +1,68 @@
+# Shared helpers for Valkey functional tests.
+# Sourced by every script under functional-tests/.
+
+set -euo pipefail
+
+HERE=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
+REPO_ROOT=$(cd -- "${HERE}/.." && pwd)
+CHART_DIR=${REPO_ROOT}/valkey
+
+CLUSTER_NAME=${VALKEY_KIND_CLUSTER:-valkey-functional}
+KUBE_CONTEXT=kind-${CLUSTER_NAME}
+NAMESPACE=${VALKEY_FUNCTIONAL_NAMESPACE:-default}
+RELEASE=${VALKEY_RELEASE:-valkey}
+
+AUTH_SECRET=valkey-auth
+TLS_SECRET=valkey-tls
+# Three testbenches, covering every shape of mesh participation:
+#   valkey-testbench          — never gets an Envoy sidecar (istio=off
+#                               scenarios, or when Istio isn't installed at
+#                               all). Opts out of both sidecar injection
+#                               and ambient capture.
+#   valkey-testbench-injected — sidecar-injected (istio=on, mode=sidecar).
+#   valkey-testbench-ambient  — ambient-enrolled (istio=on, mode=ambient):
+#                               no sidecar, ztunnel captures its traffic so
+#                               it presents the expected SPIFFE identity to
+#                               Valkey pods' AuthorizationPolicy.
+TESTBENCH_POD=valkey-testbench
+TESTBENCH_POD_INJECTED=valkey-testbench-injected
+TESTBENCH_POD_AMBIENT=valkey-testbench-ambient
+# Deliberately hostile: spaces, shell metacharacters ($, `, &), a backslash,
+# and double-quotes. Every auth=on scenario then exercises each place the
+# chart has to quote or escape the password:
+#   - the init container's ACL hash pipe (printf %s | sha256sum)
+#   - the masterauth line in valkey.conf (must be quoted+escaped)
+#   - the cluster-init Job's REDISCLI_AUTH path
+#   - the helm-test pod's `cat /valkey-auth/...-password | xargs valkey-cli -a`
+# Keeping these in one place means every future auth=on scenario inherits the
+# coverage for free.
+AUTH_PASSWORD='p@ss w/ spaces & $chars `backticks` "quoted" \backslash'
+
+ISTIO_NAMESPACE=istio-system
+
+log() { local msg="$*"; printf '=== %s ===\n' "${msg}"; }  # banner line on stdout
+
+kctl() { kubectl --context "${KUBE_CONTEXT}" -n "${NAMESPACE}" "$@"; }  # kubectl pinned to the test cluster/namespace
+hctl() { helm --kube-context "${KUBE_CONTEXT}" -n "${NAMESPACE}" "$@"; }  # helm pinned the same way
+
+# kubectl exec into a testbench. First arg is the pod name; rest is the command.
+testbench_exec_in() {
+    local target="${1}"   # the container is addressed by the same name as its pod
+    kctl exec "${target}" --container="${target}" -- "${@:2}"
+}
+
+wait_for_testbench() {
+    # Block (for at most 180s) until the pod named by $1 reports Ready.
+    kctl wait "pod/${1}" --for=condition=Ready --timeout=180s
+}
+
+istio_installed() {
+    kubectl --context "${KUBE_CONTEXT}" get namespace "${ISTIO_NAMESPACE}" &>/dev/null
+}
+
+# Whether the cluster has Istio's ambient data plane (ztunnel DaemonSet)
+# installed. Scenarios that require ambient exit-skip if this returns false.
+istio_ambient_installed() {
+    kubectl --context "${KUBE_CONTEXT}" --namespace "${ISTIO_NAMESPACE}" \
+        get daemonset/ztunnel &>/dev/null
+}
diff --git a/functional-tests/run-all.sh b/functional-tests/run-all.sh
new file mode 100755
index 00000000..3ddbfbc4
--- /dev/null
+++ b/functional-tests/run-all.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+# Drive every scenario in the matrix, sequentially. Assumes `setup.sh`
+# has already created the kind cluster and fixtures.
+
+HERE=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=lib.sh
+. "${HERE}/lib.sh"
+
+# 48 scenarios: every combination of tls/auth/shard/rep × istio=
+# off|sidecar|ambient. The istio dimension is three-valued rather than two
+# because sidecar and ambient share almost nothing below the chart-owned
+# templates — different label paths, different mTLS enforcement points
+# (Envoy vs ztunnel), different rendered resources (DestinationRule only
+# in sidecar; AuthorizationPolicy in both but enforced differently). Keep
+# them both in the matrix so a regression in one mode can't hide behind a
+# passing result in the other.
+SCENARIOS=()
+# Brace expansion enumerates the sixteen on/off tuples for
+# "tls auth shard rep", leftmost field varying slowest and rightmost
+# fastest (off before on). The outer loop appends the istio mode, so the
+# list runs through every istio=off tuple first, then sidecar, then
+# ambient. Each entry is one space-separated string:
+#   "<tls> <auth> <shard> <rep> <istio>"
+for istio in off sidecar ambient; do
+    for combo in {off,on}" "{off,on}" "{off,on}" "{off,on}; do
+        SCENARIOS+=("${combo} ${istio}")
+    done
+done
+
+# Optional filter: `FILTER='tls=on istio=ambient'` runs only matching
+# scenarios. Filter values for `istio` are off|sidecar|ambient; `on` is
+# accepted as an alias for "sidecar or ambient" to keep old habits working.
+matches() {
+    local tls=$1 auth=$2 shard=$3 rep=$4 istio=$5 sel k v have
+    for sel in ${FILTER:-}; do  # unquoted on purpose: FILTER is split on whitespace
+        # Without '=' both k and v become the whole word, which matches nothing.
+        [[ ${sel} == *=* ]] || { echo "bad filter (want key=value): ${sel}" >&2; exit 2; }
+        k=${sel%=*} v=${sel#*=}
+        case "${k}" in
+            tls)   have=${tls} ;;
+            auth)  have=${auth} ;;
+            shard) have=${shard} ;;
+            rep)   have=${rep} ;;
+            istio)
+                if [[ ${v} == on ]]; then  # `on` = alias for "sidecar or ambient"
+                    [[ ${istio} == sidecar || ${istio} == ambient ]] || return 1
+                    continue
+                fi
+                have=${istio} ;;
+            *) echo "bad filter key: ${k}" >&2; exit 2 ;;
+        esac
+        [[ ${have} == "${v}" ]] || return 1  # first mismatching selector rejects the scenario
+    done
+    return 0
+}
+
+passed=0
+failed=0
+skipped=0
+failures=()
+
+for s in "${SCENARIOS[@]}"; do
+    # Split the space-separated tuple back into its five fields.
+    read -r tls auth shard rep istio <<<"${s}"
+    if ! matches "${tls}" "${auth}" "${shard}" "${rep}" "${istio}"; then
+        continue
+    fi
+
+    # Ambient scenarios require ztunnel to be installed. setup.sh now
+    # installs the ambient profile by default, but a user running against
+    # a pre-existing cluster might have only the sidecar data plane —
+    # skip rather than fail in that case so the rest of the matrix still
+    # runs.
+    if [[ ${istio} == ambient ]] && ! istio_ambient_installed; then
+        log "SKIP: tls=${tls} auth=${auth} shard=${shard} rep=${rep} istio=${istio} (ztunnel not installed)"
+        skipped=$(( skipped + 1 ))
+        continue
+    fi
+
+    log "SCENARIO: tls=${tls} auth=${auth} shard=${shard} rep=${rep} istio=${istio}"
+    if "${HERE}/run-scenario.sh" "${tls}" "${auth}" "${shard}" "${rep}" "${istio}"; then
+        passed=$(( passed + 1 ))
+    else
+        failed=$(( failed + 1 ))
+        failures+=("tls=${tls} auth=${auth} shard=${shard} rep=${rep} istio=${istio}")
+    fi
+done
+
+echo
+log "Matrix summary: ${passed} passed, ${failed} failed, ${skipped} skipped"
+if (( failed > 0 )); then
+    printf '  failed: %s\n' "${failures[@]}"
+    exit 1
+fi
+
+# Extra, non-matrix regressions (aclConfig+metrics, default-deny netpol,
+# cross-release MEET isolation, ambient validator footguns, Prometheus
+# scraping, etc.). Each one is independent of the tls/auth/shard/rep
+# combinations — folding them into the matrix would just pay the
+# install/teardown cost N times to exercise the same single assertion.
+# Skipped when FILTER is set: filters are matrix-scoped, so the extras
+# wouldn't match anyway and running them would be surprising.
+if [[ -z ${FILTER:-} ]]; then
+    "${HERE}/run-extra-scenarios.sh"
+fi
diff --git a/functional-tests/run-extra-scenarios.sh b/functional-tests/run-extra-scenarios.sh
new file mode 100755
index 00000000..87f0ef59
--- /dev/null
+++ b/functional-tests/run-extra-scenarios.sh
@@ -0,0 +1,1084 @@
+#!/usr/bin/env bash
+# Targeted regressions that don't fit the tls/auth/shard/rep/istio matrix.
+# Each scenario is self-contained: install, assert, uninstall.
+
+HERE=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=lib.sh
+. "${HERE}/lib.sh"
+
+RESULTS=()  # one "PASS: ..." / "FAIL: ..." entry per recorded scenario outcome
+pass() { RESULTS+=("PASS: $1"); }  # $1 = scenario name
+fail() { RESULTS+=("FAIL: $1: $2"); return 1; }  # $1 = scenario name, $2 = reason; always returns 1
+
+cleanup_release() {
+    hctl uninstall "${RELEASE}" 2>/dev/null || :   # a failed uninstall is tolerated
+    kctl delete pvc -l "app.kubernetes.io/instance=${RELEASE}" --ignore-not-found >/dev/null
+}
+
+# ---------------------------------------------------------------------------
+# Scenario: auth.enabled=true with aclConfig only (no aclUsers) and metrics
+# enabled. This used to CrashLoop the exporter with CreateContainerConfigError
+# because the chart pointed REDIS_PASSWORD at a key `default-password` that
+# only exists when there's an inline aclUsers.default.password. The fix is to
+# only wire REDIS_PASSWORD when a real key exists.
+# ---------------------------------------------------------------------------
+scenario_aclconfig_metrics() {
+    local name="aclConfig-only + metrics exporter must not crash"
+    log "SCENARIO: ${name}"
+    cleanup_release
+
+    # Use an alternate release name to avoid colliding with the shared
+    # fixture secret `valkey-auth` (managed by setup.sh, not Helm). The chart
+    # generates `${release}-auth`, so a different release ⇒ a different secret.
+    local release="${RELEASE}-aclcfg"
+    hctl uninstall "${release}" 2>/dev/null || true
+    kctl delete pvc --selector="app.kubernetes.io/instance=${release}" --ignore-not-found >/dev/null
+
+    if ! hctl install "${release}" "${CHART_DIR}" \
+            --set=metrics.enabled=true \
+            --set=auth.enabled=true \
+            --set-string="auth.aclConfig=user default on >simplepass ~* &* +@all" \
+            --wait --timeout=180s >/dev/null; then
+        fail "${name}" "helm install failed"
+        hctl uninstall "${release}" 2>/dev/null || true
+        return 1  # a bare `return` here would report the uninstall's status (0)
+    fi
+
+    # Main container must be Running, metrics sidecar must be Ready. The bug
+    # made the metrics container stick in CreateContainerConfigError forever —
+    # no amount of probe-waiting would ever flip it to Ready.
+    local pod
+    pod=$(kctl get pod -l "app.kubernetes.io/instance=${release}" \
+        -o jsonpath='{.items[0].metadata.name}')
+    if ! kctl wait "pod/${pod}" \
+            --for=condition=Ready --timeout=120s >/dev/null; then
+        local status
+        status=$(kctl get "pod/${pod}" -o jsonpath='{.status.containerStatuses[*].state}')
+        fail "${name}" "pod never became Ready (state=${status})"
+        hctl uninstall "${release}" 2>/dev/null || true
+        return 1
+    fi
+
+    # Metrics endpoint actually responds. Use `kubectl port-forward` into a
+    # local port — lets us hit the exporter from the host with curl, without
+    # relying on either container having an HTTP client.
+    local pf_port=19121 pf_pid
+    kctl port-forward "pod/${pod}" "${pf_port}:9121" >/dev/null 2>&1 &
+    pf_pid=$!
+    # Give port-forward a moment to establish (up to 20 probes, 0.5s apart).
+    for _ in $(seq 1 20); do
+        if curl -sf --max-time 1 "http://127.0.0.1:${pf_port}/metrics" \
+                >/dev/null 2>&1; then
+            break
+        fi
+        sleep 0.5
+    done
+
+    local metrics_out
+    metrics_out=$(curl -sf --max-time 5 "http://127.0.0.1:${pf_port}/metrics" \
+        2>/dev/null || true)
+    kill "${pf_pid}" 2>/dev/null || true
+    wait "${pf_pid}" 2>/dev/null || true
+
+    if ! grep -q 'redis_exporter_' <<<"${metrics_out}"; then
+        fail "${name}" "/metrics did not serve redis_exporter_* counters"
+        hctl uninstall "${release}" 2>/dev/null || true
+        return 1
+    fi
+
+    hctl uninstall "${release}" 2>/dev/null || true
+    kctl delete pvc --selector="app.kubernetes.io/instance=${release}" --ignore-not-found >/dev/null
+    pass "${name}"
+}
+
+# ---------------------------------------------------------------------------
+# Scenario: default-deny NetworkPolicy. Previously `networkPolicy.ingress: []`
+# rendered an invalid policy (policyTypes: []), which the API accepts but is a
+# no-op. The fix gates on hasKey, so an empty list still opts in.
+# ---------------------------------------------------------------------------
+scenario_default_deny_netpol() {
+    local name="networkPolicy.ingress=[] produces a real default-deny policy"
+    log "SCENARIO: ${name}"
+    cleanup_release
+
+    if ! hctl install "${RELEASE}" "${CHART_DIR}" \
+            --set-json='networkPolicy={"ingress":[]}' \
+            --wait --timeout=120s >/dev/null; then
+        fail "${name}" "helm install failed"
+        return
+    fi
+
+    # The original bug: `networkPolicy.ingress: []` rendered `policyTypes: []`,
+    # which Kubernetes treats as "no policy in either direction" — silently
+    # allowing all traffic despite the user clearly opting into default-deny.
+    # The fix is to gate on hasKey, not truthiness.
+    #
+    # Checking via the API alone is fragile (kube-apiserver drops empty lists
+    # on serialization), so:
+    #   1) Assert policyTypes contains Ingress.
+    #   2) Actually attempt a TCP connection from the testbench — a real
+    #      default-deny policy blocks it; a no-op policy lets it through.
+    local types
+    types=$(kctl get networkpolicy "${RELEASE}" \
+        -o jsonpath='{.spec.policyTypes[*]}')
+    if [[ ${types} != *Ingress* ]]; then
+        fail "${name}" "policyTypes=${types} (want to include Ingress)"
+        return
+    fi
+
+    # Live traffic check against the release's own Service name, so the probe
+    # still aims at a real host when VALKEY_RELEASE is overridden. A
+    # default-deny policy drops SYNs, hence the 5-second cap on the attempt.
+    set +e
+    testbench_exec_in "${TESTBENCH_POD}" sh -c \
+        "timeout 5 valkey-cli -h ${RELEASE}.${NAMESPACE}.svc.cluster.local ping" >/dev/null 2>&1
+    local rc=$?
+    set -e
+    if (( rc == 0 )); then
+        fail "${name}" "ping succeeded — default-deny ingress policy is a no-op"
+        return
+    fi
+
+    cleanup_release
+    pass "${name}"
+}
+
+# ---------------------------------------------------------------------------
+# Scenario: frontend Service must never expose the cluster bus port. The bus
+# port is pod-to-pod gossip; routing it through a round-robin ClusterIP
+# misdirects clients to arbitrary nodes.
+# ---------------------------------------------------------------------------
+scenario_bus_port_hidden() {
+    local name="frontend service does not expose the cluster bus port"
+    log "SCENARIO: ${name}"
+    cleanup_release
+
+    # Three shards without replicas, with the bus on an explicit port.
+    local -a chart_args=(
+        --set=cluster.enabled=true
+        --set=cluster.persistence.size=100Mi
+        --set=cluster.shards=3
+        --set=cluster.replicasPerShard=0
+        --set=cluster.busPort=16379
+    )
+    if ! hctl install "${RELEASE}" "${CHART_DIR}" "${chart_args[@]}" \
+            --wait --timeout=300s >/dev/null; then
+        fail "${name}" "helm install failed"
+        return
+    fi
+
+    local frontend_ports headless_ports
+    frontend_ports=$(kctl get service "${RELEASE}" -o jsonpath='{.spec.ports[*].name}')
+    headless_ports=$(kctl get service "${RELEASE}-headless" -o jsonpath='{.spec.ports[*].name}')
+
+    if grep -qw tcp-bus <<<"${frontend_ports}"; then
+        fail "${name}" "frontend exposes tcp-bus (ports=${frontend_ports})"
+        return
+    elif ! grep -qw tcp-bus <<<"${headless_ports}"; then
+        fail "${name}" "headless missing tcp-bus (ports=${headless_ports})"
+        return
+    fi
+
+    cleanup_release
+    pass "${name}"
+}
+
+# ---------------------------------------------------------------------------
+# Scenario: readiness probe must exist on the valkey container. Previously
+# only startup+liveness were defined, so a pod that lost server health but
+# kept the TCP socket would keep receiving traffic.
+# ---------------------------------------------------------------------------
+scenario_readiness_probe_exists() {
+    local name="valkey container declares a readiness probe"
+    log "SCENARIO: ${name}"
+    cleanup_release
+
+    # Install with chart defaults; the assertion only reads the pod template.
+    hctl install "${RELEASE}" "${CHART_DIR}" --wait --timeout=120s >/dev/null || {
+        fail "${name}" "helm install failed"
+        return
+    }
+
+    # exec.command of the first container's readinessProbe, as stored.
+    local jp='{.spec.template.spec.containers[0].readinessProbe.exec.command}'
+    local probe
+    probe=$(kctl get deployment "${RELEASE}" -o jsonpath="${jp}")
+    if [[ -z ${probe} ]]; then
+        fail "${name}" "readinessProbe is missing"
+        return
+    elif [[ ${probe} != *NOAUTH* ]]; then
+        # Present, but not the NOAUTH-tolerant flavour.
+        fail "${name}" "readinessProbe does not tolerate NOAUTH (${probe})"
+        return
+    fi
+
+    cleanup_release
+    pass "${name}"
+}
+
+# ---------------------------------------------------------------------------
+# Scenario: two independent Valkey clusters in the same namespace must stay
+# independent. Valkey's CLUSTER MEET has no auth, so a MEET issued by (or
+# forwarded through) a node in cluster A can merge cluster B into it. The
+# chart's cluster-isolation NetworkPolicy pins the bus port to same-release
+# pods; without it, a stray MEET wins.
+#
+# This test:
+#   1) installs `valkey-a` and `valkey-b` in the same namespace, cluster mode;
+#   2) issues CLUSTER MEET from a node in A targeting a node in B;
+#   3) waits for gossip to propagate;
+#   4) asserts A still has its original 3 nodes (not 6).
+#
+# Also runs a negative twin with `cluster.isolation.enabled=false` to prove
+# the assertion has teeth — if isolation is the thing keeping them apart,
+# disabling it must let the merge happen.
+# ---------------------------------------------------------------------------
+
+# Install one cluster-mode release with a given name and isolation flag.
+# Globals it expects: NAMESPACE, CHART_DIR, KUBE_CONTEXT.
+install_cluster() {
+    local release=$1 isolation=$2
+    local -a flags=(
+        --set=cluster.enabled=true --set=cluster.persistence.size=100Mi
+        --set=cluster.shards=3 --set=cluster.replicasPerShard=0
+        --set="cluster.isolation.enabled=${isolation}"
+    )
+    # 3 shards with 0 replicas each; helm blocks (up to 300s) until ready.
+    hctl install "${release}" "${CHART_DIR}" "${flags[@]}" --wait --timeout=300s >/dev/null
+}
+
+# Count unique nodes reported by `cluster nodes` on pod-0 of the given release.
+# Prints 0 if the query itself fails (counts as "indeterminate").
+count_cluster_nodes() {
+    local release=$1
+    # Keep field 1 (the node ID) of every non-blank line, then count the distinct IDs.
+    kctl exec "${release}-0" -c "${release}" -- sh -c \
+        "valkey-cli cluster nodes 2>/dev/null | awk 'NF {print \$1}' | sort -u | wc -l" \
+        2>/dev/null | tr -d '[:space:]' || echo 0
+}
+
+# Fire CLUSTER MEET from src_release pod-0 targeting dst_release pod-0.
+poison_meet() {
+    local from=$1 to=$2 target_ip
+    target_ip=$(kctl get pod "${to}-0" -o jsonpath='{.status.podIP}')
+    if [[ -z ${target_ip} ]]; then return 1; fi
+    # Best-effort: the MEET's own exit status and output are discarded.
+    kctl exec "${from}-0" -c "${from}" -- \
+        valkey-cli cluster meet "${target_ip}" 6379 >/dev/null 2>&1 || true
+}
+
+cleanup_pair() {
+    local rel
+    for rel in valkey-iso-a valkey-iso-b; do hctl uninstall "${rel}" 2>/dev/null || true; done
+    kctl delete pvc -l 'app.kubernetes.io/instance=valkey-iso-a' --ignore-not-found >/dev/null
+    kctl delete pvc -l 'app.kubernetes.io/instance=valkey-iso-b' --ignore-not-found >/dev/null
+}
+
+scenario_two_clusters_isolated() {
+    local name="two cluster-mode releases in one namespace stay isolated"
+    log "SCENARIO: ${name}"
+    cleanup_pair
+
+    # Failure paths `return 1` explicitly; a bare `return` would report cleanup_pair's status.
+    if ! install_cluster valkey-iso-a true; then
+        fail "${name}" "install of valkey-iso-a failed"; cleanup_pair; return 1
+    fi
+    if ! install_cluster valkey-iso-b true; then
+        fail "${name}" "install of valkey-iso-b failed"; cleanup_pair; return 1
+    fi
+
+    # Baseline — each cluster should see exactly 3 nodes (3 shards, 0 replicas).
+    local a_before b_before
+    a_before=$(count_cluster_nodes valkey-iso-a)
+    b_before=$(count_cluster_nodes valkey-iso-b)
+    if [[ ${a_before} != 3 || ${b_before} != 3 ]]; then
+        fail "${name}" "baseline wrong (a=${a_before}, b=${b_before}; want 3+3)"
+        cleanup_pair; return 1
+    fi
+
+    # Try to merge B into A; a MEET that could not be aimed must not pass as "isolated".
+    poison_meet valkey-iso-a valkey-iso-b || { fail "${name}" "no pod IP for valkey-iso-b-0"; cleanup_pair; return 1; }
+
+    # After a MEET, Valkey adds the peer to `cluster nodes` immediately as a
+    # handshake placeholder — so a count of 4 for a few seconds is EXPECTED
+    # whether or not the merge ultimately succeeds. The real signal is what
+    # happens *after* the handshake timeout: if bus connectivity exists, the
+    # node stays (count stays at 4+); if isolation blocks the bus, the
+    # handshake fails and the placeholder is evicted (count returns to 3).
+    # Cluster node-timeout defaults to 15s; give the failure detector
+    # multiple intervals to fire, then sample.
+    sleep 45
+
+    # After settling, the merge must NOT have stuck.
+    local a_after b_after
+    a_after=$(count_cluster_nodes valkey-iso-a)
+    b_after=$(count_cluster_nodes valkey-iso-b)
+
+    if [[ ${a_after} != 3 || ${b_after} != 3 ]]; then
+        fail "${name}" "clusters merged (a=${a_after}, b=${b_after}; want 3+3 after settle)"
+        cleanup_pair; return 1
+    fi
+
+    cleanup_pair
+    pass "${name}"
+}
+
+# Negative twin: without isolation, the SAME MEET must succeed — otherwise
+# the positive test isn't proving what we think it's proving.
+scenario_isolation_off_lets_merge_happen() {
+    local name="disabling isolation lets CLUSTER MEET actually merge (teeth check)"
+    log "SCENARIO: ${name}"
+    cleanup_pair
+
+    if ! install_cluster valkey-iso-a false; then
+        fail "${name}" "install of valkey-iso-a failed"; cleanup_pair; return 1
+    fi
+    if ! install_cluster valkey-iso-b false; then
+        fail "${name}" "install of valkey-iso-b failed"; cleanup_pair; return 1
+    fi
+
+    # Failure paths `return 1` explicitly; a bare `return` would report cleanup_pair's status.
+    poison_meet valkey-iso-a valkey-iso-b || { fail "${name}" "no pod IP for valkey-iso-b-0"; cleanup_pair; return 1; }
+
+    # Mirror the positive test's 45-second settle window: same question (has the
+    # handshake completed?), same time needed for the node-timeout to fire.
+    sleep 45
+
+    local a_after
+    a_after=$(count_cluster_nodes valkey-iso-a)
+    if [[ ${a_after} -le 3 ]]; then
+        fail "${name}" "MEET did not merge even without isolation (a=${a_after}); positive test cannot prove isolation works"
+        cleanup_pair; return 1
+    fi
+
+    cleanup_pair
+    pass "${name}"
+}
+
+# ---------------------------------------------------------------------------
+# Ambient-only regressions. Each of these tests a behaviour that's
+# independent of the tls/auth/shard/rep dimensions, so it lives here
+# rather than inflating the matrix with 16 copies of the same assertion.
+# Each self-skips if the cluster lacks the ambient data plane.
+# ---------------------------------------------------------------------------
+
+install_ambient_cluster() {
+    local release=$1
+    # 3 shards, 0 replicas, enrolled in ambient mode, with the chart's own
+    # cluster isolation switched off.
+    local -a flags=(
+        --set=istio.enabled=true --set=istio.mode=ambient
+        --set=cluster.enabled=true --set=cluster.persistence.size=100Mi
+        --set=cluster.shards=3 --set=cluster.replicasPerShard=0
+        --set=cluster.isolation.enabled=false
+    )
+    hctl install "${release}" "${CHART_DIR}" "${flags[@]}" --wait --timeout=300s >/dev/null
+}
+
+count_cluster_nodes_ambient() {
+    # Counting nodes is the same query whatever the data plane, so delegate
+    # to count_cluster_nodes rather than keep a second copy that could drift.
+    # $1 = release name; prints the node count (0 when the query fails).
+    count_cluster_nodes "$1"
+}
+
+poison_meet_ambient() {
+    # Issuing CLUSTER MEET is the same command in ambient mode; reuse
+    # poison_meet rather than duplicating it. $1 = source release, $2 =
+    # destination release. Returns 1 when the destination pod IP comes back
+    # empty, otherwise 0 (the MEET itself is best-effort).
+    poison_meet "$1" "$2"
+}
+
+cleanup_ambient_pair() {
+    local rel
+    for rel in valkey-amb-a valkey-amb-b; do hctl uninstall "${rel}" 2>/dev/null || true; done
+    kctl delete pvc -l 'app.kubernetes.io/instance=valkey-amb-a' --ignore-not-found >/dev/null
+    kctl delete pvc -l 'app.kubernetes.io/instance=valkey-amb-b' --ignore-not-found >/dev/null
+}
+
+# Cross-release CLUSTER MEET must be blocked by the ambient
+# AuthorizationPolicy. Analogous to scenario_two_clusters_isolated above
+# but driven at L4 via ztunnel rather than by NetworkPolicy (the
+# NetworkPolicy is intentionally skipped in ambient — it would drop
+# HBONE). The ONLY thing stopping the merge here is the AP, so we
+# disable cluster.isolation.enabled to force that.
+scenario_ambient_authz_blocks_cross_release_meet() {
+    local name="ambient: AuthorizationPolicy blocks cross-release CLUSTER MEET"
+    log "SCENARIO: ${name}"
+    if ! istio_ambient_installed; then
+        log "SKIP: ${name} (ztunnel not installed)"
+        return
+    fi
+    cleanup_ambient_pair
+
+    if ! install_ambient_cluster valkey-amb-a; then
+        fail "${name}" "install of valkey-amb-a failed"; cleanup_ambient_pair; return 1
+    fi
+    if ! install_ambient_cluster valkey-amb-b; then
+        fail "${name}" "install of valkey-amb-b failed"; cleanup_ambient_pair; return 1
+    fi
+    kctl wait --for=condition=complete job/valkey-amb-a-cluster-init --timeout=300s >/dev/null
+    kctl wait --for=condition=complete job/valkey-amb-b-cluster-init --timeout=300s >/dev/null
+
+    local a_before b_before
+    a_before=$(count_cluster_nodes_ambient valkey-amb-a)
+    b_before=$(count_cluster_nodes_ambient valkey-amb-b)
+    if [[ ${a_before} != 3 || ${b_before} != 3 ]]; then
+        fail "${name}" "baseline wrong (a=${a_before}, b=${b_before}; want 3+3)"
+        cleanup_ambient_pair; return 1
+    fi
+
+    poison_meet_ambient valkey-amb-a valkey-amb-b || { fail "${name}" "no pod IP for valkey-amb-b-0"; cleanup_ambient_pair; return 1; }
+
+    # Failure paths `return 1` explicitly; a bare `return` would report the cleanup's status.
+    # Same rationale as the NetworkPolicy isolation test: `cluster nodes` on A
+    # briefly shows 4 as a handshake placeholder after the MEET; the real signal
+    # is post-settle. Node-timeout defaults to 15s; give it multiple intervals.
+    sleep 45
+
+    local a_after b_after
+    a_after=$(count_cluster_nodes_ambient valkey-amb-a)
+    b_after=$(count_cluster_nodes_ambient valkey-amb-b)
+    if [[ ${a_after} != 3 || ${b_after} != 3 ]]; then
+        fail "${name}" "clusters merged despite AuthorizationPolicy (a=${a_after}, b=${b_after}; want 3+3)"
+        cleanup_ambient_pair; return 1
+    fi
+
+    cleanup_ambient_pair
+    pass "${name}"
+}
+
+# The chart must refuse to install in ambient+cluster mode when the
+# AuthorizationPolicy is explicitly disabled — dropping it leaves the bus
+# port with NO cross-release protection (the NetworkPolicy is also
+# skipped in ambient to avoid blocking HBONE). Fail-closed at template
+# time so nobody silently ships an open cluster.
+scenario_ambient_ap_disabled_refused() {
+    local name="ambient: chart refuses install when authorizationPolicy.enabled=false + cluster"
+    log "SCENARIO: ${name}"
+    cleanup_release
+
+    # Dry-run only: keep the combined stdout+stderr and the exit status.
+    local out rc
+    set +e
+    out=$(hctl install "${RELEASE}" "${CHART_DIR}" \
+            --set=istio.enabled=true \
+            --set=istio.mode=ambient \
+            --set=cluster.enabled=true \
+            --set=cluster.shards=3 \
+            --set=cluster.replicasPerShard=0 \
+            --set=cluster.persistence.size=100Mi \
+            --set=istio.authorizationPolicy.enabled=false \
+            --dry-run 2>&1)
+    rc=$?
+    set -e
+
+    if (( rc == 0 )); then
+        fail "${name}" "dry-run succeeded but should have failed: ${out}"
+        return
+    elif [[ ${out} != *'cluster-bus port unprotected'* ]]; then
+        fail "${name}" "got error without the expected message (rc=${rc}): ${out}"
+        return
+    fi
+    pass "${name}"
+}
+
+# The chart must refuse when ambient + cluster + serviceAccount.create=false
+# with no explicit name, because every release collapses to the namespace's
+# `default` SA and the AP can no longer distinguish releases. Repro'd live
+# during review: two clusters merged despite both having the AP rendered.
+scenario_ambient_shared_default_sa_refused() {
+    local name="ambient: chart refuses install when serviceAccount defaults to namespace-wide 'default'"
+    log "SCENARIO: ${name}"
+    cleanup_release
+
+    # Dry-run only: keep the combined stdout+stderr and the exit status.
+    local out rc
+    set +e
+    out=$(hctl install "${RELEASE}" "${CHART_DIR}" \
+            --set=istio.enabled=true \
+            --set=istio.mode=ambient \
+            --set=cluster.enabled=true \
+            --set=cluster.shards=3 \
+            --set=cluster.replicasPerShard=0 \
+            --set=cluster.persistence.size=100Mi \
+            --set=serviceAccount.create=false \
+            --dry-run 2>&1)
+    rc=$?
+    set -e
+
+    if (( rc == 0 )); then
+        fail "${name}" "dry-run succeeded but should have failed: ${out}"
+        return
+    elif ! grep -q "serviceAccount.create=false AND serviceAccount.name empty" <<<"${out}"; then
+        fail "${name}" "got error without the expected message (rc=${rc}): ${out}"
+        return
+    fi
+    pass "${name}"
+}
+
+# Custom trustDomain must propagate into the AuthorizationPolicy principal.
+# A cluster with `istio.trustDomain=my.mesh.example.com` whose AP still
+# emits `cluster.local/…` would self-deny: same-release callers present an
+# identity under the CUSTOM trust domain but the AP's ALLOW rule only
+# matches the hardcoded one, so the bus port default-denies even for its
+# own pods.
+# We don't actually reconfigure Istio's trust domain here — that's a
+# cluster-wide concern, not chart-level — so the install does NOT fully
+# converge. The test inspects the rendered AP to confirm the principal
+# string follows the override. That's the piece the chart owns.
+scenario_ambient_trustdomain_override() {
+    local name="ambient: AP principal follows istio.trustDomain override"
+    log "SCENARIO: ${name}"
+    if ! istio_ambient_installed; then
+        log "SKIP: ${name} (ztunnel not installed)"
+        return
+    fi
+    cleanup_release
+
+    # The install result is deliberately ignored: the mesh's real trust
+    # domain is still cluster.local, so the release is not expected to
+    # converge. Only the rendered AuthorizationPolicy matters here.
+    hctl install "${RELEASE}" "${CHART_DIR}" \
+        --set=istio.enabled=true \
+        --set=istio.mode=ambient \
+        --set=cluster.enabled=true \
+        --set=cluster.shards=3 \
+        --set=cluster.replicasPerShard=0 \
+        --set=cluster.persistence.size=100Mi \
+        --set=istio.trustDomain=my.mesh.example.com \
+        --wait --timeout=240s >/dev/null 2>&1 || true
+
+    # Read back the principals of the first rule's first source and compare
+    # them with the identity built from the overridden trust domain.
+    local want="my.mesh.example.com/ns/${NAMESPACE}/sa/${RELEASE}" principals
+    principals=$(kctl get authorizationpolicy "${RELEASE}-cluster-bus" \
+        -o jsonpath='{.spec.rules[0].from[0].source.principals[*]}' 2>/dev/null)
+    if [[ ${principals} != "${want}" ]]; then
+        fail "${name}" "AP principals=${principals}, want ${want}"
+        return
+    fi
+
+    cleanup_release
+    pass "${name}"
+}
+
+# Prometheus scraping the metrics exporter must work in ambient mode. The
+# AuthorizationPolicy is ALLOW-only, which triggers Istio default-deny for
+# any non-matching traffic — if the chart forgets to include the metrics
+# port in the open rule, production Prometheus stacks silently stop
+# seeing Valkey metrics the moment someone enables Istio.
+#
+# Outcome: logs SKIP when ambient mode is unavailable, otherwise records a
+# single pass/fail. The scenarios are invoked as `scenario_x || true`, so
+# bash ignores errexit inside this function; every step whose failure
+# matters is therefore checked explicitly.
+scenario_ambient_prometheus_scrape() {
+    local name="ambient: in-mesh Prometheus can scrape metrics exporter"
+    log "SCENARIO: ${name}"
+    if ! istio_ambient_installed; then
+        log "SKIP: ${name} (ztunnel not installed)"
+        return
+    fi
+    cleanup_release
+
+    if ! hctl install "${RELEASE}" "${CHART_DIR}" \
+            --set=istio.enabled=true \
+            --set=istio.mode=ambient \
+            --set=cluster.enabled=true \
+            --set=cluster.shards=3 \
+            --set=cluster.replicasPerShard=0 \
+            --set=cluster.persistence.size=100Mi \
+            --set=metrics.enabled=true \
+            --wait --timeout=300s >/dev/null; then
+        fail "${name}" "helm install failed"; return
+    fi
+    kctl wait --for=condition=complete "job/${RELEASE}-cluster-init" --timeout=300s >/dev/null
+
+    # An ambient-enrolled curl pod simulates an in-mesh Prometheus. The PID
+    # suffix keeps concurrent harness runs from colliding on the pod name.
+    local scraper="scrape-${RELEASE}-$$"
+    kctl delete pod "${scraper}" --ignore-not-found --wait=true >/dev/null
+    if ! kctl run "${scraper}" \
+            --image=curlimages/curl \
+            --labels='istio.io/dataplane-mode=ambient' \
+            --restart=Never \
+            --command -- sleep 300 >/dev/null \
+        || ! kctl wait --for=condition=Ready "pod/${scraper}" --timeout=120s >/dev/null; then
+        # Report the real cause instead of letting the exec below fail with
+        # an empty HTTP code.
+        fail "${name}" "scraper pod ${scraper} never became Ready"
+        kctl delete pod "${scraper}" --ignore-not-found --wait=false >/dev/null
+        cleanup_release; return
+    fi
+
+    # curl appends a trailing "HTTP=<code>" line to the body so one exec
+    # yields both the metrics payload and the status code.
+    local code out
+    set +e
+    out=$(kctl exec "${scraper}" -c "${scraper}" -- \
+        curl -sS --max-time 10 -w '\nHTTP=%{http_code}\n' \
+        "http://${RELEASE}-metrics.${NAMESPACE}.svc.cluster.local:9121/metrics" 2>&1)
+    set -e
+    code=$(awk -F= '/^HTTP=/{print $2}' <<<"${out}")
+
+    kctl delete pod "${scraper}" --ignore-not-found --wait=false >/dev/null
+
+    if [[ ${code} != "200" ]]; then
+        fail "${name}" "scrape returned HTTP=${code:-<empty>}, body was: ${out}"
+        cleanup_release; return
+    fi
+    if ! grep -q '^redis_' <<<"${out}"; then
+        fail "${name}" "HTTP 200 but body lacks redis_* metrics"
+        cleanup_release; return
+    fi
+
+    cleanup_release
+    pass "${name}"
+}
+
+# ---------------------------------------------------------------------------
+# Scenario: `kubectl rollout restart` on a replicated cluster must not cause
+# client-visible disruption. The preStop hook runs `CLUSTER FAILOVER` on
+# every primary before SIGTERM, so the shard already has a new primary by
+# the time the old pod terminates. We assert this by:
+#
+#   1) Installing cluster.shards=3, cluster.replicasPerShard=1 (6 pods).
+#   2) Recording each pod's role (master/slave) — this is our baseline.
+#   3) Writing a known key through any pod (cluster redirects handle placement).
+#   4) `kubectl rollout restart` the STS and waiting for the rollout.
+#   5) Re-checking cluster_state, master/slave counts, and the key's value.
+#   6) Comparing new roles to baseline: since every primary is asked to hand
+#      off to its own replica, every primary/replica pair should have flipped
+#      ordinals. We require AT LEAST THREE ordinals to have changed role (one
+#      per original primary) — a weaker check could pass even if the hook
+#      ran for only some primaries.
+#
+# If the preStop hook is broken or absent, steps 5-6 still "work" in the sense
+# that the cluster eventually self-heals via node-timeout, but:
+#   - there's a 15s+ window of unavailability per primary,
+#   - and the pod role stays the same after restart (the restarted pod
+#     re-joins as primary because its nodes.conf persisted), so the role-flip
+#     assertion catches it.
+#
+# Outcome: records a single pass/fail; the release is removed on every path
+# after a successful install.
+# ---------------------------------------------------------------------------
+scenario_rollout_restart_orderly_failover() {
+    local name="rollout restart performs orderly CLUSTER FAILOVER (no client-visible gap)"
+    log "SCENARIO: ${name}"
+    cleanup_release
+
+    # nodeTimeout pinned high (3 min) so cluster-node-timeout auto-failover
+    # CANNOT fire during the rollout — a normal per-pod restart takes ~10-30s
+    # and the whole rollout ~2-3min, so with a 15s default timeout the
+    # observed role-flip signal could be produced either by preStop OR by
+    # auto-failover of an in-flight primary. Bumping to 180s guarantees any
+    # observed flip is the work of preStop.
+    if ! hctl install "${RELEASE}" "${CHART_DIR}" \
+            --set=cluster.enabled=true \
+            --set=cluster.persistence.size=100Mi \
+            --set=cluster.shards=3 \
+            --set=cluster.replicasPerShard=1 \
+            --set=cluster.nodeTimeout=180000 \
+            --wait --timeout=300s >/dev/null; then
+        fail "${name}" "helm install failed"
+        return
+    fi
+    kctl wait --for=condition=complete "job/${RELEASE}-cluster-init" --timeout=300s >/dev/null
+
+    # Gossip convergence lags job completion: the init Job returns "done"
+    # once `cluster create` is ACK'd, but `cluster_state:ok` requires every
+    # node to have seen every other node's PING/PONG. Writing canary data
+    # or triggering a rollout before that window closes lets the preStop
+    # script's own `cluster_state != ok` early-exit fire, bypassing the
+    # graceful FAILOVER and silently dropping in-memory writes when the
+    # primary pod is replaced.
+    local s
+    for _ in $(seq 1 60); do
+        s=$(kctl exec "${RELEASE}-0" -c "${RELEASE}" -- \
+            valkey-cli cluster info 2>/dev/null \
+            | awk -F: '/^cluster_state:/{print $2}' | tr -d '\r\n' || true)
+        [[ ${s} == ok ]] && break
+        sleep 2
+    done
+    if [[ ${s} != ok ]]; then
+        fail "${name}" "cluster_state=${s:-<unavailable>} after install (want ok before rollout)"
+        cleanup_release; return
+    fi
+
+    # Capture the role of every pod. Emits one "<ordinal>=<role>" line per
+    # pod so we can compare "same ordinal, different role" after the
+    # rollout. The role is empty when the pod cannot be queried.
+    snapshot_roles() {
+        local n=6 i role
+        for i in $(seq 0 $((n - 1))); do
+            role=$(kctl exec "${RELEASE}-${i}" -c "${RELEASE}" -- \
+                valkey-cli info replication 2>/dev/null \
+                | awk -F: '/^role:/{print $2}' | tr -d '\r\n' || true)
+            printf '%s=%s\n' "${i}" "${role}"
+        done
+    }
+
+    # `grep -c` prints 0 but exits 1 when nothing matches, hence `|| true`.
+    # -E is used so the alternation does not depend on the GNU `\|` BRE
+    # extension.
+    local before
+    before=$(snapshot_roles)
+    local masters_before slaves_before
+    masters_before=$(printf '%s\n' "${before}" | grep -c '=master' || true)
+    slaves_before=$(printf '%s\n' "${before}" | grep -Ec '=(slave|replica)' || true)
+    if [[ ${masters_before} != 3 || ${slaves_before} != 3 ]]; then
+        fail "${name}" "baseline wrong: masters=${masters_before} slaves=${slaves_before} (want 3+3)"
+        cleanup_release; return
+    fi
+
+    # Write a canary key so we can prove data integrity after the rollout.
+    # Must write through a CLUSTER-aware client so slot routing works —
+    # valkey-cli -c follows MOVED redirects. The value contains shell
+    # metacharacters for the same reason AUTH_PASSWORD does.
+    local canary_key="prestop-canary-$$"
+    local canary_val='rollout-ok $shell "quote" \back`tick`'
+    if ! kctl exec "${RELEASE}-0" -c "${RELEASE}" -- \
+            valkey-cli -c set "${canary_key}" "${canary_val}" >/dev/null 2>&1; then
+        fail "${name}" "initial SET failed"
+        cleanup_release; return
+    fi
+
+    # The actual rollout. Default updateStrategy=RollingUpdate → pods
+    # restart one at a time from highest ordinal (podManagementPolicy
+    # controls creation/deletion parallelism, not rolling-update pacing).
+    # Each primary-pod restart should trigger a preStop FAILOVER; each
+    # replica-pod restart should no-op.
+    #
+    # The trigger itself is checked: if it fails, `rollout status` below
+    # would report the untouched StatefulSet as already rolled out and the
+    # scenario would end with a misleading "0 ordinals flipped".
+    log "triggering rollout restart"
+    if ! kctl rollout restart "statefulset/${RELEASE}" >/dev/null; then
+        fail "${name}" "rollout restart could not be triggered"
+        cleanup_release; return
+    fi
+
+    # Rollout must complete within terminationGracePeriodSeconds * 6 + a
+    # little slack — each pod can take up to the grace period in the
+    # worst case (preStop timeout + SIGTERM flush).
+    if ! kctl rollout status "statefulset/${RELEASE}" --timeout=600s >/dev/null; then
+        fail "${name}" "rollout status never converged"
+        cleanup_release; return
+    fi
+
+    # Give gossip a moment to settle post-rollout — cluster_state flips to
+    # :ok only after every node sees every other node, and the last pod to
+    # restart may still be converging when rollout status returns.
+    local state
+    for _ in $(seq 1 30); do
+        state=$(kctl exec "${RELEASE}-0" -c "${RELEASE}" -- \
+            valkey-cli cluster info 2>/dev/null \
+            | awk -F: '/^cluster_state:/{print $2}' | tr -d '\r\n' || true)
+        [[ ${state} == ok ]] && break
+        sleep 2
+    done
+    if [[ ${state} != ok ]]; then
+        fail "${name}" "cluster_state=${state:-<unavailable>} after rollout (want ok)"
+        cleanup_release; return
+    fi
+
+    # Still 3 masters / 3 slaves — i.e. the handovers completed and every
+    # shard has the right shape.
+    local after masters_after slaves_after
+    after=$(snapshot_roles)
+    masters_after=$(printf '%s\n' "${after}" | grep -c '=master' || true)
+    slaves_after=$(printf '%s\n' "${after}" | grep -Ec '=(slave|replica)' || true)
+    if [[ ${masters_after} != 3 || ${slaves_after} != 3 ]]; then
+        fail "${name}" "post-rollout shape wrong: masters=${masters_after} slaves=${slaves_after} (want 3+3)"
+        cleanup_release; return
+    fi
+
+    # Canary key survives (via MOVED redirect if the slot moved to a
+    # different primary).
+    local got
+    got=$(kctl exec "${RELEASE}-0" -c "${RELEASE}" -- \
+        valkey-cli -c get "${canary_key}" 2>/dev/null || true)
+    if [[ ${got} != "${canary_val}" ]]; then
+        fail "${name}" "canary key lost: got='${got}' want='${canary_val}'"
+        cleanup_release; return
+    fi
+
+    # Expect every primary's ordinal to flip: the rollout restarts each pod
+    # once, each primary-pod restart's preStop hands off to a replica, and
+    # the ex-primary returns as replica. So of the 3 original primaries,
+    # all 3 should now be replicas on those ordinals ⇒ at least 3 flips.
+    # With nodeTimeout pinned high above, no other mechanism can produce
+    # flips during the rollout window, so this is a precise signal.
+    # A broken / missing preStop yields 0 flips (every pod persists its
+    # role in nodes.conf and rejoins as that role).
+    #
+    # ${before} is deliberately unquoted: word splitting yields one
+    # "<ordinal>=<role>" token per pod.
+    local flips=0 line ordinal role_before role_after
+    for line in ${before}; do
+        ordinal=${line%=*}
+        role_before=${line#*=}
+        role_after=$(printf '%s\n' "${after}" | awk -F= -v o="${ordinal}" '$1 == o {print $2}')
+        if [[ ${role_before} != "${role_after}" ]]; then
+            flips=$(( flips + 1 ))
+        fi
+    done
+    if (( flips < 3 )); then
+        fail "${name}" "only ${flips}/6 ordinals flipped — expected >=3 (every primary's preStop should hand off to a replica). before='${before}' after='${after}'"
+        cleanup_release; return
+    fi
+    log "roles flipped on ${flips}/6 pods — handover ran"
+
+    cleanup_release
+    pass "${name}"
+}
+
+# ---------------------------------------------------------------------------
+# Scenario: cluster bus dials by IP, even with cluster-preferred-endpoint-type
+# =hostname. After a rolling restart, a pod whose nodes.conf has only stale
+# peer IPs becomes a stranded minority partition — every gossip attempt
+# times out against dead IPs and it never gets the chance to learn fresh
+# ones. The chart's init container re-resolves each peer's announced FQDN
+# and rewrites stale IPs in /data/nodes.conf before valkey-server starts;
+# this scenario proves that refresh works end-to-end.
+#
+# Reproduction:
+#   1) Install cluster (replicasPerShard=1) and wait for cluster_state:ok.
+#   2) Read pod-0's nodes.conf to make sure it is there before we touch it.
+#   3) Poison: SIGSTOP valkey-server, then replace the IP on every node
+#      line of pod-0's nodes.conf (the myself line included) with an address
+#      from TEST-NET-1 (192.0.2.0/24, RFC 5737 documentation range —
+#      guaranteed unroutable).
+#   4) Force-delete the pod with grace period 0 so neither the preStop hook
+#      nor the shutdown handler can rewrite nodes.conf back to good state,
+#      and so the StatefulSet controller creates a NEW pod (init containers
+#      do not re-run on an in-place container restart).
+#   5) Wait for pod-0 to be Ready again. The init container's refresh
+#      block should re-resolve every peer FQDN and rewrite the IPs back
+#      to the real ones BEFORE valkey-server starts.
+#   6) Assert: pod-0's nodes.conf no longer contains 192.0.2.99 and
+#      cluster_state from pod-0's perspective is back to ok.
+#
+# Without the refresh: pod-0 boots, dials 192.0.2.99 on the bus, every
+# connection times out, cluster_state stays fail forever. So the
+# assertion has teeth — a regression that drops the refresh would leave
+# the poisoned IPs in place and cluster_state would never recover.
+# ---------------------------------------------------------------------------
+scenario_nodes_conf_ip_refresh() {
+    local name="cluster init refreshes stale nodes.conf IPs after pod restart"
+    log "SCENARIO: ${name}"
+    cleanup_release
+
+    if ! hctl install "${RELEASE}" "${CHART_DIR}" \
+            --set=cluster.enabled=true \
+            --set=cluster.persistence.size=100Mi \
+            --set=cluster.shards=3 \
+            --set=cluster.replicasPerShard=1 \
+            --wait --timeout=300s >/dev/null; then
+        fail "${name}" "helm install failed"
+        return
+    fi
+    kctl wait --for=condition=complete "job/${RELEASE}-cluster-init" --timeout=300s >/dev/null
+
+    # Wait for gossip convergence — same rationale as the rollout
+    # scenario: the init Job returning doesn't mean every node has seen
+    # every PING/PONG yet, and we need cluster_state:ok before we can
+    # meaningfully assert it recovers.
+    local s
+    for _ in $(seq 1 60); do
+        s=$(kctl exec "${RELEASE}-0" -c "${RELEASE}" -- \
+            valkey-cli cluster info 2>/dev/null \
+            | awk -F: '/^cluster_state:/{print $2}' | tr -d '\r\n' || true)
+        [[ ${s} == ok ]] && break
+        sleep 2
+    done
+    if [[ ${s} != ok ]]; then
+        fail "${name}" "cluster_state=${s:-<unavailable>} after install (need ok before poisoning)"
+        cleanup_release; return
+    fi
+
+    # Sanity-check that nodes.conf exists and is readable before poisoning
+    # it; an empty read means the rest of the scenario cannot be trusted.
+    local orig
+    orig=$(kctl exec "${RELEASE}-0" -c "${RELEASE}" -- cat /data/nodes.conf 2>/dev/null)
+    if [[ -z ${orig} ]]; then
+        fail "${name}" "failed to read /data/nodes.conf on ${RELEASE}-0"
+        cleanup_release; return
+    fi
+
+    # Poison: replace every node's IP token with 192.0.2.99 (RFC 5737
+    # documentation prefix — guaranteed unroutable). Critically, SIGSTOP
+    # valkey-server BEFORE rewriting nodes.conf — otherwise the live
+    # server's gossip tick (every cluster-node-timeout/2 ≈ 7.5 s) or any
+    # incoming gossip event from a peer would rewrite nodes.conf back to
+    # the real IPs, defeating the test. SIGSTOP freezes the process so
+    # it can't write the file; the subsequent force-delete sends SIGKILL
+    # which clears the STOP and tears the container down.
+    #
+    # NOTE(review): if valkey-server really is PID 1 of the container's PID
+    # namespace, Linux drops SIGSTOP sent from inside that namespace
+    # (pid_namespaces(7)), so `kill -STOP 1` may be a silent no-op —
+    # confirm the freeze actually takes effect on the image in use.
+    #
+    # Atomic file swap (write+mv) so a kill mid-write can't corrupt
+    # anything; sync forces the page cache to disk so the new pod's
+    # init container reads the poison from the PVC.
+    #
+    # The awk program exits non-zero when it rewrote no address at all, so
+    # a nodes.conf in an unexpected format fails the scenario here instead
+    # of letting the later "no poison left" assertion pass vacuously.
+    log "SIGSTOPping valkey-server and poisoning /data/nodes.conf on ${RELEASE}-0"
+    # shellcheck disable=SC2016
+    if ! kctl exec "${RELEASE}-0" -c "${RELEASE}" -- sh -c '
+            kill -STOP 1 \
+            && awk '"'"'
+              # Pass through blank lines and the "vars currentEpoch ..." footer.
+              /^$/ || /^vars / { print; next }
+              # Field 2 is "<ip:port@busport>,<fqdn>,..." — replace ONLY
+              # the leading ip:port@busport, keep everything else. The
+              # production bug had myself stale too, so we deliberately
+              # poison the myself line: the refresh block must handle it.
+              {
+                # Split field 2 on commas: head is ip:port@busport, tail is rest.
+                n = split($2, a, ",")
+                head = a[1]
+                tail = ""
+                for (i = 2; i <= n; i++) tail = tail "," a[i]
+                # Replace the IP only; preserve port and bus port. Count
+                # the rewrites so END can tell whether any poison landed.
+                poisoned += sub(/^[0-9.]+/, "192.0.2.99", head)
+                $2 = head tail
+                print
+              }
+              END { if (poisoned == 0) exit 1 }
+            '"'"' /data/nodes.conf >/data/nodes.conf.poisoned \
+            && mv /data/nodes.conf.poisoned /data/nodes.conf \
+            && sync
+        '; then
+        fail "${name}" "failed to poison /data/nodes.conf on ${RELEASE}-0"
+        cleanup_release; return
+    fi
+
+    # Capture the current pod UID so we can detect the replacement.
+    local old_uid
+    old_uid=$(kctl get pod "${RELEASE}-0" -o jsonpath='{.metadata.uid}' 2>/dev/null)
+    if [[ -z ${old_uid} ]]; then
+        fail "${name}" "could not read UID of ${RELEASE}-0 before delete"
+        cleanup_release; return
+    fi
+
+    # Force-delete the pod to trigger pod RECREATION (not in-place
+    # container restart). Init containers only run on new pods; SIGKILL
+    # of pid 1 alone leaves the same pod object in place and kubelet
+    # just restarts the container, skipping the init phase entirely.
+    # Force + grace=0 also bypasses the preStop hook and the graceful-
+    # shutdown handler, both of which would otherwise rewrite nodes.conf
+    # back to a clean state and defeat the test.
+    log "Force-deleting ${RELEASE}-0 to trigger pod recreation"
+    kctl delete pod "${RELEASE}-0" --force --grace-period=0 \
+        --wait=false >/dev/null 2>&1 || true
+
+    # Wait for the StatefulSet controller to create a NEW pod with a
+    # different UID (the old one may briefly persist in Terminating
+    # state).
+    log "Waiting for ${RELEASE}-0 to be recreated with a fresh UID"
+    local new_uid
+    for _ in $(seq 1 60); do
+        new_uid=$(kctl get pod "${RELEASE}-0" -o jsonpath='{.metadata.uid}' 2>/dev/null || true)
+        if [[ -n ${new_uid} && ${new_uid} != "${old_uid}" ]]; then
+            break
+        fi
+        sleep 2
+    done
+    if [[ ${new_uid} == "${old_uid}" || -z ${new_uid} ]]; then
+        fail "${name}" "${RELEASE}-0 was not recreated (UID still ${old_uid:-empty})"
+        cleanup_release; return
+    fi
+
+    # Now wait for the new pod to be Ready (init container ran, probe
+    # passes — which only happens if cluster_state recovered, which only
+    # happens if the refresh worked).
+    log "Waiting for the new ${RELEASE}-0 (uid=${new_uid}) to be Ready"
+    if ! kctl wait --for=condition=Ready "pod/${RELEASE}-0" --timeout=180s >/dev/null; then
+        fail "${name}" "${RELEASE}-0 never became Ready after recreation"
+        cleanup_release; return
+    fi
+
+    # The post-restart nodes.conf must NOT contain the poison IP — the
+    # init container's refresh step replaces it before valkey-server
+    # boots. (Valkey itself only writes peers' IPs to nodes.conf as it
+    # observes them via gossip; without our pre-boot refresh, the boot
+    # would proceed against 192.0.2.99 and the file would stay poisoned.)
+    # An empty read is a failure in its own right: grep would find no
+    # poison in an empty string and the assertion would pass for nothing.
+    local after
+    after=$(kctl exec "${RELEASE}-0" -c "${RELEASE}" -- cat /data/nodes.conf 2>/dev/null)
+    if [[ -z ${after} ]]; then
+        fail "${name}" "failed to read /data/nodes.conf on ${RELEASE}-0 after restart"
+        cleanup_release; return
+    fi
+    if grep -q '192\.0\.2\.99' <<<"${after}"; then
+        fail "${name}" "nodes.conf still contains poison IP 192.0.2.99 after restart — refresh did not run. Content: ${after}"
+        cleanup_release; return
+    fi
+
+    # And the cluster must be functional from pod-0's view — the whole
+    # point of the refresh is that it boots into a cluster it can talk
+    # to. Poll because gossip needs a moment to re-converge after the
+    # restart.
+    for _ in $(seq 1 60); do
+        s=$(kctl exec "${RELEASE}-0" -c "${RELEASE}" -- \
+            valkey-cli cluster info 2>/dev/null \
+            | awk -F: '/^cluster_state:/{print $2}' | tr -d '\r\n' || true)
+        [[ ${s} == ok ]] && break
+        sleep 2
+    done
+    if [[ ${s} != ok ]]; then
+        fail "${name}" "cluster_state=${s:-<unavailable>} after refresh (want ok). nodes.conf was: ${after}"
+        cleanup_release; return
+    fi
+
+    cleanup_release
+    pass "${name}"
+}
+
+# ---------------------------------------------------------------------------
+# Scenario: probe LOADING-policy is wired correctly on the live workload.
+#
+# The chart applies a tri-state policy:
+#   * startupProbe   — rejects LOADING (gate has teeth during initial RDB load)
+#   * livenessProbe  — accepts LOADING (don't kill a replica mid-full-resync)
+#   * readinessProbe — rejects LOADING (don't route traffic to a loading pod)
+#
+# Production regression that motivates this test: a replica in a 38 GB cluster
+# hit `cluster_state:fail` after a replication break triggered a full resync;
+# the post-resync in-memory load took ~57 s, and livenessProbe
+# (failureThreshold=6 * periodSeconds=10s = 60 s) killed the pod just before
+# load completed. The kill discarded the freshly-streamed RDB; the next pod
+# incarnation triggered yet another full resync. Crash-loop until intervention.
+#
+# helm-unittest already locks the rendered command strings in via
+# matchRegex; this functional test goes one layer further by asserting
+# that the live API objects in the cluster carry the right policy. A
+# template change that bypasses the helper would slip past unit tests
+# but get caught here.
+#
+# The policy is read from the third element (index 2) of each probe's exec
+# command on the StatefulSet's first container; "accepts LOADING" means that
+# string mentions LOADING.
+# ---------------------------------------------------------------------------
+scenario_probe_loading_policy() {
+    local name="probes carry tri-state LOADING policy on live workload"
+    log "SCENARIO: ${name}"
+    cleanup_release
+
+    if ! hctl install "${RELEASE}" "${CHART_DIR}" \
+            --set=cluster.enabled=true \
+            --set=cluster.persistence.size=100Mi \
+            --set=cluster.shards=3 \
+            --set=cluster.replicasPerShard=0 \
+            --wait --timeout=300s >/dev/null; then
+        fail "${name}" "helm install failed"
+        return
+    fi
+
+    local startup liveness readiness
+    startup=$(kctl get statefulset "${RELEASE}" \
+        -o jsonpath='{.spec.template.spec.containers[0].startupProbe.exec.command[2]}')
+    liveness=$(kctl get statefulset "${RELEASE}" \
+        -o jsonpath='{.spec.template.spec.containers[0].livenessProbe.exec.command[2]}')
+    readiness=$(kctl get statefulset "${RELEASE}" \
+        -o jsonpath='{.spec.template.spec.containers[0].readinessProbe.exec.command[2]}')
+
+    # A missing probe (or a failed lookup) yields an empty string, and an
+    # empty string trivially "rejects LOADING". Insist that every probe
+    # command was actually retrieved before judging its content.
+    local probe
+    for probe in startup liveness readiness; do
+        if [[ -z ${!probe} ]]; then
+            fail "${name}" "${probe}Probe exec command is missing or empty on statefulset/${RELEASE}"
+            cleanup_release; return
+        fi
+    done
+
+    if grep -q LOADING <<<"${startup}"; then
+        fail "${name}" "startupProbe must reject LOADING but accepts it: ${startup}"
+        cleanup_release; return
+    fi
+    if ! grep -q LOADING <<<"${liveness}"; then
+        fail "${name}" "livenessProbe must accept LOADING but rejects it: ${liveness}"
+        cleanup_release; return
+    fi
+    if grep -q LOADING <<<"${readiness}"; then
+        fail "${name}" "readinessProbe must reject LOADING but accepts it: ${readiness}"
+        cleanup_release; return
+    fi
+
+    cleanup_release
+    pass "${name}"
+}
+
+# On any exit, remove the release and both paired fixtures.
+trap 'cleanup_release; cleanup_pair; cleanup_ambient_pair' EXIT
+
+# Run every scenario in turn. The `|| true` keeps a non-zero return from one
+# scenario from ending the run; verdicts are collected in RESULTS instead.
+scenario_aclconfig_metrics                       || true
+scenario_default_deny_netpol                     || true
+scenario_bus_port_hidden                         || true
+scenario_readiness_probe_exists                  || true
+scenario_two_clusters_isolated                   || true
+scenario_isolation_off_lets_merge_happen         || true
+scenario_rollout_restart_orderly_failover        || true
+scenario_nodes_conf_ip_refresh                   || true
+scenario_probe_loading_policy                    || true
+scenario_ambient_authz_blocks_cross_release_meet || true
+scenario_ambient_ap_disabled_refused             || true
+scenario_ambient_shared_default_sa_refused       || true
+scenario_ambient_trustdomain_override            || true
+scenario_ambient_prometheus_scrape               || true
+
+# Print each recorded result and tally it: entries starting with "PASS:"
+# count as passed, anything else counts as failed.
+echo
+log "Extra scenario summary"
+passed=0
+failed=0
+for r in "${RESULTS[@]}"; do
+    printf '  %s\n' "${r}"
+    if [[ ${r} == PASS:* ]]; then
+        passed=$(( passed + 1 ))
+    else
+        failed=$(( failed + 1 ))
+    fi
+done
+echo
+log "Extras: ${passed} passed, ${failed} failed"
+# Last command: its status becomes the script's exit status.
+(( failed == 0 ))
diff --git a/functional-tests/run-scenario.sh b/functional-tests/run-scenario.sh
new file mode 100755
index 00000000..4615e707
--- /dev/null
+++ b/functional-tests/run-scenario.sh
@@ -0,0 +1,366 @@
+#!/usr/bin/env bash
+# Run a single scenario of the Valkey functional matrix against the
+# already-created kind cluster.
+#
+# Usage:
+#   ./run-scenario.sh <tls> <auth> <shard> <rep> <istio>
+# tls/auth/shard/rep are on|off; istio is off|sidecar|ambient.
+# Example:
+#   ./run-scenario.sh off off on on ambient
+# drives the "TLS off, auth off, shard on, rep on, Istio ambient" scenario.
+#
+# Exits 2 on a wrong argument count or an unrecognised istio mode.
+
+# Absolute path of the directory holding this script, so lib.sh is found
+# no matter which directory the script is invoked from.
+HERE=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
+# Shared helpers and settings used below come from lib.sh.
+# shellcheck source=lib.sh
+. "${HERE}/lib.sh"
+
+if (( $# != 5 )); then
+    echo "usage: $0 <tls> <auth> <shard> <rep> <istio>" >&2
+    echo "       tls/auth/shard/rep: on|off" >&2
+    echo "       istio: off|sidecar|ambient" >&2
+    exit 2
+fi
+
+on_or_off() {
+    case "$1" in
+        on|off) return 0 ;;
+        *) echo "expected 'on' or 'off', got: $1" >&2; return 1 ;;
+    esac
+}
+for v in "$1" "$2" "$3" "$4"; do on_or_off "${v}"; done
+case "$5" in
+    off|sidecar|ambient) ;;
+    *) echo "expected istio=off|sidecar|ambient, got: $5" >&2; exit 2 ;;
+esac
+
+TLS=$1; AUTH=$2; SHARD=$3; REP=$4; ISTIO=$5
+SCENARIO="tls=${TLS} auth=${AUTH} shard=${SHARD} rep=${REP} istio=${ISTIO}"
+
+is_on()       { [[ $1 == on ]]; }
+is_mesh()     { [[ ${ISTIO} != off ]]; }
+is_sidecar()  { [[ ${ISTIO} == sidecar ]]; }
+is_ambient()  { [[ ${ISTIO} == ambient ]]; }
+
+# Pick a testbench that shares the right mesh participation with the chart
+# workload — that's the only way the in-mesh connectivity checks reflect
+# what an in-production client on the same mesh would experience. The three
+# testbench flavours are launched once by setup.sh.
+case "${ISTIO}" in
+    off)     TESTBENCH=${TESTBENCH_POD} ;;
+    sidecar) TESTBENCH=${TESTBENCH_POD_INJECTED} ;;
+    ambient) TESTBENCH=${TESTBENCH_POD_AMBIENT} ;;
+esac
+testbench_exec() { testbench_exec_in "${TESTBENCH}" "$@"; }
+
+# ---------------------------------------------------------------------------
+# Build helm flags for this scenario.
+# ---------------------------------------------------------------------------
+helm_flags=()
+
+if is_mesh; then
+    helm_flags+=(
+        --set=istio.enabled=true
+        "--set=istio.mode=${ISTIO}"
+    )
+fi
+# istio=off needs no extra flags: the chart emits zero mesh labels when
+# istio.enabled=false, and setup.sh leaves the namespace unlabelled so
+# pods stay out of both data planes by default.
+
+if is_on "${AUTH}"; then
+    helm_flags+=(
+        --set=auth.enabled=true
+        --set=auth.usersExistingSecret="${AUTH_SECRET}"
+        --set=auth.aclUsers.default.permissions='~* &* +@all'
+    )
+fi
+
+if is_on "${TLS}"; then
+    helm_flags+=(
+        --set=tls.enabled=true
+        --set=tls.existingSecret="${TLS_SECRET}"
+    )
+fi
+
+if is_on "${SHARD}"; then
+    helm_flags+=(
+        --set=cluster.enabled=true
+        --set=cluster.persistence.size=1Gi
+        --set=cluster.shards=3
+    )
+    if is_on "${REP}"; then
+        helm_flags+=(--set=cluster.replicasPerShard=1)
+        expected_node_count=6
+    else
+        helm_flags+=(--set=cluster.replicasPerShard=0)
+        expected_node_count=3
+    fi
+elif is_on "${REP}"; then
+    helm_flags+=(
+        --set=replica.enabled=true
+        --set=replica.persistence.size=1Gi
+    )
+    expected_node_count=0   # unused
+else
+    expected_node_count=0   # unused
+fi
+
+# ---------------------------------------------------------------------------
+# Install.
+# ---------------------------------------------------------------------------
+
+# Register cleanup BEFORE `helm install`. If the install itself fails
+# (timeout, post-install hook never ready, etc.) Helm leaves a "failed"
+# release in the cluster that blocks every subsequent scenario with a
+# `cannot reuse a name that is still in use` error. Trap-before-install
+# ensures we always clean up, even on install failure.
+#
+# cleanup runs as the EXIT trap: it removes the release and its PVCs, then
+# re-exits with the status the script was already exiting with. Both
+# removal steps are best-effort so a failed delete can never replace that
+# status with its own.
+cleanup() {
+    local rc=$?
+    log "Cleaning up scenario: ${SCENARIO}"
+    hctl uninstall "${RELEASE}" 2>/dev/null || true
+    kctl delete pvc --selector="app.kubernetes.io/instance=${RELEASE}" --ignore-not-found || true
+    exit "${rc}"
+}
+trap cleanup EXIT
+
+# Also scrub anything left behind by a prior scenario that crashed hard
+# (SIGKILL, harness panic) without running its trap. That includes the
+# PVCs cleanup would have removed, so this install does not start from a
+# previous scenario's on-disk state.
+hctl uninstall "${RELEASE}" 2>/dev/null || true
+kctl delete pvc --selector="app.kubernetes.io/instance=${RELEASE}" \
+    --ignore-not-found >/dev/null 2>&1 || true
+
+log "Installing scenario: ${SCENARIO}"
+hctl install "${RELEASE}" "${CHART_DIR}" "${helm_flags[@]}"
+
+# ---------------------------------------------------------------------------
+# Block until the installed workload has rolled out.
+# ---------------------------------------------------------------------------
+log "Waiting for workload to be ready"
+if is_on "${SHARD}" || is_on "${REP}"; then
+    # Sharded and replicated modes are both waited on as a StatefulSet.
+    kctl rollout status "statefulset/${RELEASE}" --timeout=300s
+    if is_on "${SHARD}"; then
+        # The cluster-init Job is a post-install hook; wait for it to complete.
+        kctl wait --for=condition=complete "job/${RELEASE}-cluster-init" --timeout=300s
+    fi
+else
+    kctl rollout status "deployment/${RELEASE}" --timeout=300s
+fi
+
+# ---------------------------------------------------------------------------
+# Build the canonical "working" valkey-cli argv for this scenario.
+# ---------------------------------------------------------------------------
+cli_args_good=(valkey-cli -h "valkey.${NAMESPACE}.svc.cluster.local" --no-auth-warning)
+if is_on "${AUTH}"; then
+    cli_args_good+=(-a "${AUTH_PASSWORD}")
+fi
+if is_on "${TLS}"; then
+    cli_args_good+=(--tls --cacert /tls/ca.crt)
+fi
+
+# ---------------------------------------------------------------------------
+# Assertions.
+# ---------------------------------------------------------------------------
+fail() { echo "FAIL: $*" >&2; exit 1; }
+
+assert_eq() {
+    local expected=$1 actual=$2 what=$3
+    if [[ ${actual} != "${expected}" ]]; then
+        fail "${what}: expected '${expected}', got '${actual}'"
+    fi
+}
+
+# Name of one pod belonging to the release, used by the mode-specific
+# checks to inspect live container / label state. Taking the first match
+# is enough — all pods in a release share the same mesh participation
+# shape.
+pod=$(kctl get pod \
+    --selector="app.kubernetes.io/instance=${RELEASE}" \
+    --output=jsonpath='{.items[0].metadata.name}')
+
+# Chart-owned Istio resources should be present iff istio is enabled.
+# PeerAuthentication is mode-neutral (enforced by Envoy in sidecar, ztunnel
+# in ambient). DestinationRule is sidecar-only — ambient's ztunnel HBONE
+# supersedes it. AuthorizationPolicy renders only in cluster mode.
+case "${ISTIO}" in
+    off)
+        log "Istio check: chart-owned resources must be absent"
+        if kctl get peerauthentication "${RELEASE}" >/dev/null 2>&1; then
+            fail "PeerAuthentication/${RELEASE} should not exist when istio=off"
+        fi
+        if kctl get destinationrule "${RELEASE}" >/dev/null 2>&1; then
+            fail "DestinationRule/${RELEASE} should not exist when istio=off"
+        fi
+        if kctl get authorizationpolicy "${RELEASE}-cluster-bus" >/dev/null 2>&1; then
+            fail "AuthorizationPolicy/${RELEASE}-cluster-bus should not exist when istio=off"
+        fi
+        # Pod must have no istio-proxy container.
+        if kctl get pod "${pod}" \
+             -o jsonpath='{.spec.containers[*].name} {.spec.initContainers[*].name}' \
+             | tr ' ' '\n' | grep -Fxq istio-proxy; then
+            fail "pod ${pod} has an istio-proxy container when istio=off"
+        fi
+        ;;
+    sidecar)
+        log "Istio check: sidecar-mode resources must exist"
+        kctl get peerauthentication "${RELEASE}" >/dev/null \
+            || fail "PeerAuthentication/${RELEASE} missing in sidecar mode"
+        kctl get destinationrule "${RELEASE}" >/dev/null \
+            || fail "DestinationRule/${RELEASE} missing in sidecar mode"
+        if is_on "${SHARD}" || is_on "${REP}"; then
+            kctl get destinationrule "${RELEASE}-headless" >/dev/null \
+                || fail "DestinationRule/${RELEASE}-headless missing in sidecar mode"
+        fi
+        # Istio >=1.29 injects as a native sidecar (initContainer with
+        # restartPolicy=Always), so check both containers and initContainers.
+        if ! kctl get pod "${pod}" \
+             -o jsonpath='{.spec.containers[*].name} {.spec.initContainers[*].name}' \
+             | tr ' ' '\n' | grep -Fxq istio-proxy; then
+            fail "pod ${pod} has no istio-proxy container in sidecar mode"
+        fi
+        if is_on "${SHARD}"; then
+            # AP renders only in cluster mode, but it applies in BOTH sidecar
+            # and ambient. Verify once per mode so a sidecar-only regression
+            # (e.g. dropping the AP when !ambient) can't hide.
+            kctl get authorizationpolicy "${RELEASE}-cluster-bus" >/dev/null \
+                || fail "AuthorizationPolicy/${RELEASE}-cluster-bus missing in sidecar+cluster mode"
+            # The bus-port exclude annotations are sidecar-only (ambient has
+            # no Envoy to exclude ports from).
+            excl=$(kctl get statefulset "${RELEASE}" \
+                -o jsonpath='{.spec.template.metadata.annotations.traffic\.sidecar\.istio\.io/excludeInboundPorts}')
+            if [[ ${excl} != "16379" ]]; then
+                fail "traffic.sidecar.istio.io/excludeInboundPorts=${excl:-<unset>}, want '16379' in sidecar+cluster"
+            fi
+        else
+            # AP is cluster-mode only. Don't render for standalone/replica.
+            if kctl get authorizationpolicy "${RELEASE}-cluster-bus" >/dev/null 2>&1; then
+                fail "AuthorizationPolicy/${RELEASE}-cluster-bus should not render outside cluster mode"
+            fi
+        fi
+        ;;
+    ambient)
+        log "Istio check: ambient-mode resources must exist"
+        kctl get peerauthentication "${RELEASE}" >/dev/null \
+            || fail "PeerAuthentication/${RELEASE} missing in ambient mode"
+        # DestinationRule is sidecar-only; a DR in ambient requires a
+        # waypoint proxy and layers a second mTLS inside ztunnel's HBONE.
+        if kctl get destinationrule "${RELEASE}" >/dev/null 2>&1; then
+            fail "DestinationRule/${RELEASE} must not exist in ambient mode"
+        fi
+        if kctl get destinationrule "${RELEASE}-headless" >/dev/null 2>&1; then
+            fail "DestinationRule/${RELEASE}-headless must not exist in ambient mode"
+        fi
+        # Ambient has no sidecar — ztunnel handles HBONE at the node. If any
+        # chart pod picks one up, our inject=false label is being ignored.
+        if kctl get pod "${pod}" \
+             -o jsonpath='{.spec.containers[*].name} {.spec.initContainers[*].name}' \
+             | tr ' ' '\n' | grep -Fxq istio-proxy; then
+            fail "pod ${pod} has an istio-proxy container in ambient mode"
+        fi
+        dpmode=$(kctl get pod "${pod}" -o jsonpath='{.metadata.labels.istio\.io/dataplane-mode}')
+        if [[ ${dpmode} != ambient ]]; then
+            fail "pod ${pod} has istio.io/dataplane-mode=${dpmode:-<unset>}, want ambient"
+        fi
+        if is_on "${SHARD}"; then
+            # Ambient skips the cluster-isolation NetworkPolicy (it would
+            # drop HBONE) and relies entirely on the AP at the ztunnel
+            # layer. Verify both halves of that swap.
+            kctl get authorizationpolicy "${RELEASE}-cluster-bus" >/dev/null \
+                || fail "AuthorizationPolicy/${RELEASE}-cluster-bus missing in ambient+cluster mode"
+            if kctl get networkpolicy "${RELEASE}-cluster-isolation" >/dev/null 2>&1; then
+                fail "NetworkPolicy/${RELEASE}-cluster-isolation must not exist in ambient+cluster mode"
+            fi
+            # Sidecar-only exclude annotations must not leak through.
+            excl=$(kctl get statefulset "${RELEASE}" \
+                -o jsonpath='{.spec.template.metadata.annotations.traffic\.sidecar\.istio\.io/excludeInboundPorts}')
+            if [[ -n ${excl} ]]; then
+                fail "traffic.sidecar.istio.io/excludeInboundPorts=${excl} leaked into ambient pod"
+            fi
+            # And the AP bus rule must be scoped to this release's SPIFFE
+            # principal, not a wildcard or missing `from` — that's the whole
+            # point of the ambient cross-release isolation promise.
+            principals=$(kctl get authorizationpolicy "${RELEASE}-cluster-bus" \
+                -o jsonpath='{.spec.rules[0].from[0].source.principals[*]}')
+            if [[ ${principals} != *"/sa/${RELEASE}" ]]; then
+                fail "AuthorizationPolicy principals=${principals}, want .../sa/${RELEASE}"
+            fi
+        else
+            if kctl get authorizationpolicy "${RELEASE}-cluster-bus" >/dev/null 2>&1; then
+                fail "AuthorizationPolicy/${RELEASE}-cluster-bus should not render outside cluster mode"
+            fi
+        fi
+        ;;
+esac
+
+# Positive: the fully-correct invocation should succeed.
+log "Positive check"
+if is_on "${SHARD}"; then
+    # The cluster-init Job completing does not mean gossip has converged: a node reports cluster_state:ok only once it has seen the others, so poll.
+    # `|| true` keeps a transient exec failure from aborting the poll under errexit/pipefail; the assert below still fails if ok never shows up.
+    state=fail
+    for _ in $(seq 1 30); do
+        state=$(testbench_exec "${cli_args_good[@]}" cluster info | awk -F: '/^cluster_state:/{print $2}' | tr -d '\r\n' || true)
+        [[ ${state} == ok ]] && break
+        sleep 2
+    done
+    assert_eq "ok" "${state}" "cluster_state"
+
+    # Inspect the topology: exact count + master/slave split.
+    nodes=$(testbench_exec "${cli_args_good[@]}" cluster nodes)
+    actual_nodes=$(printf '%s\n' "${nodes}" | sed '/^$/d' | wc -l | tr -d ' ')
+    assert_eq "${expected_node_count}" "${actual_nodes}" "cluster node count"
+
+    master_count=$(printf '%s\n' "${nodes}" | grep -c 'master' || true)
+    assert_eq "3" "${master_count}" "master count"
+
+    if is_on "${REP}"; then
+        slave_count=$(printf '%s\n' "${nodes}" | grep -c 'slave' || true)
+        assert_eq "3" "${slave_count}" "slave count"
+    fi
+else
+    pong=$(testbench_exec "${cli_args_good[@]}" ping | tr -d '\r\n')
+    assert_eq "PONG" "${pong}" "ping"
+fi
+
+# Negative — auth. No password should be rejected with NOAUTH.
+if is_on "${AUTH}"; then
+    log "Negative check: missing password must be rejected"
+    cli_args_noauth=(valkey-cli -h "valkey.${NAMESPACE}.svc.cluster.local" --no-auth-warning)
+    if is_on "${TLS}"; then
+        cli_args_noauth+=(--tls --cacert /tls/ca.crt)
+    fi
+    if is_on "${SHARD}"; then
+        probe_cmd=(cluster info)
+    else
+        probe_cmd=(ping)
+    fi
+    set +e
+    out=$(testbench_exec "${cli_args_noauth[@]}" "${probe_cmd[@]}" 2>&1)
+    rc=$?
+    set -e
+    if ! grep -qi 'NOAUTH' <<<"${out}"; then
+        fail "expected NOAUTH error, got (rc=${rc}): ${out}"
+    fi
+fi
+
+# Negative — TLS. No --tls at all, and --tls without the CA, must both fail.
+if is_on "${TLS}"; then
+    log "Negative check: plaintext client against TLS server must fail"
+    cli_args_plaintext=(valkey-cli -h "valkey.${NAMESPACE}.svc.cluster.local" --no-auth-warning)
+    if is_on "${AUTH}"; then cli_args_plaintext+=(-a "${AUTH_PASSWORD}"); fi
+    if is_on "${SHARD}"; then probe_cmd=(cluster info); else probe_cmd=(ping); fi
+    set +e
+    out=$(testbench_exec "${cli_args_plaintext[@]}" "${probe_cmd[@]}" 2>&1)
+    rc=$?
+    set -e
+    if (( rc == 0 )); then
+        fail "plaintext client should have failed but succeeded: ${out}"
+    fi
+
+    log "Negative check: TLS client without CA must fail to verify"
+    cli_args_nocacert=(valkey-cli -h "valkey.${NAMESPACE}.svc.cluster.local" --tls --no-auth-warning)
+    if is_on "${AUTH}"; then cli_args_nocacert+=(-a "${AUTH_PASSWORD}"); fi
+    set +e
+    out=$(testbench_exec "${cli_args_nocacert[@]}" "${probe_cmd[@]}" 2>&1)
+    rc=$?
+    set -e
+    if (( rc == 0 )) || ! grep -qi 'certificate verify failed' <<<"${out}"; then
+        fail "expected 'certificate verify failed', got (rc=${rc}): ${out}"
+    fi
+fi
+
+log "PASS: ${SCENARIO}"
diff --git a/functional-tests/setup.sh b/functional-tests/setup.sh
new file mode 100755
index 00000000..3fe57025
--- /dev/null
+++ b/functional-tests/setup.sh
@@ -0,0 +1,168 @@
+#!/usr/bin/env bash
+# Bring up the kind cluster, install Istio (ambient profile), and create the
+# shared fixtures (auth secret, TLS secret, testbench pods) used by
+# every scenario.
+
+HERE=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=lib.sh
+. "${HERE}/lib.sh"
+
+log "Creating kind cluster ${CLUSTER_NAME}"
+if kind get clusters | grep -Fxq "${CLUSTER_NAME}"; then
+    echo "kind cluster '${CLUSTER_NAME}' already exists; reusing"
+else
+    kind create cluster --config "${HERE}/kind-config.yaml" --wait 120s
+fi
+
+log "Installing Istio (ambient profile)"
+if istio_installed; then
+    echo "istio-system namespace already exists; assuming Istio is installed"
+else
+    # `ambient` ships istiod + the ambient data plane (istio-cni DaemonSet
+    # for iptables redirection, ztunnel DaemonSet for node-local HBONE
+    # mTLS). It also installs the sidecar injection webhook, so classic
+    # sidecar-mode pods still work on the same cluster — we can run both
+    # the sidecar matrix and the ambient regressions against one install.
+    istioctl install --context="${KUBE_CONTEXT}" \
+        --set profile=ambient --skip-confirmation
+fi
+
+# Wait for the ambient data plane to be live before launching testbenches.
+# Without this, the first few ambient scenarios race ztunnel startup and
+# the testbench gets no HBONE wrapping.
+if istio_ambient_installed; then
+    log "Waiting for ztunnel DaemonSet to be ready"
+    kubectl --context="${KUBE_CONTEXT}" -n "${ISTIO_NAMESPACE}" \
+        rollout status daemonset/ztunnel --timeout=180s
+fi
+
+# Namespace-level Istio injection intentionally NOT set. The chart now
+# carries per-pod `sidecar.istio.io/inject` and `istio.io/dataplane-mode`
+# labels derived from `istio.enabled` + `istio.mode`, so every workload
+# opts in or out explicitly at the pod layer. Labelling the namespace
+# `istio-injection=enabled` on top would (a) pull every istio=off pod
+# into the sidecar data plane — since namespace injection is inherited
+# unless each pod stamps `sidecar.istio.io/inject=false` to veto it —
+# and (b) blur which layer is actually responsible for mesh capture
+# when troubleshooting. Keep the decision at the pod level, the same as
+# how the chart ships to real operators.
+log "Namespace ${NAMESPACE} left unlabelled — chart controls mesh opt-in at the pod level"
+kubectl --context="${KUBE_CONTEXT}" label namespace "${NAMESPACE}" \
+    istio-injection- istio.io/dataplane-mode- 2>/dev/null || true
+
+log "Creating ${AUTH_SECRET} secret"
+kctl delete secret "${AUTH_SECRET}" --ignore-not-found
+kctl create secret generic "${AUTH_SECRET}" \
+    --from-literal="default=${AUTH_PASSWORD}"
+
+log "Generating self-signed TLS material"
+CERT_DIR=$(mktemp -d)
+trap 'rm -rf -- "${CERT_DIR}"' EXIT
+
+# CA
+openssl req -x509 -nodes -days 365 -newkey rsa:2048 \
+    -keyout "${CERT_DIR}/valkey-ca.key" \
+    -out    "${CERT_DIR}/valkey-ca.crt" \
+    -subj /CN=valkey-ca 2>/dev/null
+
+# Server CSR with SANs the chart's pods present on
+openssl req -nodes -newkey rsa:2048 \
+    -keyout "${CERT_DIR}/valkey-server.key" \
+    -out    "${CERT_DIR}/valkey-server.csr" \
+    -subj "/CN=valkey.${NAMESPACE}.svc.cluster.local" \
+    -addext "subjectAltName=DNS:valkey.${NAMESPACE}.svc.cluster.local,DNS:valkey-headless.${NAMESPACE}.svc.cluster.local,DNS:*.valkey-headless.${NAMESPACE}.svc.cluster.local" \
+    2>/dev/null
+
+openssl x509 -req \
+    -in "${CERT_DIR}/valkey-server.csr" \
+    -CA "${CERT_DIR}/valkey-ca.crt" \
+    -CAkey "${CERT_DIR}/valkey-ca.key" \
+    -CAcreateserial \
+    -out "${CERT_DIR}/valkey-server.crt" \
+    -days 365 \
+    -copy_extensions copyall \
+    2>/dev/null
+
+log "Creating ${TLS_SECRET} secret"
+kctl delete secret "${TLS_SECRET}" --ignore-not-found
+kctl create secret generic "${TLS_SECRET}" \
+    --from-file="server.crt=${CERT_DIR}/valkey-server.crt" \
+    --from-file="server.key=${CERT_DIR}/valkey-server.key" \
+    --from-file="ca.crt=${CERT_DIR}/valkey-ca.crt"
+
+# ---------------------------------------------------------------------------
+# Testbench pods. Three flavours, each expressing its mesh intent via
+# POD-level labels (the namespace is intentionally unlabelled — see the
+# comment at the sidecar-injection step above). The chart's Valkey pods
+# take the same pod-level approach, so the tests exercise the same opt-in
+# path operators use in production.
+#
+#   valkey-testbench          — out of both meshes. Used for istio=off
+#                               scenarios; no mesh labels emitted.
+#   valkey-testbench-injected — Envoy sidecar via per-pod inject=true.
+#                               Used for istio=on mode=sidecar.
+#   valkey-testbench-ambient  — ztunnel-wrapped via
+#                               istio.io/dataplane-mode=ambient.
+#                               Used for istio=on mode=ambient.
+# ---------------------------------------------------------------------------
+# Recreate one testbench pod with ${TLS_SECRET} mounted at /tls, then call wait_for_testbench.
+# $1: pod name   $2: flavour (plain|sidecar|ambient); returns 2 on an unknown flavour.
+launch_testbench() {
+    local pod=$1 flavour=$2 overrides
+    local image=valkey/valkey:9.0.1
+    # All `kubectl run` flags live in one array that is never empty, so expanding
+    # it cannot trip "unbound variable" on bash < 4.4 when `set -u` is active.
+    local run_args=(--image="${image}" --restart=Never)
+    case "${flavour}" in
+        plain)
+            # No mesh labels: with the namespace unlabelled, the default is
+            # already "out of both meshes".
+            ;;
+        sidecar)
+            run_args+=(--labels='sidecar.istio.io/inject=true')
+            ;;
+        ambient)
+            run_args+=(--labels='istio.io/dataplane-mode=ambient')
+            ;;
+        *)
+            echo "launch_testbench: unknown flavour ${flavour}" >&2
+            return 2
+            ;;
+    esac
+    overrides='{
+      "spec": {
+        "containers": [{
+          "name": "'"${pod}"'",
+          "image": "'"${image}"'",
+          "command": ["sleep", "infinity"],
+          "volumeMounts": [{"name": "tls", "mountPath": "/tls", "readOnly": true}]
+        }],
+        "volumes": [{
+          "name": "tls",
+          "secret": {"secretName": "'"${TLS_SECRET}"'"}
+        }]
+      }
+    }'
+    run_args+=(--overrides="${overrides}")
+    # Remove any pod of the same name (waiting for the deletion) before creating it.
+    kctl delete pod "${pod}" --ignore-not-found --wait=true
+    kctl run "${pod}" \
+        "${run_args[@]}" \
+        --command -- sleep infinity
+    wait_for_testbench "${pod}"
+}
+
+log "Launching ${TESTBENCH_POD} (no mesh)"
+launch_testbench "${TESTBENCH_POD}" plain
+
+log "Launching ${TESTBENCH_POD_INJECTED} (Envoy sidecar)"
+launch_testbench "${TESTBENCH_POD_INJECTED}" sidecar
+
+if istio_ambient_installed; then
+    log "Launching ${TESTBENCH_POD_AMBIENT} (ambient / ztunnel)"
+    launch_testbench "${TESTBENCH_POD_AMBIENT}" ambient
+else
+    log "Skipping ${TESTBENCH_POD_AMBIENT} — ambient data plane not installed"
+fi
+
+log "Setup complete"
diff --git a/functional-tests/teardown.sh b/functional-tests/teardown.sh
new file mode 100755
index 00000000..1349abad
--- /dev/null
+++ b/functional-tests/teardown.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# Remove the shared fixtures and (optionally) the kind cluster itself.
+#
+# Usage:
+#   ./teardown.sh           # remove fixtures, keep cluster
+#   ./teardown.sh --cluster # also delete the kind cluster
+
+HERE=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=lib.sh
+. "${HERE}/lib.sh"
+
+# Parse flags. --cluster is the only one accepted; any other argument is
+# reported on stderr and the script exits 2.
+DELETE_CLUSTER=0
+for opt; do
+    [[ ${opt} == --cluster ]] || { echo "unknown arg: ${opt}" >&2; exit 2; }
+    DELETE_CLUSTER=1
+done
+
+if kind get clusters | grep -Fxq "${CLUSTER_NAME}"; then
+    log "Removing fixtures from ${CLUSTER_NAME}"
+    # Best-effort: any lingering release + PVCs.
+    hctl uninstall "${RELEASE}"                                    2>/dev/null || true
+    kctl delete pvc --selector="app.kubernetes.io/instance=${RELEASE}" --ignore-not-found
+    kctl delete pod    "${TESTBENCH_POD}" "${TESTBENCH_POD_INJECTED}" "${TESTBENCH_POD_AMBIENT}" --ignore-not-found
+    kctl delete secret "${AUTH_SECRET}" "${TLS_SECRET}"                --ignore-not-found
+fi
+
+if (( DELETE_CLUSTER )); then
+    log "Deleting kind cluster ${CLUSTER_NAME}"
+    kind delete cluster --name "${CLUSTER_NAME}"
+fi
diff --git a/valkey/Chart.yaml b/valkey/Chart.yaml
index 6d6c7114..ea8eaba4 100644
--- a/valkey/Chart.yaml
+++ b/valkey/Chart.yaml
@@ -3,7 +3,7 @@ name: valkey
 description: A Helm chart for Kubernetes
 type: application
 version: 0.9.4
-appVersion: "9.0.2"
+appVersion: "9.1.0"
 home: https://valkey.io/valkey-helm/
 sources:
   - https://github.com/valkey-io/valkey-helm.git
diff --git a/valkey/README.md b/valkey/README.md
index d069809a..7fea2332 100644
--- a/valkey/README.md
+++ b/valkey/README.md
@@ -58,6 +58,60 @@ replica:
 
 If fewer than `minReplicasToWrite` replicas are available, the master will reject write operations.
 
+### Cluster Mode
+
+Deploy a sharded Valkey cluster for horizontal scaling and high availability:
+
+```bash
+helm install valkey valkey/valkey --set cluster.enabled=true --set cluster.persistence.size=5Gi
+```
+
+**Architecture:**
+
+* Data is automatically sharded across multiple primary nodes (16384 hash slots distributed across shards)
+* Each shard can have replicas for high availability within the shard
+* Total nodes = `shards` × (1 + `replicasPerShard`)
+
+**Default Configuration (6 nodes):**
+
+```yaml
+cluster:
+  enabled: true
+  shards: 3              # Minimum 3 shards required
+  replicasPerShard: 1    # 1 replica per shard
+  persistence:
+    size: 5Gi            # Required
+```
+
+This creates 6 nodes: 3 primary shards + 3 replicas.
+
+**High Availability Configuration (15 nodes):**
+
+```yaml
+cluster:
+  enabled: true
+  shards: 5              # 5 primary shards
+  replicasPerShard: 2    # 2 replicas per shard for extra redundancy
+  persistence:
+    size: 10Gi
+    storageClass: "fast-ssd"
+```
+
+**Services:**
+
+* `valkey`: Main service for client connections (routes to all nodes)
+* `valkey-headless`: Headless service for pod discovery and cluster communication
+
+**Cluster Configuration Options:**
+
+```yaml
+cluster:
+  nodeTimeout: 15000          # Milliseconds before a node is considered failed
+  requireFullCoverage: true   # Require all hash slots covered to accept writes
+  allowReadsWhenDown: false   # Allow reads when cluster is in down state
+  busPort: 16379              # Port for inter-node cluster communication
+```
+
 ## Storage
 
 ### Standalone Storage
@@ -93,6 +147,20 @@ replica:
     storageClass: "fast-ssd"  # Optional
 ```
 
+### Cluster Storage
+
+Persistent storage is **mandatory** in cluster mode. Each node in the cluster maintains its own data partition and cluster state configuration.
+
+```yaml
+cluster:
+  enabled: true
+  persistence:
+    size: 10Gi  # Required
+    storageClass: "fast-ssd"  # Optional
+    accessModes:
+      - ReadWriteOnce
+```
+
 ## Authentication
 
 This chart supports ACL-based authentication for Valkey.
@@ -174,6 +242,35 @@ replica:
 * This user MUST be defined in `auth.aclUsers` with appropriate permissions
 * Minimum permissions: `+psync +replconf +ping`
 
+### Cluster with Authentication
+
+When using ACL authentication in cluster mode, nodes need credentials to authenticate with each other for cluster operations:
+
+```yaml
+auth:
+  enabled: true
+  usersExistingSecret: "my-valkey-users"
+  aclUsers:
+    default:
+      permissions: "~* &* +@all"
+    cluster-user:
+      permissions: "+psync +replconf +ping"
+
+cluster:
+  enabled: true
+  shards: 3
+  replicasPerShard: 1
+  replicationUser: "cluster-user"  # Must be defined in auth.aclUsers
+  persistence:
+    size: 5Gi
+```
+
+**Important Notes:**
+
+* `cluster.replicationUser` specifies which ACL user cluster nodes use to authenticate
+* This user MUST be defined in `auth.aclUsers` with appropriate permissions
+* Minimum permissions: `+psync +replconf +ping`
+
 ## Metrics
 
 This chart supports Prometheus metrics collection using the [Redis exporter](https://github.com/oliver006/redis_exporter).
@@ -349,6 +446,17 @@ tls:
 | replica.persistence.size | string | `""` | Required if replica is enabled |
 | replica.persistence.storageClass | string | `""` |  |
 | replica.persistence.accessModes | list | `""` |  |
+| cluster.enabled | bool | `false` | Enable cluster mode (mutually exclusive with replica.enabled) |
+| cluster.shards | int | `3` | Number of primary shards (minimum 3) |
+| cluster.replicasPerShard | int | `1` | Number of replicas per shard |
+| cluster.replicationUser | string | `"default"` | ACL user for cluster authentication (must be in auth.aclUsers) |
+| cluster.nodeTimeout | int | `15000` | Milliseconds before node is considered failed |
+| cluster.requireFullCoverage | bool | `true` | Require all slots covered to accept writes |
+| cluster.allowReadsWhenDown | bool | `false` | Allow reads when cluster is down |
+| cluster.busPort | int | `16379` | Port for inter-node cluster communication |
+| cluster.persistence.size | string | `""` | Required if cluster is enabled |
+| cluster.persistence.storageClass | string | `""` |  |
+| cluster.persistence.accessModes | list | `["ReadWriteOnce"]` |  |
 | resources | object | `{}` |  |
 | securityContext.capabilities.drop[0] | string | `"ALL"` |  |
 | securityContext.readOnlyRootFilesystem | bool | `true` |  |
diff --git a/valkey/scripts/cluster-init-script.sh b/valkey/scripts/cluster-init-script.sh
new file mode 100644
index 00000000..ff9a96f8
--- /dev/null
+++ b/valkey/scripts/cluster-init-script.sh
@@ -0,0 +1,201 @@
+#!/bin/sh
+set -eu
+
+# --- Configuration & Initial Checks ---
+if [ "${CLUSTER_NODE_COUNT}" -eq "1" ]; then
+    echo "Single node deployment. Skipping cluster initialization"
+    exit 0
+fi
+
+REPLICAS_PER_SHARD=${CLUSTER_REPLICAS_PER_SHARD:-1}
+PRIMARIES=$(( CLUSTER_NODE_COUNT / (1 + REPLICAS_PER_SHARD) ))
+
+{{- if and .Values.auth.enabled .Values.auth.aclUsers }}
+{{- $replUsername := .Values.cluster.replicationUser }}
+{{- $replUser := index .Values.auth.aclUsers $replUsername }}
+{{- $replPasswordKey := $replUser.passwordKey | default $replUsername }}
+{{- if .Values.auth.usersExistingSecret }}
+if [ -f "/valkey-users-secret/{{ $replPasswordKey }}" ]; then
+  REDISCLI_AUTH=$(cat "/valkey-users-secret/{{ $replPasswordKey }}")
+elif [ -f "/valkey-auth-secret/{{ $replUsername }}-password" ]; then
+  REDISCLI_AUTH=$(cat "/valkey-auth-secret/{{ $replUsername }}-password")
+else
+  echo "ERROR: No password found for cluster replication user {{ $replUsername }}" >&2
+  exit 1
+fi
+{{- else }}
+if [ -f "/valkey-auth-secret/{{ $replUsername }}-password" ]; then
+  REDISCLI_AUTH=$(cat "/valkey-auth-secret/{{ $replUsername }}-password")
+else
+  echo "ERROR: No password found for cluster replication user {{ $replUsername }}" >&2
+  exit 1
+fi
+{{- end }}
+# Valkey/Redis clients honour REDISCLI_AUTH, which avoids passing the password
+# on the command line (where it would leak via `ps` and trip over shell
+# metacharacters).
+export REDISCLI_AUTH
+{{- end }}
+
+# vcli: valkey-cli wrapper that adds --tls/--cacert when tls.enabled is set; callers pass only host/port/subcommand.
+# NOTE(review): REDISCLI_AUTH carries only a password and no --user is passed here, so this presumably authenticates as `default` — confirm for a non-default cluster.replicationUser.
+vcli() {
+{{- if .Values.tls.enabled }}
+  valkey-cli --no-auth-warning --tls --cacert "/tls/{{ .Values.tls.caPublicKey }}" "$@"
+{{- else }}
+  valkey-cli --no-auth-warning "$@"
+{{- end }}
+}
+
+echo "Cluster init job starting. Total nodes: ${CLUSTER_NODE_COUNT}, Primaries: ${PRIMARIES}, Replicas per shard: ${REPLICAS_PER_SHARD}"
+
+HEADLESS_SVC="{{ include "valkey.headlessServiceName" . }}"
+NAMESPACE="{{ .Release.Namespace }}"
+CLUSTER_DOMAIN="{{ .Values.clusterDomain }}"
+PORT="{{ .Values.service.port }}"
+FULLNAME="{{ include "valkey.fullname" . }}"
+
+node_host() { echo "${FULLNAME}-$1.${HEADLESS_SVC}.${NAMESPACE}.svc.${CLUSTER_DOMAIN}"; }
+
+# --- Wait for all Valkey nodes to be ready ---
+for i in $(seq 0 $((CLUSTER_NODE_COUNT - 1))); do
+  NODE_HOST=$(node_host "${i}")
+  until vcli -h "${NODE_HOST}" -p "${PORT}" ping 2>/dev/null | grep -q "PONG"; do
+    echo "Waiting for ${NODE_HOST} to be ready..."
+    sleep 2
+  done
+  echo "Node ${NODE_HOST} is ready."
+done
+
+echo "All ${CLUSTER_NODE_COUNT} nodes are ready."
+
+# --- Discover Existing Cluster ---
+HEALTHY_NODE=""
+for i in $(seq 0 $((CLUSTER_NODE_COUNT - 1))); do
+  NODE_HOST=$(node_host "${i}")
+  if vcli -h "${NODE_HOST}" -p "${PORT}" cluster info 2>/dev/null | grep -q "cluster_state:ok"; then
+    HEALTHY_NODE="${NODE_HOST}"
+    echo "Found healthy cluster node: ${HEALTHY_NODE}"
+    break
+  fi
+done
+
+# --- Logic for Joining an Existing Cluster (scaling up) ---
+if [ -n "${HEALTHY_NODE}" ]; then
+  echo "Existing cluster found. Checking for new nodes to add..."
+
+  KNOWN_NODES=$(vcli -h "${HEALTHY_NODE}" -p "${PORT}" cluster nodes 2>/dev/null)
+
+  NEW_NODE_COUNT=0
+  for i in $(seq 0 $((CLUSTER_NODE_COUNT - 1))); do
+    NODE_HOST=$(node_host "${i}")
+    NODE_IP=$(getent hosts "${NODE_HOST}" | awk 'NR == 1 {print $1}')
+    # An unresolved host would reduce the pattern below to ":PORT@", matching every CLUSTER NODES line and silently skipping the node.
+    [ -n "${NODE_IP}" ] || { echo "ERROR: cannot resolve ${NODE_HOST}" >&2; exit 1; }
+    # CLUSTER NODES lists each address as " <ip>:<port>@<bus-port>"; match it as a delimited fixed string so 10.0.0.1 cannot match 110.0.0.1.
+    NODE_ADDR=" ${NODE_IP}:${PORT}@"
+    if echo "${KNOWN_NODES}" | grep -v "fail" | grep -qF "${NODE_ADDR}"; then
+      echo "Node ${NODE_HOST} (${NODE_IP}) already in cluster."
+      continue
+    fi
+    echo "New node found: ${NODE_HOST} (${NODE_IP}). Adding to cluster..."
+    NEW_NODE_COUNT=$((NEW_NODE_COUNT + 1))
+    # Forget any old, failed instance of this node
+    FAILED_NODE_ID=$(echo "${KNOWN_NODES}" | grep -F "${NODE_ADDR}" | grep "fail" | awk '{print $1}' || true)
+    if [ -n "${FAILED_NODE_ID}" ]; then
+      echo "Found node IP (${NODE_IP}) marked as failed with ID ${FAILED_NODE_ID}. Forgetting it..."
+      vcli --cluster call "${HEALTHY_NODE}:${PORT}" cluster forget "${FAILED_NODE_ID}" > /dev/null 2>&1 || true
+      sleep 3
+    fi
+    # Meet the cluster via the new node
+    HEALTHY_NODE_IP=$(getent hosts "${HEALTHY_NODE}" | awk '{print $1}')
+    echo "Sending CLUSTER MEET from ${NODE_HOST} to ${HEALTHY_NODE} (${HEALTHY_NODE_IP})"
+    vcli -h "${NODE_HOST}" -p "${PORT}" cluster meet "${HEALTHY_NODE_IP}" "${PORT}"
+  done
+
+  if [ "${NEW_NODE_COUNT}" -eq 0 ]; then
+    echo "No new nodes to add. Cluster is up to date."
+    exit 0
+  fi
+
+  sleep 5
+
+  # Assign roles: every node that is still a master with no slots is made a replica of some other non-failed master that has fewer than REPLICAS_PER_SHARD replicas (if one exists).
+  for i in $(seq 0 $((CLUSTER_NODE_COUNT - 1))); do
+    NODE_HOST=$(node_host "${i}")
+    NODE_ID=$(vcli -h "${NODE_HOST}" -p "${PORT}" cluster myid)
+
+    # Re-fetch CLUSTER NODES from the healthy node on every iteration, so replica assignments made earlier in this loop are seen once that node has learned of them.
+    CURRENT_NODES=$(vcli -h "${HEALTHY_NODE}" -p "${PORT}" cluster nodes)
+
+    # Empty master = "master" present and nothing in field 9+ (slot ranges). NOTE(review): grep is a substring match, so replica lines naming this ID as their master are selected too — confirm that is harmless.
+    NODE_INFO=$(echo "${CURRENT_NODES}" | grep "${NODE_ID}")
+    IS_MASTER=$(echo "${NODE_INFO}" | grep -c "master" || true)
+    HAS_SLOTS=$(echo "${NODE_INFO}" | awk '{for(i=9;i<=NF;i++) print $i}' | head -1)
+
+    if [ "${IS_MASTER}" -gt 0 ] && [ -z "${HAS_SLOTS}" ]; then
+      echo "Node ${NODE_HOST} is an empty master. Searching for a master to replicate..."
+
+      TARGET_MASTER_ID=$(echo "${CURRENT_NODES}" | awk -v replicas_needed="${REPLICAS_PER_SHARD}" -v my_id="${NODE_ID}" '
+        /master/ && !/fail/ { masters[$1] = 1 }
+        /slave/ && !/fail/ { master_replicas[$4]++ }
+        END {
+          for (master_id in masters) {
+            if ( master_id != my_id && (master_replicas[master_id] < replicas_needed || master_replicas[master_id] == "") ) {
+              print master_id
+              exit
+            }
+          }
+        }
+      ')
+
+      if [ -n "${TARGET_MASTER_ID}" ]; then
+        echo "Found target master ${TARGET_MASTER_ID} that needs a replica."
+        if vcli -h "${NODE_HOST}" -p "${PORT}" cluster replicate "${TARGET_MASTER_ID}"; then
+          echo "Successfully configured ${NODE_HOST} as a replica for ${TARGET_MASTER_ID}."
+        else
+          echo "WARNING: Failed to replicate master ${TARGET_MASTER_ID} from ${NODE_HOST}."
+        fi
+      fi
+    fi
+  done
+
+  # Rebalance if needed
+  echo "Attempting cluster rebalance..."
+
+  PROPAGATION_ATTEMPTS=0
+  MAX_PROPAGATION_ATTEMPTS=60
+  while [ ${PROPAGATION_ATTEMPTS} -lt ${MAX_PROPAGATION_ATTEMPTS} ]; do
+    CLUSTER_STATE=$(vcli -h "${HEALTHY_NODE}" -p "${PORT}" cluster info 2>/dev/null | grep "cluster_state:" | cut -d: -f2 | tr -d '\r\n')
+    if [ "${CLUSTER_STATE}" = "ok" ]; then
+      echo "Cluster state is OK. Proceeding with rebalance."
+      break
+    fi
+    echo "Cluster state is ${CLUSTER_STATE}. Waiting for propagation... (${PROPAGATION_ATTEMPTS}/${MAX_PROPAGATION_ATTEMPTS})"
+    PROPAGATION_ATTEMPTS=$((PROPAGATION_ATTEMPTS + 1))
+    sleep 5
+  done
+
+  vcli --cluster rebalance "${HEALTHY_NODE}:${PORT}" --cluster-use-empty-masters --cluster-yes || true
+
+  echo "Cluster update completed."
+  exit 0
+fi
+
+# --- Create New Cluster ---
+echo "No existing cluster found. Creating new cluster..."
+NODES=""
+for i in $(seq 0 $((CLUSTER_NODE_COUNT - 1))); do
+  NODE_HOST=$(node_host "${i}")
+  NODES="${NODES} ${NODE_HOST}:${PORT}"
+done
+
+# Allow time for cluster-enabled nodes to fully initialize
+sleep 10
+
+echo "Creating cluster with nodes:${NODES}"
+# shellcheck disable=SC2086
+echo "yes" | vcli --cluster create ${NODES} --cluster-replicas "${REPLICAS_PER_SHARD}"
+echo "Cluster created successfully."
+
+exit 0
diff --git a/valkey/scripts/cluster-prestop-script.sh b/valkey/scripts/cluster-prestop-script.sh
new file mode 100644
index 00000000..308caf72
--- /dev/null
+++ b/valkey/scripts/cluster-prestop-script.sh
@@ -0,0 +1,168 @@
+#!/bin/sh
+# preStop hook for cluster-mode Valkey pods: orchestrate an orderly
+# CLUSTER FAILOVER before kubelet sends SIGTERM.
+#
+# Problem this solves
+# -------------------
+# A rollout restart (or any voluntary pod eviction) sends SIGTERM to Valkey
+# and — 30 seconds later by default — SIGKILL. Without a preStop hook, a
+# primary pod dies with open client connections; the TCP sockets close
+# abruptly, connection pools fill with dead handles, the app errors out on
+# every pooled command, and the cluster takes up to cluster-node-timeout
+# (15s default) to promote a replica. That is the behaviour the bug report
+# describes.
+#
+# The fix: before the SIGTERM, detect if this pod is a primary; if so, ask
+# one of its own replicas to run `CLUSTER FAILOVER`. Valkey then performs
+# the canonical orderly handover — the primary pauses new writes, both
+# sides sync replication offsets, the replica promotes, the old primary
+# demotes to replica. Clients with cluster-topology refresh see the new
+# primary immediately via MOVED; existing connections close cleanly as
+# part of the demotion. No SIGTERM-during-write window, no pooled dead
+# connections, no visible blip.
+#
+# No-op paths (deliberately best-effort — a failing preStop must never
+# block pod shutdown; the old abrupt behaviour is still strictly better
+# than hanging in Terminating):
+#   * This pod is already a replica — losing a replica is invisible to
+#     clients, no failover needed.
+#   * Shard has no replicas (cluster.replicasPerShard=0) — nothing to fail
+#     over to, accept the abrupt close as a topology choice.
+#   * This pod has no healthy replica of its own (all its replicas are
+#     marked fail) — skip; FAILOVER would target nothing.
+#   * Any vcli command fails — log and exit 0.
+#
+# Notably NOT a no-op path: cluster_state:fail. That state is expected
+# mid-rollout (slots briefly uncovered between restarts). Skipping the
+# hook there would perpetuate the degraded state by letting every
+# subsequent primary also die abruptly.
+#
+# This script is templated at Helm render time so it can inline the same
+# TLS/auth plumbing the cluster-init script uses. Keeping them separate
+# (rather than a shared sourced helper) is intentional: Helm's text-
+# template model makes shared sh includes fragile, the code is short, and
+# the two scripts evolve independently.
+set -eu
+
+log() { echo "preStop: $*" >&2; }
+
+PORT="{{ .Values.service.port }}"
+TIMEOUT={{ .Values.cluster.preStopFailover.timeoutSeconds }}
+
+# Self-FQDN (matches what init_config.yaml announces via
+# cluster-announce-hostname). Using 127.0.0.1 would work for TCP but
+# break TLS SAN verification — the server cert's SAN lists the FQDN, not
+# the loopback. Same rationale applies to the replica endpoint below.
+SELF_FQDN="${HOSTNAME}.{{ include "valkey.headlessServiceName" . }}.{{ .Release.Namespace }}.svc.{{ .Values.clusterDomain }}"
+
+{{- if and .Values.auth.enabled .Values.auth.aclUsers }}
+{{- $replUsername := .Values.cluster.replicationUser }}
+{{- $replUser := index .Values.auth.aclUsers $replUsername }}
+{{- $replPasswordKey := $replUser.passwordKey | default $replUsername }}
+{{- if .Values.auth.usersExistingSecret }}
+if [ -f "/valkey-users-secret/{{ $replPasswordKey }}" ]; then
+  REDISCLI_AUTH=$(cat "/valkey-users-secret/{{ $replPasswordKey }}")
+elif [ -f "/valkey-auth-secret/{{ $replUsername }}-password" ]; then
+  REDISCLI_AUTH=$(cat "/valkey-auth-secret/{{ $replUsername }}-password")
+else
+  log "no password found for user {{ $replUsername }}; cannot authenticate preStop"
+  exit 0
+fi
+{{- else }}
+if [ -f "/valkey-auth-secret/{{ $replUsername }}-password" ]; then
+  REDISCLI_AUTH=$(cat "/valkey-auth-secret/{{ $replUsername }}-password")
+else
+  log "no password found for user {{ $replUsername }}; cannot authenticate preStop"
+  exit 0
+fi
+{{- end }}
+export REDISCLI_AUTH
+{{- end }}
+
+# valkey-cli wrapper; the TLS flags are decided once, at Helm render time.
+vcli() {
+{{- if .Values.tls.enabled }}
+  set -- --tls --cacert "/tls/{{ .Values.tls.caPublicKey }}" "$@"
+{{- end }}
+  valkey-cli --no-auth-warning "$@"
+}
+
+# cluster_state is deliberately NOT checked here. A rollout restarts pods
+# one at a time, and between restarts this node reports cluster_state:fail
+# until gossip observes the previous pod rejoin -- exactly the window this
+# hook is meant to close. Bailing out on it would defeat the hook: SIGTERM
+# would take this primary's slots offline, the next pod would also see
+# cluster_state:fail, and the degraded state would last for the rest of
+# the rollout. Whether a handover is safe is left to CLUSTER FAILOVER's
+# own preconditions (a healthy, caught-up replica). Only a primary
+# continues past the role check below.
+role=$(vcli -h "${SELF_FQDN}" -p "${PORT}" info replication 2>/dev/null | awk -F: '/^role:/{print $2}' | tr -d '\r\n' || true)
+if [ "${role}" = "slave" ] || [ "${role}" = "replica" ]; then
+  log "role=${role}; no failover needed"
+  exit 0
+fi
+if [ "${role}" != "master" ]; then
+  log "unexpected role=${role:-<unknown>}; not attempting failover"
+  exit 0
+fi
+
+# The node ID is the lookup key for the CLUSTER REPLICAS query further down.
+my_id=$(vcli -h "${SELF_FQDN}" -p "${PORT}" cluster myid 2>/dev/null | tr -d '\r\n' || true)
+case "${my_id}" in
+  "")
+    log "cluster myid empty; not attempting failover"
+    exit 0
+    ;;
+esac
+
+# CLUSTER REPLICAS <my-id> returns a subset of CLUSTER NODES, one line per
+# replica of this primary, in the same eight-field format. We want a live
+# (non-failing), online replica. Field 2 is the announce endpoint
+# "host:port@busport[,hostname]"; Helm sets
+# cluster-preferred-endpoint-type=hostname in init_config.yaml, so the
+# host half is a DNS name that matches the TLS SAN when TLS is enabled.
+replica_line=$(vcli -h "${SELF_FQDN}" -p "${PORT}" cluster replicas "${my_id}" 2>/dev/null \
+  | awk '!/fail/ && NF' \
+  | head -n1 || true)
+if [ -z "${replica_line}" ]; then
+  log "no healthy replica for this primary; skipping failover"
+  exit 0
+fi
+
+endpoint=$(printf '%s\n' "${replica_line}" | awk '{print $2}' | cut -d@ -f1)
+replica_host=${endpoint%:*}
+replica_port=${endpoint##*:}
+
+if [ -z "${replica_host}" ] || [ -z "${replica_port}" ]; then
+  log "could not parse replica endpoint from '${replica_line}'; skipping failover"
+  exit 0
+fi
+
+log "primary ${my_id}; asking replica ${replica_host}:${replica_port} to take over"
+
+# Plain CLUSTER FAILOVER (no FORCE/TAKEOVER) is the graceful path: the
+# replica negotiates with the primary, waits for replication-offset sync,
+# then promotes. If the replica is too far behind or the primary is
+# unreachable, it returns an error — we then exit 0 and let SIGTERM run.
+if ! vcli -h "${replica_host}" -p "${replica_port}" cluster failover 2>/dev/null; then
+  log "CLUSTER FAILOVER rejected; proceeding with abrupt shutdown"
+  exit 0
+fi
+
+# CLUSTER FAILOVER returns OK as soon as the replica accepts the request;
+# the actual role flip is asynchronous. Poll our own INFO until we see
+# role=slave (or give up on TIMEOUT).
+deadline=$(( $(date +%s) + TIMEOUT ))
+while :; do
+  now=$(date +%s)
+  if [ "${now}" -ge "${deadline}" ]; then
+    log "timed out after ${TIMEOUT}s waiting for demotion; proceeding with shutdown"
+    exit 0
+  fi
+  cur_role=$(vcli -h "${SELF_FQDN}" -p "${PORT}" info replication 2>/dev/null | awk -F: '/^role:/{print $2}' | tr -d '\r\n' || true)
+  if [ "${cur_role}" = "slave" ] || [ "${cur_role}" = "replica" ]; then
+    log "demoted to ${cur_role}; handover complete"
+    exit 0
+  fi
+  sleep 1
+done
diff --git a/valkey/templates/NOTES.txt b/valkey/templates/NOTES.txt
index 07ddb6dd..c5ff2bba 100644
--- a/valkey/templates/NOTES.txt
+++ b/valkey/templates/NOTES.txt
@@ -10,7 +10,59 @@ Namespace:    {{ .Release.Namespace }}
 Chart:        {{ .Chart.Name }} {{ .Chart.Version }}
 App version:  {{ .Chart.AppVersion }}
 
-{{- if .Values.replica.enabled }}
+{{- if .Values.cluster.enabled }}
+================================================================================
+🌐 CLUSTER MODE (Sharded)
+================================================================================
+
+Your Valkey deployment is running in CLUSTER mode:
+- {{ .Values.cluster.shards }} Shard(s) (primary nodes)
+- {{ .Values.cluster.replicasPerShard }} Replica(s) per shard
+- {{ include "valkey.clusterNodeCount" . }} Total node(s)
+
+Hash slots (16384 total) are distributed across the {{ .Values.cluster.shards }} shards.
+
+Service:      {{ include "valkey.fullname" . }}
+Type:         {{ .Values.service.type }}
+Port:         {{ .Values.service.port }}
+
+Bus port {{ .Values.cluster.busPort }} is reachable only through the headless
+service — it carries cluster gossip + failover traffic between nodes, so it
+must bypass the round-robin frontend service.
+
+1) In-cluster access
+   From another Pod:
+   $ valkey-cli -h {{ include "valkey.fullname" . }} -p {{ .Values.service.port }}{{ if .Values.tls.enabled }} --tls{{- end }} -c PING
+
+   Note: Use the `-c` flag to enable cluster mode in valkey-cli.
+
+2) Local access via kubectl port-forward
+   $ kubectl -n {{ .Release.Namespace }} port-forward svc/{{ include "valkey.fullname" . }} 6379:{{ .Values.service.port }}
+   In another terminal:
+   $ valkey-cli -h 127.0.0.1 -p 6379{{ if .Values.tls.enabled }} --tls{{- end }} -c PING
+{{ if eq .Values.service.type "LoadBalancer" }}
+3) External access (LoadBalancer)
+   $ export SERVICE_IP=$(kubectl -n {{ .Release.Namespace }} get svc {{ include "valkey.fullname" . }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
+   $ valkey-cli -h $SERVICE_IP -p {{ .Values.service.port }}{{ if .Values.tls.enabled }} --tls{{- end }} -c PING
+{{ else if eq .Values.service.type "NodePort" }}
+3) External access (NodePort)
+   $ export NODE_PORT=$(kubectl -n {{ .Release.Namespace }} get svc {{ include "valkey.fullname" . }} -o jsonpath='{.spec.ports[0].nodePort}')
+   $ export NODE_IP=$(kubectl get nodes -o jsonpath='{.items[0].status.addresses[?(@.type=="InternalIP")].address}')
+   $ valkey-cli -h $NODE_IP -p $NODE_PORT{{ if .Values.tls.enabled }} --tls{{- end }} -c PING
+{{ end }}
+Direct Pod Access (Headless Service):
+{{- /* The pod count comes from valkey.clusterNodeCount -- the helper that
+       also renders the "Total node(s)" line in this section -- so the list
+       and the summary cannot disagree. */}}
+{{- range $i := until (include "valkey.clusterNodeCount" . | int) }}
+  {{ include "valkey.fullname" $ }}-{{ $i }}.{{ include "valkey.headlessServiceName" $ }}.{{ $.Release.Namespace }}.svc.{{ $.Values.clusterDomain }}
+{{- end }}
+
+Cluster Info:
+  $ valkey-cli -h {{ include "valkey.fullname" . }} -p {{ .Values.service.port }}{{ if .Values.tls.enabled }} --tls{{- end }}{{ if .Values.auth.enabled }} --user <user> -a <password>{{ end }} cluster info
+  $ valkey-cli -h {{ include "valkey.fullname" . }} -p {{ .Values.service.port }}{{ if .Values.tls.enabled }} --tls{{- end }}{{ if .Values.auth.enabled }} --user <user> -a <password>{{ end }} cluster nodes
+
+{{- else if .Values.replica.enabled }}
 ================================================================================
 🔄 REPLICATION MODE
 ================================================================================
@@ -99,13 +151,29 @@ Port:         {{ .Values.service.port }}
 {{ end }}
 
 ✅ Quick test
+{{- if .Values.cluster.enabled }}
+$ valkey-cli -h {{ include "valkey.fullname" . }} -p {{ .Values.service.port }}{{ if .Values.tls.enabled }} --tls{{- end }}{{ if .Values.auth.enabled }} --user <user> -a <password>{{ end }} -c
+valkey> SET foo bar
+valkey> GET foo
+"bar"
+valkey> CLUSTER INFO
+{{- else }}
 $ valkey-cli -h {{ include "valkey.fullname" . }} -p {{ .Values.service.port }}{{ if .Values.tls.enabled }} --tls{{- end }}{{ if .Values.auth.enabled }} --user <user> -a <password>{{ end }}
 valkey> SET foo bar
 valkey> GET foo
 "bar"
+{{- end }}
 
 💾 Persistence
-{{- if .Values.replica.enabled }}
+{{- if .Values.cluster.enabled }}
+- Persistence is ENABLED (required for cluster mode). Each node has its own volume.
+- Size: {{ .Values.cluster.persistence.size }}
+{{- if .Values.cluster.persistence.storageClass }}
+- Storage class: {{ .Values.cluster.persistence.storageClass }}
+{{- end }}
+- To see PVCs:
+  $ kubectl -n {{ .Release.Namespace }} get pvc -l app.kubernetes.io/instance={{ .Release.Name }}
+{{- else if .Values.replica.enabled }}
 - Persistence is ENABLED (required for replication mode). Each instance has its own volume.
 - Size: {{ .Values.replica.persistence.size }}
 {{- if .Values.replica.persistence.storageClass }}
diff --git a/valkey/templates/_helpers.tpl b/valkey/templates/_helpers.tpl
index 593cf77c..739ba844 100644
--- a/valkey/templates/_helpers.tpl
+++ b/valkey/templates/_helpers.tpl
@@ -82,19 +82,17 @@ Returns the Valkey exporter container image
 The common image function that renders the container image
 */}}
 {{- define "common.image" -}}
-{{- $registryName := .image.registry }}
-{{- $repositoryName := .image.repository }}
-{{- $tag := .image.tag }}
-{{- if .global }}
-  {{- if .global.imageRegistry }}
-    {{- $registryName = .global.imageRegistry }}
-  {{- end }}
-{{- end }}
-{{- if $registryName }}
-{{- printf "%s/%s:%s" $registryName $repositoryName $tag }}
-{{- else }}
-{{- printf "%s:%s" $repositoryName $tag }}
-{{ end }}
+{{- $registryName := .image.registry -}}
+{{- $repositoryName := .image.repository -}}
+{{- $tag := .image.tag -}}
+{{- if and .global .global.imageRegistry -}}
+{{- $registryName = .global.imageRegistry -}}
+{{- end -}}
+{{- if $registryName -}}
+{{- printf "%s/%s:%s" $registryName $repositoryName $tag -}}
+{{- else -}}
+{{- printf "%s:%s" $repositoryName $tag -}}
+{{- end -}}
 {{- end -}}
 
 {{/*
@@ -188,3 +186,249 @@ Validate replica authentication configuration
 {{- end }}
 {{- end -}}
 
+{{/*
+Validate cluster configuration
+*/}}
+{{- define "valkey.validateClusterConfig" -}}
+{{- if .Values.cluster.enabled }}
+  {{- if .Values.replica.enabled }}
+    {{- fail "cluster.enabled and replica.enabled are mutually exclusive. Please enable only one mode." }}
+  {{- end }}
+  {{- if lt (int .Values.cluster.shards) 3 }}
+    {{- fail "Cluster mode requires at least 3 shards (cluster.shards >= 3) for proper cluster operation." }}
+  {{- end }}
+  {{- if not .Values.cluster.persistence.size }}
+    {{- fail "Cluster mode requires persistent storage. Please set cluster.persistence.size (e.g., '5Gi')" }}
+  {{- end }}
+{{- end }}
+{{- end -}}
+
+{{/*
+Validate cluster authentication configuration
+*/}}
+{{- define "valkey.validateClusterAuth" -}}
+{{- if and .Values.cluster.enabled .Values.auth.enabled }}
+  {{- if not (hasKey .Values.auth.aclUsers .Values.cluster.replicationUser) }}
+    {{- fail (printf "Cluster replication user '%s' (cluster.replicationUser) must be defined in auth.aclUsers. The chart requires this to retrieve the password for cluster authentication." .Values.cluster.replicationUser) }}
+  {{- end }}
+{{- end }}
+{{- end -}}
+
+{{/*
+Total number of Valkey nodes in the cluster: every shard contributes one
+primary plus cluster.replicasPerShard replicas.
+*/}}
+{{- define "valkey.clusterNodeCount" -}}
+{{- $nodesPerShard := add 1 (int .Values.cluster.replicasPerShard) -}}
+{{- mul (int .Values.cluster.shards) $nodesPerShard -}}
+{{- end -}}
+
+{{/*
+Istio pod labels. Emits the labels that tell Istio exactly how to capture
+this pod's traffic, so the chart works whether or not the namespace carries
+`istio-injection=enabled` or `istio.io/dataplane-mode=ambient` — and, just
+as importantly, so that toggling `istio.mode` on a dual-mode cluster moves
+pods between data planes cleanly.
+
+Sidecar mode:
+  sidecar.istio.io/inject: "true"   — force Envoy injection even if the
+                                       namespace lacks the injection label.
+  istio.io/dataplane-mode: none     — veto ambient capture, so a cluster
+                                       that ALSO runs ambient (e.g. during
+                                       a sidecar→ambient migration) does
+                                       not double-redirect this pod.
+
+Ambient mode:
+  istio.io/dataplane-mode: ambient  — ztunnel captures this pod's traffic.
+  sidecar.istio.io/inject: "false"  — veto Envoy injection even if the
+                                       namespace has the injection label,
+                                       so the pod isn't simultaneously
+                                       sidecar'd (which double-redirects
+                                       and silently breaks mTLS, surfacing
+                                       as "Connection reset by peer" on
+                                       every request).
+
+Either mode by itself is enough; emitting both (per mode) makes pod-level
+intent the source of truth and eliminates the cluster-configuration
+dependency that's easy to miss at install time.
+
+When istio.enabled is false this helper emits nothing so the user remains
+free to pick their own opt-in/out via podLabels (see the istio=off
+functional-tests path).
+*/}}
+{{- define "valkey.istioPodLabels" -}}
+{{- if .Values.istio.enabled -}}
+{{- if ne (.Values.istio.mode | default "sidecar") "ambient" -}}
+sidecar.istio.io/inject: "true"
+istio.io/dataplane-mode: none
+{{- else -}}
+istio.io/dataplane-mode: ambient
+sidecar.istio.io/inject: "false"
+{{- end -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Compute the merged pod labels map: selector + common + chart-computed mesh
+labels + user podLabels (user wins on collision). Emits the merged dict as
+YAML so the rendered output has no duplicate keys, even when a user sets
+e.g. `sidecar.istio.io/inject=false` via podLabels alongside
+`istio.enabled=true`.
+*/}}
+{{- define "valkey.podLabels" -}}
+{{- $selector := fromYaml (include "valkey.selectorLabels" .) -}}
+{{- $common   := .Values.commonLabels | default dict -}}
+{{- $mesh     := fromYaml (include "valkey.istioPodLabels" .) | default dict -}}
+{{- $user     := .Values.podLabels   | default dict -}}
+{{- toYaml (mergeOverwrite $selector $common $mesh $user) -}}
+{{- end -}}
+
+{{/*
+Job-pod labels: the data-pod label set plus one extra layer. The base is
+whatever valkey.podLabels renders (selector + common + mesh + user
+podLabels, already merged in that order), and
+`cluster.initJob.podLabels` is applied on top so it wins on collision.
+Lets operators veto a globally-injected metrics/observability sidecar on
+the cluster-init Job -- which is a short-lived, exit-on-success batch
+task -- without having to disable the same injector for the long-running
+data pods. Building on the data-pod helper keeps the two label sets from
+drifting apart; mergeOverwrite still provides the no-duplicate-keys
+guarantee.
+*/}}
+{{- define "valkey.initJobPodLabels" -}}
+{{- $base    := fromYaml (include "valkey.podLabels" .) -}}
+{{- $jobUser := (.Values.cluster.initJob).podLabels | default dict -}}
+{{- toYaml (mergeOverwrite $base $jobUser) -}}
+{{- end -}}
+
+{{/*
+Job-pod annotations: same shape as the global .Values.podAnnotations,
+with `cluster.initJob.podAnnotations` merged on top so it wins on
+collision. Same opt-out rationale as valkey.initJobPodLabels — some
+sidecar injectors read annotations rather than labels.
+
+Emits nothing when the merged map is empty so the Job's metadata block
+collapses cleanly (Helm/`with` semantics expect an absent key, not an
+empty mapping, to skip).
+*/}}
+{{- define "valkey.initJobPodAnnotations" -}}
+{{- $global := .Values.podAnnotations | default dict -}}
+{{- $job    := (.Values.cluster.initJob).podAnnotations | default dict -}}
+{{- $merged := mergeOverwrite (deepCopy $global) $job -}}
+{{- if $merged -}}
+{{- toYaml $merged -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Probe shell command. Returns the "sh -c" argument that pings valkey-server
+locally; the set of replies that count as healthy is parameterised.
+
+Args (passed as a dict):
+  ctx           — the parent context (.) so we can read .Values.tls
+  acceptLoading — whether to treat 'LOADING' as healthy
+
+Replies to PING are one of:
+  PONG          — fully up, dataset loaded
+  NOAUTH …      — up, requires auth (treat as proof of liveness — the
+                  server is fully serving, we just lack credentials)
+  LOADING …     — TCP listener is up but the dataset is being read from
+                  RDB/AOF; the server cannot serve traffic yet
+
+The three probes have different jobs and therefore different LOADING
+policies:
+
+  startupProbe (acceptLoading=false): the gate that holds liveness and
+    readiness off until the pod is actually serving. If startupProbe
+    accepted LOADING it would pass the moment the TCP listener opens,
+    kubelet would switch to liveness/readiness immediately, and the
+    gate would do nothing useful. Operators with multi-GB RDBs bump
+    `startupProbe.failureThreshold` to extend the load window — the
+    canonical Kubernetes pattern for slow loaders.
+
+  livenessProbe (acceptLoading=true): runs only AFTER startupProbe
+    passes. After that point, LOADING almost always means a full-resync
+    from primary is in progress (replica fell behind, replication
+    backlog overflowed, etc.). Killing the pod here loses the in-flight
+    download work and forces yet another full resync, perpetuating the
+    very condition the kill was supposed to escape. A pod stuck loading
+    forever is rare and harmless compared to the kill-loop, so accept
+    LOADING and let the load complete.
+
+  readinessProbe (acceptLoading=false): decides whether the pod is in
+    the Service endpoint set. A LOADING pod can't serve traffic, so it
+    must be removed from the rotation until it's truly ready. This
+    leaves the pod 'Running 0/1' during full-resync — exactly right.
+*/}}
+{{- define "valkey.probeShellCommand" -}}
+{{- $tls := .ctx.Values.tls -}}
+{{- $healthy := list "PONG" "NOAUTH" -}}
+{{- if .acceptLoading -}}
+{{- $healthy = append $healthy "LOADING" -}}
+{{- end -}}
+{{- $cli := "valkey-cli" -}}
+{{- if $tls.enabled -}}
+{{- $cli = printf "valkey-cli --tls --cacert /tls/%s" $tls.caPublicKey -}}
+{{- end -}}
+{{- printf "%s ping 2>&1 | grep -qE '%s'" $cli (join "|" $healthy) -}}
+{{- end -}}
+
+{{/*
+The valkey ServiceAccount name as an Istio SPIFFE principal.
+Used by the AuthorizationPolicy to pin the cluster-bus port to same-release
+pods cryptographically rather than by pod-selector IP.
+*/}}
+{{- define "valkey.istioPrincipal" -}}
+{{- $trustDomain := .Values.istio.trustDomain | default "cluster.local" -}}
+{{- printf "%s/ns/%s/sa/%s" $trustDomain .Release.Namespace (include "valkey.serviceAccountName" .) -}}
+{{- end -}}
+
+{{/*
+Validate istio configuration. Runs regardless of istio.enabled so a typo in
+istio.mode (e.g. `mode: ambiet` buried in a GitOps values file) surfaces at
+template time instead of silently rendering the sidecar-only code paths.
+*/}}
+{{- define "valkey.validateIstioConfig" -}}
+{{- if hasKey .Values.istio "mode" }}
+  {{- if not (or (eq .Values.istio.mode "sidecar") (eq .Values.istio.mode "ambient")) }}
+    {{- fail (printf "istio.mode must be 'sidecar' or 'ambient', got: %s" .Values.istio.mode) }}
+  {{- end }}
+{{- end }}
+{{- /*
+Guard against the silent-no-protection footgun for the cluster bus port.
+With istio enabled in ambient mode AND cluster mode on, the NetworkPolicy
+is skipped (see the failure message below: it would block HBONE), so the
+AuthorizationPolicy is the only layer left in front of the bus port.
+Turning it off as well would leave the port open to any pod that can
+route to it. The feature's whole point is cross-release isolation, so the
+chart fails closed rather than render that combination. This check has no
+override: either keep `istio.authorizationPolicy.enabled=true`, or switch
+to `istio.mode=sidecar` and rely on the NetworkPolicy instead. An unset
+`istio.mode` is treated as "sidecar", matching valkey.istioPodLabels.
+*/}}
+{{- if and .Values.istio.enabled (eq (.Values.istio.mode | default "sidecar") "ambient") .Values.cluster.enabled }}
+  {{- if not .Values.istio.authorizationPolicy.enabled }}
+    {{- fail "istio.authorizationPolicy.enabled=false in ambient mode + cluster mode leaves the cluster-bus port unprotected: the NetworkPolicy is skipped for ambient (it would block HBONE), and disabling the AuthorizationPolicy removes the only remaining cross-release isolation layer. Re-enable istio.authorizationPolicy.enabled, or switch to istio.mode=sidecar if you intend to rely on the NetworkPolicy." }}
+  {{- end }}
+{{- end }}
+{{- /*
+Guard against the shared-ServiceAccount footgun. The AuthorizationPolicy
+uses the SPIFFE principal `<trust-domain>/ns/<ns>/sa/<sa>` to scope the bus
+port to same-release pods. If two releases in the same namespace share a SA
+(e.g. both use `serviceAccount.create=false` with the namespace default, or
+both explicitly set the same `serviceAccount.name`), their APs encode the
+SAME principal — cross-release MEET passes the identity check and the
+clusters silently merge. The chart cannot detect other releases at template
+time, but it can surface the risk: refuse the obviously-unsafe case
+(`serviceAccount.create=false` with no explicit name, i.e. the shared
+`default` SA) whenever the AP is rendered. Users who deliberately share
+a named SA across releases can still do so; they just have to type it.
+*/}}
+{{- if and .Values.istio.enabled .Values.istio.authorizationPolicy.enabled .Values.cluster.enabled }}
+  {{- if and (not .Values.serviceAccount.create) (not .Values.serviceAccount.name) }}
+    {{- fail "istio.authorizationPolicy gives cross-release cluster-bus isolation by scoping the bus port to a SPIFFE principal built from the pod's ServiceAccount. With serviceAccount.create=false AND serviceAccount.name empty, the chart falls back to the namespace's 'default' ServiceAccount — which every other release using the same fallback ALSO maps to, so the AuthorizationPolicy cannot distinguish them and cross-release CLUSTER MEET succeeds. Either set serviceAccount.create=true (per-release SA) or serviceAccount.name=<distinct-name>." }}
+  {{- end }}
+{{- end }}
+{{- end -}}
+
+
diff --git a/valkey/templates/cluster-init-job.yaml b/valkey/templates/cluster-init-job.yaml
new file mode 100644
index 00000000..c875ef54
--- /dev/null
+++ b/valkey/templates/cluster-init-job.yaml
@@ -0,0 +1,120 @@
+{{- if .Values.cluster.enabled }}
+{{- include "valkey.validateAuthConfig" . }}
+{{- include "valkey.validateClusterConfig" . }}
+{{- include "valkey.validateClusterAuth" . }}
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{ include "valkey.fullname" . }}-cluster-init
+  labels:
+    {{- include "valkey.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/hook": post-install,post-upgrade
+    "helm.sh/hook-weight": "0"
+    "helm.sh/hook-delete-policy": before-hook-creation
+spec:
+  backoffLimit: 6
+  ttlSecondsAfterFinished: {{ .Values.cluster.initJob.ttlSecondsAfterFinished }}
+  template:
+    metadata:
+      labels:
+        {{- /*
+        Job-scoped label set (see valkey.initJobPodLabels helper). Same
+        layering as the data-pod helper, plus `cluster.initJob.podLabels`
+        on top so operators can opt the short-lived batch Job out of
+        global label-driven injectors (sidecar metrics agents, etc.)
+        without affecting the long-running data pods. In ambient mode
+        the dataplane-mode label is still emitted automatically so
+        ztunnel captures the Job's outbound connections; in sidecar mode
+        sidecar.istio.io/inject=true is emitted so the Job works on
+        namespaces that don't carry the injection label.
+        */}}
+        {{- include "valkey.initJobPodLabels" . | nindent 8 }}
+      {{- $annotations := include "valkey.initJobPodAnnotations" . }}
+      {{- with $annotations }}
+      annotations:
+        {{- . | nindent 8 }}
+      {{- end }}
+    spec:
+      {{- (include "valkey.imagePullSecrets" .) | nindent 6 }}
+      automountServiceAccountToken: false
+      serviceAccountName: {{ include "valkey.serviceAccountName" . }}
+      restartPolicy: OnFailure
+      {{- if .Values.priorityClassName }}
+      priorityClassName: {{ .Values.priorityClassName | quote }}
+      {{- end }}
+      securityContext:
+      {{- toYaml .Values.podSecurityContext | nindent 8 }}
+      containers:
+        - name: cluster-init
+          image: {{ include "valkey.image" . }}
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          securityContext:
+            {{- toYaml .Values.securityContext | nindent 12 }}
+          command: [ "/bin/sh", "/cluster-script/init-cluster.sh" ]
+          env:
+            - name: CLUSTER_NODE_COUNT
+              value: {{ include "valkey.clusterNodeCount" . | quote }}
+            - name: CLUSTER_REPLICAS_PER_SHARD
+              value: {{ .Values.cluster.replicasPerShard | quote }}
+          {{- with .Values.initResources }}
+          resources:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          volumeMounts:
+            - name: cluster-script
+              mountPath: /cluster-script
+            {{- if .Values.tls.enabled }}
+            - name: {{ include "valkey.fullname" . }}-tls
+              mountPath: /tls
+            {{- end }}
+            {{- if .Values.auth.enabled }}
+            {{- if .Values.auth.usersExistingSecret }}
+            - name: valkey-users-secret
+              mountPath: /valkey-users-secret
+              readOnly: true
+            {{- end }}
+            {{- if or (include "valkey.hasInlinePasswords" . | eq "true") .Values.auth.aclConfig }}
+            - name: valkey-auth-secret
+              mountPath: /valkey-auth-secret
+              readOnly: true
+            {{- end }}
+            {{- end }}
+      volumes:
+        - name: cluster-script
+          configMap:
+            name: {{ include "valkey.fullname" . }}-cluster-script
+            defaultMode: 0555
+        {{- if .Values.tls.enabled }}
+        - name: {{ include "valkey.fullname" . }}-tls
+          secret:
+            secretName: {{ required "An existing secret is required to enable TLS" .Values.tls.existingSecret }}
+            defaultMode: 0400
+        {{- end }}
+        {{- if .Values.auth.enabled }}
+        {{- if .Values.auth.usersExistingSecret }}
+        - name: valkey-users-secret
+          secret:
+            secretName: {{ .Values.auth.usersExistingSecret }}
+            defaultMode: 0400
+        {{- end }}
+        {{- if or (include "valkey.hasInlinePasswords" . | eq "true") .Values.auth.aclConfig }}
+        - name: valkey-auth-secret
+          secret:
+            secretName: {{ include "valkey.fullname" . }}-auth
+            defaultMode: 0400
+        {{- end }}
+        {{- end }}
+      {{- with .Values.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+{{- end }}
diff --git a/valkey/templates/cluster-isolation-netpol.yaml b/valkey/templates/cluster-isolation-netpol.yaml
new file mode 100644
index 00000000..3b51f42d
--- /dev/null
+++ b/valkey/templates/cluster-isolation-netpol.yaml
@@ -0,0 +1,83 @@
+{{- /*
+Cluster-bus isolation NetworkPolicy.
+
+Valkey's gossip/cluster-bus protocol has no authentication of its own: a pod
+that can open a TCP connection to a node's bus port (default 16379) can send
+CLUSTER MEET and merge into the cluster. When two independent Valkey clusters
+share a Kubernetes cluster (or even a namespace), nothing in Valkey itself
+stops an accidental or malicious MEET from fusing them.
+
+This policy pins the bus port INBOUND to same-release traffic only, by
+matching on `app.kubernetes.io/instance`. Blocking the receiving side of
+the MEET handshake is sufficient: the handshake is bidirectional, so with
+the receiver refusing connections, the placeholder node is evicted by the
+cluster-node-timeout and the two clusters stay separate.
+
+Client (6379) and metrics (9121) ports stay open — they're application-level
+and have their own auth (ACL/TLS).
+
+We deliberately do NOT set an Egress policyType. Adding Egress here would
+require enumerating every destination a Valkey pod legitimately needs to
+reach (kube-dns, Istio's xDS on istiod:15012, Envoy's health port, JWKS
+endpoints for Istio AuthorizationPolicy, and so on); getting that wrong
+breaks Istio sidecar bootstrap. Users who want egress isolation on top of
+this should add an Istio AuthorizationPolicy (when they have Istio) or a
+separate NetworkPolicy targeting `valkey.selectorLabels` — Kubernetes
+combines those additively with this one.
+
+Kubernetes policies are additive: adding this one alongside the user-defined
+`networkPolicy` value still allows the user's ingress/egress rules to match.
+
+Running on a CNI that doesn't enforce NetworkPolicy (plain Flannel, the
+in-tree kubenet, etc.) makes this rendered policy a no-op. There is no
+namespace-based fallback — pod-to-pod traffic crosses namespaces freely
+unless something actually enforces policy at the data plane. On such a
+cluster there is no way to prevent a cross-release CLUSTER MEET from the
+chart alone; either switch to a policy-enforcing CNI, add an Istio
+AuthorizationPolicy at layer 7, or run each Valkey cluster in its own
+Kubernetes cluster.
+
+Ambient mesh caveat: in ambient mode, ztunnel wraps all pod-to-pod traffic
+in HBONE on port 15008, then unwraps it at the destination and re-delivers
+to the pod-local port. A NetworkPolicy that only allows ingress on 6379 /
+16379 / 9121 drops the inbound HBONE — the client port gets blocked at
+the policy layer and every connection fails with "Connection reset by
+peer". The chart-owned AuthorizationPolicy already provides equivalent
+(and stronger, identity-based) bus-port scoping for ambient, so we skip
+this NetworkPolicy entirely when istio.mode=ambient. Users who still want
+a belt-and-braces IP-level NetworkPolicy in ambient can add their own via
+.Values.networkPolicy (rendered by netpolicy.yaml) and include port 15008.
+*/}}
+{{- if and .Values.cluster.enabled .Values.cluster.isolation.enabled (not (and .Values.istio.enabled (eq .Values.istio.mode "ambient"))) }}
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: {{ include "valkey.fullname" . }}-cluster-isolation
+  labels:
+    {{- include "valkey.labels" . | nindent 4 }}
+spec:
+  podSelector:
+    matchLabels:
+      {{- include "valkey.selectorLabels" . | nindent 6 }}
+  policyTypes:
+    - Ingress
+  ingress:
+    # Bus port: only other pods of THIS release.
+    - from:
+        - podSelector:
+            matchLabels:
+              {{- include "valkey.selectorLabels" . | nindent 14 }}
+      ports:
+        - protocol: TCP
+          port: {{ .Values.cluster.busPort }}
+    # Client port: anyone. ACL + TLS guard it above the network layer.
+    - ports:
+        - protocol: TCP
+          port: {{ .Values.service.port }}
+    {{- if .Values.metrics.enabled }}
+    # Metrics sidecar: anyone (typically Prometheus).
+    - ports:
+        - protocol: TCP
+          port: {{ .Values.metrics.exporter.port }}
+    {{- end }}
+{{- end }}
diff --git a/valkey/templates/cluster-script.yaml b/valkey/templates/cluster-script.yaml
new file mode 100644
index 00000000..bd9fcdd7
--- /dev/null
+++ b/valkey/templates/cluster-script.yaml
@@ -0,0 +1,13 @@
+{{- if .Values.cluster.enabled }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "valkey.fullname" . }}-cluster-script
+  labels:
+    {{- include "valkey.labels" . | nindent 4 }}
+data:
+  init-cluster.sh: |-
+{{ tpl (.Files.Get "scripts/cluster-init-script.sh") . | indent 4 }}
+  prestop.sh: |-
+{{ tpl (.Files.Get "scripts/cluster-prestop-script.sh") . | indent 4 }}
+{{- end }}
diff --git a/valkey/templates/cluster-statefulset.yaml b/valkey/templates/cluster-statefulset.yaml
new file mode 100644
index 00000000..5dad0344
--- /dev/null
+++ b/valkey/templates/cluster-statefulset.yaml
@@ -0,0 +1,375 @@
+{{- if .Values.cluster.enabled }}
+{{- include "valkey.validateAuthConfig" . }}
+{{- include "valkey.validateClusterConfig" . }}
+{{- include "valkey.validateClusterAuth" . }}
+{{- include "valkey.validateIstioConfig" . }}
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: {{ include "valkey.fullname" . }}
+  labels:
+    {{- include "valkey.labels" . | nindent 4 }}
+  {{- with .Values.workloadAnnotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  serviceName: {{ include "valkey.fullname" . }}-headless
+  replicas: {{ include "valkey.clusterNodeCount" . }}
+  podManagementPolicy: Parallel
+  {{- if .Values.cluster.persistentVolumeClaimRetentionPolicy }}
+  persistentVolumeClaimRetentionPolicy:
+    {{- toYaml .Values.cluster.persistentVolumeClaimRetentionPolicy | nindent 4 }}
+  {{- end }}
+  selector:
+    matchLabels:
+      {{- include "valkey.selectorLabels" . | nindent 6 }}
+  volumeClaimTemplates:
+  - metadata:
+      name: valkey-data
+    spec:
+      accessModes: {{ toYaml .Values.cluster.persistence.accessModes | nindent 8 }}
+      {{- if .Values.cluster.persistence.storageClass }}
+      storageClassName: {{ .Values.cluster.persistence.storageClass | quote }}
+      {{- end }}
+      resources:
+        requests:
+          storage: {{ .Values.cluster.persistence.size | quote }}
+  template:
+    metadata:
+      labels:
+        {{- /*
+        Single merged label set: selector + commonLabels + chart-computed
+        mesh labels + user podLabels (user wins on collision). Keeps the
+        rendered YAML free of duplicate keys when e.g. a user sets
+        sidecar.istio.io/inject=false via podLabels alongside
+        istio.enabled=true.
+        */}}
+        {{- include "valkey.podLabels" . | nindent 8 }}
+      annotations:
+      {{- with .Values.podAnnotations }}
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+        checksum/initconfig: {{ include (print $.Template.BasePath "/init_config.yaml") . | sha256sum | trunc 32 | quote }}
+        {{- if .Values.valkeyConfig }}
+        checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum | trunc 32 | quote }}
+        {{- end }}
+        {{- /*
+        The cluster-bus port (16379 by default) carries raw Valkey gossip: a
+        binary, long-lived, bidirectional protocol that neither Envoy nor
+        ztunnel can proxy sensibly. The chart keeps it unproxied in both
+        modes, but the mechanics differ:
+
+          sidecar — explicit: Envoy sees the port in its iptables rules, so
+                    we emit traffic.sidecar.istio.io/exclude*Ports to take
+                    it out. The AuthorizationPolicy (rendered separately)
+                    does the cross-release enforcement via Envoy-terminated
+                    mTLS on the OTHER ports.
+
+          ambient — implicit: ztunnel only captures traffic for pods it
+                    recognises, and the AuthorizationPolicy's ALLOW rules
+                    only bind the client/metrics ports. That leaves the bus
+                    port outside ztunnel's HBONE tunnel; pod-to-pod TCP on
+                    16379 takes the direct CNI path. No annotation needed
+                    (they're sidecar-only).
+        */}}
+        {{- if and .Values.istio.enabled (eq .Values.istio.mode "sidecar") }}
+        traffic.sidecar.istio.io/excludeInboundPorts: {{ .Values.cluster.busPort | quote }}
+        traffic.sidecar.istio.io/excludeOutboundPorts: {{ .Values.cluster.busPort | quote }}
+        {{- end }}
+    spec:
+      {{- (include "valkey.imagePullSecrets" .) | nindent 6 }}
+      automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
+      serviceAccountName: {{ include "valkey.serviceAccountName" . }}
+      {{- if .Values.priorityClassName }}
+      priorityClassName: {{ .Values.priorityClassName | quote }}
+      {{- end }}
+      securityContext:
+      {{- toYaml .Values.podSecurityContext | nindent 8 }}
+      initContainers:
+        - name: {{ include "valkey.fullname" . }}-init
+          image: {{ include "valkey.image" . }}
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          {{- with .Values.securityContext }}
+          securityContext:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          command: [ "/scripts/init.sh" ]
+          env:
+            - name: POD_INDEX
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.labels['apps.kubernetes.io/pod-index']
+            - name: CLUSTER_SHARDS
+              value: {{ .Values.cluster.shards | quote }}
+            - name: CLUSTER_REPLICAS_PER_SHARD
+              value: {{ .Values.cluster.replicasPerShard | quote }}
+          volumeMounts:
+            - name: valkey-data
+              mountPath: /data
+            - name: scripts
+              mountPath: /scripts
+            {{- if .Values.valkeyConfig }}
+            - name: valkey-config
+              mountPath: /usr/local/etc/valkey/valkey.conf
+              subPath: valkey.conf
+            {{- end }}
+            {{- if .Values.auth.enabled }}
+            - name: valkey-acl
+              mountPath: /etc/valkey
+            {{- if .Values.auth.usersExistingSecret }}
+            - name: valkey-users-secret
+              mountPath: /valkey-users-secret
+              readOnly: true
+            {{- end }}
+            {{- if or (include "valkey.hasInlinePasswords" . | eq "true") .Values.auth.aclConfig }}
+            - name: valkey-auth-secret
+              mountPath: /valkey-auth-secret
+              readOnly: true
+            {{- end }}
+            {{- end }}
+          {{- with .Values.initResources }}
+          resources:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+      {{- with .Values.extraInitContainers }}
+      {{- toYaml . | nindent 8 }}
+      {{- end }}
+      containers:
+        - name: {{ include "valkey.fullname" . }}
+          image: {{ include "valkey.image" . }}
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          command: [ "valkey-server" ]
+          args: [ "/data/conf/valkey.conf" ]
+          securityContext:
+            {{- toYaml .Values.securityContext | nindent 12 }}
+          env:
+            {{- range $key, $val := .Values.env }}
+            - name: {{ $key }}
+              value: "{{ $val }}"
+            {{- end }}
+            - name: VALKEY_LOGLEVEL
+              value: "{{ .Values.valkeyLogLevel }}"
+          ports:
+            - name: tcp
+              containerPort: {{ .Values.service.port }}
+              protocol: TCP
+            - name: tcp-bus
+              containerPort: {{ .Values.cluster.busPort }}
+              protocol: TCP
+          {{- $strictCmd := include "valkey.probeShellCommand" (dict "ctx" . "acceptLoading" false) }}
+          {{- $loadCmd   := include "valkey.probeShellCommand" (dict "ctx" . "acceptLoading" true)  }}
+          startupProbe:
+            exec:
+              command: [ "sh", "-c", {{ $strictCmd | quote }} ]
+            initialDelaySeconds: 5
+            periodSeconds: {{ .Values.cluster.startupProbe.periodSeconds }}
+            timeoutSeconds: {{ .Values.cluster.startupProbe.timeoutSeconds }}
+            failureThreshold: {{ .Values.cluster.startupProbe.failureThreshold }}
+          livenessProbe:
+            exec:
+              command: [ "sh", "-c", {{ $loadCmd | quote }} ]
+            periodSeconds: {{ .Values.cluster.livenessProbe.periodSeconds }}
+            timeoutSeconds: {{ .Values.cluster.livenessProbe.timeoutSeconds }}
+            failureThreshold: {{ .Values.cluster.livenessProbe.failureThreshold }}
+          readinessProbe:
+            exec:
+              command: [ "sh", "-c", {{ $strictCmd | quote }} ]
+            periodSeconds: {{ .Values.cluster.readinessProbe.periodSeconds }}
+            timeoutSeconds: {{ .Values.cluster.readinessProbe.timeoutSeconds }}
+            failureThreshold: {{ .Values.cluster.readinessProbe.failureThreshold }}
+          {{- if and (gt (int .Values.cluster.replicasPerShard) 0) .Values.cluster.preStopFailover.enabled }}
+          lifecycle:
+            # Graceful CLUSTER FAILOVER on primary-pod shutdown. Gated on
+            # replicasPerShard>0 (no replica to hand over to otherwise —
+            # the hook would no-op and just eat grace-period budget). The
+            # script itself is best-effort and never blocks SIGTERM.
+            preStop:
+              exec:
+                command: [ "/bin/sh", "/cluster-script/prestop.sh" ]
+          {{- end }}
+          resources:
+            {{- toYaml .Values.resources | nindent 12 }}
+          volumeMounts:
+            - name: valkey-data
+              mountPath: /data
+            {{- if and (gt (int .Values.cluster.replicasPerShard) 0) .Values.cluster.preStopFailover.enabled }}
+            - name: cluster-script
+              mountPath: /cluster-script
+            {{- end }}
+            {{- if .Values.tls.enabled }}
+            - name: {{ include "valkey.fullname" . }}-tls
+              mountPath: /tls
+            {{- end }}
+            {{- if .Values.auth.enabled }}
+            - name: valkey-acl
+              mountPath: /etc/valkey
+            {{- if .Values.auth.usersExistingSecret }}
+            - name: valkey-users-secret
+              mountPath: /valkey-users-secret
+              readOnly: true
+            {{- end }}
+            {{- if or (include "valkey.hasInlinePasswords" . | eq "true") .Values.auth.aclConfig }}
+            - name: valkey-auth-secret
+              mountPath: /valkey-auth-secret
+              readOnly: true
+            {{- end }}
+            {{- end }}
+            {{- range $secret := .Values.extraValkeySecrets }}
+            - name: {{ $secret.name }}-valkey
+              mountPath: {{ $secret.mountPath }}
+            {{- end }}
+            {{- range $config := .Values.extraValkeyConfigs }}
+            - name: {{ $config.name }}-valkey
+              mountPath: {{ $config.mountPath }}
+            {{- end }}
+            {{- with .Values.extraVolumeMounts }}
+            {{- toYaml . | nindent 12 }}
+            {{- end }}
+        {{- if .Values.metrics.enabled }}
+        - name: metrics
+          image: {{ include "valkey.metrics.exporter.image" . }}
+          imagePullPolicy: {{ .Values.metrics.exporter.image.pullPolicy | quote }}
+          {{- with .Values.metrics.exporter.securityContext }}
+          securityContext:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .Values.metrics.exporter.command }}
+          command:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .Values.metrics.exporter.args }}
+          args:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          ports:
+            - name: metrics
+              containerPort: {{ .Values.metrics.exporter.port }}
+          startupProbe:
+            tcpSocket:
+              port: metrics
+          livenessProbe:
+            tcpSocket:
+              port: metrics
+          readinessProbe:
+            httpGet:
+              path: /
+              port: metrics
+          {{- with .Values.metrics.exporter.resources }}
+          resources:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .Values.metrics.exporter.extraVolumeMounts }}
+          volumeMounts:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          env:
+            - name: REDIS_ALIAS
+              value: {{ include "valkey.fullname" . }}
+            {{- if .Values.auth.enabled }}
+            {{- $defaultUser := get (.Values.auth.aclUsers | default dict) "default" | default dict }}
+            {{- $hasInlineDefaultPassword := hasKey $defaultUser "password" }}
+            {{- if .Values.auth.usersExistingSecret }}
+            - name: REDIS_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: {{ .Values.auth.usersExistingSecret }}
+                  key: {{ $defaultUser.passwordKey | default "default" }}
+            {{- else if $hasInlineDefaultPassword }}
+            - name: REDIS_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: {{ include "valkey.fullname" . }}-auth
+                  key: default-password
+            {{- end }}
+            {{- end }}
+            {{- range $key, $val := .Values.metrics.exporter.extraEnvs }}
+            - name: {{ $key }}
+              value: "{{ $val }}"
+            {{- end }}
+        {{- end }}
+        {{- with .Values.extraContainers }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+      terminationGracePeriodSeconds: {{ .Values.cluster.terminationGracePeriodSeconds }}
+      volumes:
+        - name: scripts
+          configMap:
+            name: {{ include "valkey.fullname" . }}-init-scripts
+            defaultMode: 0555
+        {{- if and (gt (int .Values.cluster.replicasPerShard) 0) .Values.cluster.preStopFailover.enabled }}
+        - name: cluster-script
+          configMap:
+            name: {{ include "valkey.fullname" . }}-cluster-script
+            defaultMode: 0555
+        {{- end }}
+        {{- if .Values.auth.enabled }}
+        - name: valkey-acl
+          emptyDir:
+            medium: Memory
+        {{- end }}
+        {{- if .Values.valkeyConfig }}
+        - name: valkey-config
+          configMap:
+            name: {{ include "valkey.fullname" . }}-config
+        {{- end }}
+        {{- range .Values.extraValkeySecrets }}
+        - name: {{ .name }}-valkey
+          secret:
+            secretName: {{ .name }}
+            defaultMode: {{ .defaultMode | default 0440 }}
+        {{- end }}
+        {{- if .Values.tls.enabled }}
+        - name: {{ include "valkey.fullname" . }}-tls
+          secret:
+            secretName: {{ required "An existing secret is required to enable TLS" .Values.tls.existingSecret }}
+            defaultMode: 0400
+        {{- end }}
+        {{- range .Values.extraValkeyConfigs }}
+        - name: {{ .name }}-valkey
+          configMap:
+            name: {{ .name }}
+            defaultMode: {{ .defaultMode | default 0440 }}
+        {{- end }}
+        {{- if .Values.metrics.enabled }}
+        {{- range .Values.metrics.exporter.extraExporterSecrets }}
+        - name: {{ .name }}-exporter
+          secret:
+            secretName: {{ .name }}
+            defaultMode: {{ .defaultMode | default 0440 }}
+        {{- end }}
+        {{- end }}
+        {{- if .Values.auth.enabled }}
+        {{- if .Values.auth.usersExistingSecret }}
+        - name: valkey-users-secret
+          secret:
+            secretName: {{ .Values.auth.usersExistingSecret }}
+            defaultMode: 0400
+        {{- end }}
+        {{- if or (include "valkey.hasInlinePasswords" . | eq "true") .Values.auth.aclConfig }}
+        - name: valkey-auth-secret
+          secret:
+            secretName: {{ include "valkey.fullname" . }}-auth
+            defaultMode: 0400
+        {{- end }}
+        {{- end }}
+        {{- with .Values.extraVolumes }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- with .Values.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.topologySpreadConstraints }}
+      topologySpreadConstraints:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+{{- end }}
diff --git a/valkey/templates/deploy_valkey.yaml b/valkey/templates/deploy_valkey.yaml
index 7bc9a5a8..64501320 100644
--- a/valkey/templates/deploy_valkey.yaml
+++ b/valkey/templates/deploy_valkey.yaml
@@ -1,8 +1,9 @@
-{{- if not .Values.replica.enabled }}
+{{- if not (or .Values.replica.enabled .Values.cluster.enabled) }}
 {{- $fullname := include "valkey.fullname" . }}
 {{- $storage := .Values.dataStorage }}
 {{- $createPVC := and $storage.enabled (not (empty $storage.requestedSize)) (empty $storage.persistentVolumeClaimName) }}
 {{- include "valkey.validateAuthConfig" . }}
+{{- include "valkey.validateIstioConfig" . }}
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -23,13 +24,14 @@ spec:
   template:
     metadata:
       labels:
-        {{- include "valkey.selectorLabels" . | nindent 8 }}
-        {{- with .Values.commonLabels }}
-        {{- toYaml . | nindent 8 }}
-        {{- end }}
-        {{- with .Values.podLabels }}
-        {{- toYaml . | nindent 8 }}
-        {{- end }}
+        {{- /*
+        Single merged label set: selector + commonLabels + chart-computed
+        mesh labels + user podLabels (user wins on collision). Keeps the
+        rendered YAML free of duplicate keys when e.g. a user sets
+        sidecar.istio.io/inject=false via podLabels alongside
+        istio.enabled=true.
+        */}}
+        {{- include "valkey.podLabels" . | nindent 8 }}
       annotations:
         {{- with .Values.podAnnotations }}
         {{- toYaml . | nindent 8 }}
@@ -69,10 +71,6 @@ spec:
               mountPath: /usr/local/etc/valkey/valkey.conf
               subPath: valkey.conf
             {{- end }}
-            {{- if .Values.extraSecretValkeyConfigs }}
-            - name: extravalkeyconfigs-volume
-              mountPath: /extravalkeyconfigs
-            {{- end }}
             {{- if .Values.auth.enabled }}
             - name: valkey-acl
               mountPath: /etc/valkey
@@ -116,20 +114,26 @@ spec:
             - name: tcp
               containerPort: {{ .Values.service.port }}
               protocol: TCP
+          {{- $strictCmd := include "valkey.probeShellCommand" (dict "ctx" . "acceptLoading" false) }}
+          {{- $loadCmd   := include "valkey.probeShellCommand" (dict "ctx" . "acceptLoading" true)  }}
           startupProbe:
             exec:
-              {{- if .Values.tls.enabled }}
-              command: [ "sh", "-c", "valkey-cli --cacert /tls/{{ .Values.tls.caPublicKey }} --tls ping" ]
-              {{- else }}
-              command: [ "sh", "-c", "valkey-cli ping" ]
-              {{- end }}
+              command: [ "sh", "-c", {{ $strictCmd | quote }} ]
+            periodSeconds: {{ .Values.startupProbe.periodSeconds }}
+            timeoutSeconds: {{ .Values.startupProbe.timeoutSeconds }}
+            failureThreshold: {{ .Values.startupProbe.failureThreshold }}
           livenessProbe:
             exec:
-              {{- if .Values.tls.enabled }}
-              command: [ "sh", "-c", "valkey-cli --cacert /tls/{{ .Values.tls.caPublicKey }} --tls ping" ]
-              {{- else }}
-              command: [ "sh", "-c", "valkey-cli ping" ]
-              {{- end }}
+              command: [ "sh", "-c", {{ $loadCmd | quote }} ]
+            periodSeconds: {{ .Values.livenessProbe.periodSeconds }}
+            timeoutSeconds: {{ .Values.livenessProbe.timeoutSeconds }}
+            failureThreshold: {{ .Values.livenessProbe.failureThreshold }}
+          readinessProbe:
+            exec:
+              command: [ "sh", "-c", {{ $strictCmd | quote }} ]
+            periodSeconds: {{ .Values.readinessProbe.periodSeconds }}
+            timeoutSeconds: {{ .Values.readinessProbe.timeoutSeconds }}
+            failureThreshold: {{ .Values.readinessProbe.failureThreshold }}
           resources:
             {{- toYaml .Values.resources | nindent 12 }}
           volumeMounts:
@@ -198,18 +202,21 @@ spec:
             - name: REDIS_ALIAS
               value: {{ include "valkey.fullname" . }}
             {{- if .Values.auth.enabled }}
+            {{- $defaultUser := get (.Values.auth.aclUsers | default dict) "default" | default dict }}
+            {{- $hasInlineDefaultPassword := hasKey $defaultUser "password" }}
+            {{- if .Values.auth.usersExistingSecret }}
             - name: REDIS_PASSWORD
               valueFrom:
                 secretKeyRef:
-                  {{- if .Values.auth.usersExistingSecret }}
-                  {{- $defaultUser := index .Values.auth.aclUsers "default" | default dict }}
-                  {{- $passwordKey := $defaultUser.passwordKey | default "default" }}
                   name: {{ .Values.auth.usersExistingSecret }}
-                  key: {{ $passwordKey }}
-                  {{- else }}
+                  key: {{ $defaultUser.passwordKey | default "default" }}
+            {{- else if $hasInlineDefaultPassword }}
+            - name: REDIS_PASSWORD
+              valueFrom:
+                secretKeyRef:
                   name: {{ include "valkey.fullname" . }}-auth
                   key: default-password
-                  {{- end }}
+            {{- end }}
             {{- end }}
             {{- range $key, $val := .Values.metrics.exporter.extraEnvs }}
             - name: {{ $key }}
diff --git a/valkey/templates/init_config.yaml b/valkey/templates/init_config.yaml
index 9b0337e5..4d11e327 100644
--- a/valkey/templates/init_config.yaml
+++ b/valkey/templates/init_config.yaml
@@ -53,7 +53,10 @@ data:
       fi
       {{- end }}
 
-      echo "$password"
+      # printf is byte-safe; dash's `echo` quietly interprets backslash
+      # escapes (\b, \t, \\, etc.), corrupting any password that contains a
+      # backslash before it's hashed into the ACL.
+      printf '%s' "$password"
     }
     {{- end }}
 
@@ -123,8 +126,9 @@ data:
     # User: {{ $username }}
     PASSWORD=$(get_user_password "{{ $username }}" "{{ $passwordKey }}") || exit 1
 
-    # Hash the password and write ACL entry
-    PASSHASH=$(echo -n "$PASSWORD" | sha256sum | cut -f 1 -d " ")
+    # Hash the password and write ACL entry. printf (not echo -n) is POSIX —
+    # echo -n is implementation-defined and quietly emits `-n\n` under some shells.
+    PASSHASH=$(printf '%s' "$PASSWORD" | sha256sum | cut -f 1 -d " ")
     echo "user {{ $username }} on #$PASSHASH {{ $user.permissions }}" >> /etc/valkey/users.acl
 
     {{- end }}
@@ -190,8 +194,10 @@ data:
       {{- $replPasswordKey := $replUser.passwordKey | default $replUsername }}
       REPL_PASSWORD=$(get_user_password "{{ $replUsername }}" "{{ $replPasswordKey }}") || exit 1
 
-      # Write masterauth configuration
-      echo "masterauth $REPL_PASSWORD" >>"$VALKEY_CONFIG"
+      # Write masterauth configuration. Quote + backslash-escape so passwords
+      # containing quotes/backslashes survive valkey.conf parsing.
+      REPL_PASSWORD_ESC=$(printf '%s' "$REPL_PASSWORD" | sed 's/\\/\\\\/g; s/"/\\"/g')
+      printf 'masterauth "%s"\n' "$REPL_PASSWORD_ESC" >>"$VALKEY_CONFIG"
       echo "masteruser {{ $replUsername }}" >>"$VALKEY_CONFIG"
       log "Configured masterauth with user {{ $replUsername }}"
       {{- end }}
@@ -219,13 +225,157 @@ data:
     {{- end }}
     {{- end }}
 
+    {{- if .Values.cluster.enabled }}
+    # Cluster mode configuration
+    log "Configuring cluster mode"
+
+    # Use POD_INDEX from Kubernetes metadata
+    POD_INDEX=${POD_INDEX:-0}
+
+    # Configure cluster-enabled settings
+    {
+      echo ""
+      echo "# Cluster Configuration"
+      echo "cluster-enabled yes"
+      echo "cluster-config-file /data/nodes.conf"
+      echo "cluster-node-timeout {{ .Values.cluster.nodeTimeout }}"
+      {{- if not .Values.cluster.requireFullCoverage }}
+      echo "cluster-require-full-coverage no"
+      {{- end }}
+      {{- if .Values.cluster.allowReadsWhenDown }}
+      echo "cluster-allow-reads-when-down yes"
+      {{- end }}
+      echo ""
+      echo "# Cluster node announcement"
+      echo "cluster-announce-hostname {{ include "valkey.fullname" . }}-$POD_INDEX.{{ include "valkey.headlessServiceName" . }}.{{ .Release.Namespace }}.svc.{{ .Values.clusterDomain }}"
+      echo "cluster-announce-port {{ .Values.service.port }}"
+      echo "cluster-announce-bus-port {{ .Values.cluster.busPort }}"
+      echo "cluster-preferred-endpoint-type hostname"
+    } >>"$VALKEY_CONFIG"
+
+    log "Cluster node $POD_INDEX configured with announce hostname"
+
+    {{- if .Values.auth.enabled }}
+    # Configure cluster authentication
+    {{- $replUsername := .Values.cluster.replicationUser }}
+    {{- $replUser := index .Values.auth.aclUsers $replUsername }}
+    {{- $replPasswordKey := $replUser.passwordKey | default $replUsername }}
+    REPL_PASSWORD=$(get_user_password "{{ $replUsername }}" "{{ $replPasswordKey }}") || exit 1
+    REPL_PASSWORD_ESC=$(printf '%s' "$REPL_PASSWORD" | sed 's/\\/\\\\/g; s/"/\\"/g')
+
+    {
+      echo ""
+      echo "# Cluster authentication"
+      printf 'masterauth "%s"\n' "$REPL_PASSWORD_ESC"
+      echo "masteruser {{ $replUsername }}"
+    } >>"$VALKEY_CONFIG"
+    log "Configured cluster authentication with user {{ $replUsername }}"
+    {{- end }}
+
+    {{- if .Values.tls.enabled }}
+    # TLS for cluster
+    {
+      echo ""
+      echo "# TLS for cluster"
+      echo "tls-replication yes"
+      echo "tls-cluster yes"
+    } >>"$VALKEY_CONFIG"
+    log "Enabled TLS for cluster communication"
+    {{- end }}
+
+    # ----------------------------------------------------------------------
+    # Refresh stale IPs in /data/nodes.conf before valkey-server starts.
+    #
+    # Why: cluster bus gossip (port 16379) is dialled by raw IP, even when
+    # cluster-preferred-endpoint-type=hostname (the hostname is announced
+    # over the bus, not used to establish it). After a rolling restart pod
+    # IPs change; if the first pod we restart on a given node also took
+    # the longest to load its RDB, by the time it comes back ALL its peers
+    # have new IPs and its own nodes.conf has none of them. The pod is
+    # then a stranded minority partition and never recovers without
+    # operator intervention.
+    #
+    # Fix: on every cluster pod start, re-resolve each peer's announced
+    # FQDN (already on-disk in nodes.conf as the second comma-separated
+    # token of field 2) and rewrite the IP in place. Valkey reads
+    # nodes.conf at startup and uses those IPs as its initial gossip
+    # targets — fresh IPs in, fresh gossip out, no stranded pod.
+    #
+    # No-ops cleanly when:
+    #   * nodes.conf doesn't exist (first boot — there's nothing to refresh,
+    #     and CLUSTER MEET from cluster-init-script.sh will populate it);
+    #   * a peer's FQDN doesn't resolve (peer is also mid-restart) — we
+    #     leave that line as-is and let Valkey's normal retry/gossip
+    #     reconcile it once the peer's pod IP shows up in DNS.
+    # ----------------------------------------------------------------------
+    NODES_CONF=/data/nodes.conf
+    if [ -f "$NODES_CONF" ]; then
+      log "Refreshing IPs in $NODES_CONF against current DNS"
+      # Write the temp file in /data (the PVC) rather than $TMPDIR — the
+      # init container runs with readOnlyRootFilesystem=true, which
+      # leaves /tmp read-only. /data is the only RW mount we have, and
+      # it's the same filesystem as the destination so the final mv is
+      # atomic (rename(2) within one mount point).
+      TMP=$(mktemp /data/nodes.conf.XXXXXX)
+      changed=0
+      kept=0
+      missing=0
+      # Read line-by-line. Format per line:
+      #   <id> <ip:port@busport>,<fqdn>[,k=v,...] flags ... [slots ...]
+      # The 'vars' line at EOF has no comma so we pass it through unchanged.
+      while IFS= read -r line || [ -n "$line" ]; do
+        case "$line" in
+          ''|vars\ *)
+            printf '%s\n' "$line" >>"$TMP"
+            continue
+            ;;
+        esac
+        # Field 2 is endpoint+metadata, field 1 is node id.
+        endpoint=$(printf '%s' "$line" | awk '{print $2}')
+        # Skip lines we can't parse (defensive — preserve verbatim).
+        case "$endpoint" in
+          *,*) ;;
+          *) printf '%s\n' "$line" >>"$TMP"; continue ;;
+        esac
+        # Split out addr + announced FQDN; cut "ip:port@busport" at the LAST colon (IPv6-safe).
+        addr=${endpoint%%,*}
+        rest=${endpoint#*,}
+        fqdn=${rest%%,*}
+        old_ip=${addr%:*}
+        port_and_bus=${addr##*:}
+        # Skip if the announced FQDN looks empty (older nodes.conf shapes).
+        if [ -z "$fqdn" ]; then
+          printf '%s\n' "$line" >>"$TMP"; continue
+        fi
+        new_ip=$(getent hosts "$fqdn" 2>/dev/null | awk '{print $1; exit}')
+        if [ -z "$new_ip" ]; then
+          missing=$(( missing + 1 ))
+          printf '%s\n' "$line" >>"$TMP"
+          continue
+        fi
+        if [ "$new_ip" = "$old_ip" ]; then
+          kept=$(( kept + 1 ))
+          printf '%s\n' "$line" >>"$TMP"
+          continue
+        fi
+        # Rewrite the endpoint token in field 2; everything else verbatim.
+        new_endpoint="${new_ip}:${port_and_bus},${rest}"
+        # Replace ONLY the first whitespace-separated token after the ID.
+        # Using awk to avoid sed quoting/regex hazards when fqdn contains dots.
+        printf '%s\n' "$line" | awk -v new="$new_endpoint" '{$2 = new; print}' >>"$TMP"
+        changed=$(( changed + 1 ))
+      done <"$NODES_CONF"
+      # Atomic swap so a kill mid-rewrite can't corrupt nodes.conf.
+      mv "$TMP" "$NODES_CONF"
+      log "nodes.conf refresh: ${changed} updated, ${kept} unchanged, ${missing} unresolved"
+    else
+      log "$NODES_CONF absent — first boot, nothing to refresh"
+    fi
+    {{- end }}
+
     # Append extra configs if present
     if [ -f /usr/local/etc/valkey/valkey.conf ]; then
       log "Appending /usr/local/etc/valkey/valkey.conf"
       cat /usr/local/etc/valkey/valkey.conf >>"$VALKEY_CONFIG"
     fi
-    if [ -d /extravalkeyconfigs ]; then
-      log "Appending files in /extravalkeyconfigs/"
-      cat /extravalkeyconfigs/* >>"$VALKEY_CONFIG"
-    fi
 
diff --git a/valkey/templates/istio-authorization-policy.yaml b/valkey/templates/istio-authorization-policy.yaml
new file mode 100644
index 00000000..4b983ded
--- /dev/null
+++ b/valkey/templates/istio-authorization-policy.yaml
@@ -0,0 +1,82 @@
+{{- /*
+AuthorizationPolicy for the cluster-bus port.
+
+Valkey's CLUSTER MEET has no authentication of its own: a pod that can open
+a TCP connection to a node's bus port can merge into the cluster. The chart
+already ships a NetworkPolicy that pins the bus port to same-release pods by
+IP (cluster-isolation-netpol.yaml), but that only works on a CNI that
+enforces NetworkPolicy.
+
+An Istio AuthorizationPolicy is the belt-and-braces: it matches on SPIFFE
+principal (the caller's ServiceAccount identity), not IP, so a pod that
+spoofs its way onto the right IP range still fails the check. It also works
+regardless of CNI — the enforcement point is the sidecar Envoy (sidecar
+mode) or the node-local ztunnel (ambient mode), both of which terminate
+mTLS and have the peer's identity.
+
+Rendered only in cluster mode — no bus port to protect otherwise.
+
+Both L4 enforcement points (Envoy in sidecar mode, ztunnel in ambient mode)
+enforce ALLOW/DENY on principal+port, so a single policy shape works for both
+modes. Ambient's ztunnel does NOT enforce L7 rules (HTTP method, path, etc.) —
+those need a waypoint — but we only need L4 here.
+*/}}
+{{- if and .Values.istio.enabled .Values.istio.authorizationPolicy.enabled .Values.cluster.enabled }}
+{{- include "valkey.validateIstioConfig" . }}
+apiVersion: security.istio.io/v1
+kind: AuthorizationPolicy
+metadata:
+  name: {{ include "valkey.fullname" . }}-cluster-bus
+  labels:
+    {{- include "valkey.labels" . | nindent 4 }}
+    {{- with .Values.istio.authorizationPolicy.labels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+  {{- with .Values.istio.authorizationPolicy.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  selector:
+    matchLabels:
+      {{- include "valkey.selectorLabels" . | nindent 6 }}
+  action: ALLOW
+  rules:
+    # Same-release pods (identified by SPIFFE principal) may reach the bus port.
+    - from:
+        - source:
+            principals:
+              - {{ include "valkey.istioPrincipal" . | quote }}
+      to:
+        - operation:
+            ports:
+              - {{ .Values.cluster.busPort | quote }}
+    # Client and metrics ports stay wide open at this layer — they have
+    # their own auth (ACL, TLS) above the mesh. A separate deny rule on the
+    # bus port is unnecessary: this policy is ALLOW-only, and because at
+    # least one AuthorizationPolicy now targets these pods, Istio applies
+    # default-deny to anything not matched — i.e. the bus port for
+    # non-same-release principals.
+    #
+    # Istio-managed ports (15020 merged-stats, 15021 Envoy readiness, 15090
+    # Envoy admin) are intentionally NOT listed:
+    #   sidecar mode — Istio auto-excludes these via iptables so they never
+    #                  hit Envoy's authz stack; the AP has no bearing on
+    #                  them. Verified on a live kind+Istio 1.29 install:
+    #                  15021/15090 reachable from in-mesh pods without an
+    #                  explicit allow rule; 15020 is bound to pilot-agent
+    #                  outside Envoy's path.
+    #   ambient mode — no Envoy exists, so none of these ports have analogues
+    #                  (ztunnel metrics live on the NODE, not the pod).
+    # Prometheus scrapes this chart via the shipped Service/PodMonitor on
+    # the app-level 9121 port, so this is the only port Prometheus cares
+    # about here. Scrapers that rely on Istio's Envoy-merged 15020 path
+    # hit pilot-agent directly and aren't gated by this AP.
+    - to:
+        - operation:
+            ports:
+              - {{ .Values.service.port | quote }}
+    {{- if .Values.metrics.enabled }}
+              - {{ .Values.metrics.exporter.port | quote }}
+    {{- end }}
+{{- end }}
diff --git a/valkey/templates/istio-destination-rule.yaml b/valkey/templates/istio-destination-rule.yaml
new file mode 100644
index 00000000..88e7f6f2
--- /dev/null
+++ b/valkey/templates/istio-destination-rule.yaml
@@ -0,0 +1,50 @@
+{{- /*
+DestinationRule wraps outbound connections in ISTIO_MUTUAL mTLS. This is a
+sidecar-mode concept — an outbound Envoy sees the DR and upgrades the TLS.
+In ambient mode the ztunnel already wraps every pod-to-pod hop in HBONE mTLS
+transparently, so a DR on top would layer a second mTLS (Envoy-in-ztunnel)
+— double crypto for no security gain, and it requires a waypoint proxy to
+even take effect. Skip it.
+*/}}
+{{- if and .Values.istio.enabled (eq .Values.istio.mode "sidecar") }}
+{{- include "valkey.validateIstioConfig" . }}
+apiVersion: networking.istio.io/v1
+kind: DestinationRule
+metadata:
+  name: {{ include "valkey.fullname" . }}
+  labels:
+    {{- include "valkey.labels" . | nindent 4 }}
+    {{- with .Values.istio.destinationRule.labels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+  {{- with .Values.istio.destinationRule.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  host: {{ include "valkey.fullname" . }}.{{ .Release.Namespace }}.svc.{{ .Values.clusterDomain }}
+  trafficPolicy:
+    tls:
+      mode: {{ .Values.istio.destinationRule.mode }}
+{{- if or .Values.replica.enabled .Values.cluster.enabled }}
+---
+apiVersion: networking.istio.io/v1
+kind: DestinationRule
+metadata:
+  name: {{ include "valkey.headlessServiceName" . }}
+  labels:
+    {{- include "valkey.labels" . | nindent 4 }}
+    {{- with .Values.istio.destinationRule.labels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+  {{- with .Values.istio.destinationRule.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  host: {{ include "valkey.headlessServiceName" . }}.{{ .Release.Namespace }}.svc.{{ .Values.clusterDomain }}
+  trafficPolicy:
+    tls:
+      mode: {{ .Values.istio.destinationRule.mode }}
+{{- end }}
+{{- end }}
diff --git a/valkey/templates/istio-peer-authentication.yaml b/valkey/templates/istio-peer-authentication.yaml
new file mode 100644
index 00000000..d04670f3
--- /dev/null
+++ b/valkey/templates/istio-peer-authentication.yaml
@@ -0,0 +1,26 @@
+{{- /*
+PeerAuthentication applies in both sidecar and ambient mode — Envoy enforces
+in sidecar, ztunnel enforces in ambient. The CRD shape is the same for both.
+*/}}
+{{- if .Values.istio.enabled }}
+{{- include "valkey.validateIstioConfig" . }}
+apiVersion: security.istio.io/v1
+kind: PeerAuthentication
+metadata:
+  name: {{ include "valkey.fullname" . }}
+  labels:
+    {{- include "valkey.labels" . | nindent 4 }}
+    {{- with .Values.istio.peerAuthentication.labels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+  {{- with .Values.istio.peerAuthentication.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  selector:
+    matchLabels:
+      {{- include "valkey.selectorLabels" . | nindent 6 }}
+  mtls:
+    mode: {{ .Values.istio.peerAuthentication.mode }}
+{{- end }}
diff --git a/valkey/templates/netpolicy.yaml b/valkey/templates/netpolicy.yaml
index f65c504d..a4272636 100644
--- a/valkey/templates/netpolicy.yaml
+++ b/valkey/templates/netpolicy.yaml
@@ -1,4 +1,11 @@
 {{- with .Values.networkPolicy }}
+{{- /*
+Gate on `hasKey` rather than truthiness: an empty list still counts as
+the user declaring a policy (e.g. `ingress: []` for default-deny).
+With a truthiness check, an empty list would instead render a NetworkPolicy
+with an empty `policyTypes`, which the API server accepts but does nothing useful.
+*/}}
+{{- if or (hasKey . "ingress") (hasKey . "egress") }}
 apiVersion: networking.k8s.io/v1
 kind: NetworkPolicy
 metadata:
@@ -16,18 +23,25 @@ spec:
     matchLabels:
       {{- include "valkey.selectorLabels" $ | nindent 6 }}
   policyTypes:
-  {{- if .ingress }}
+  {{- if hasKey . "ingress" }}
     - Ingress
   {{- end }}
-  {{- if .egress }}
+  {{- if hasKey . "egress" }}
     - Egress
   {{- end }}
-  {{- with .ingress }}
+  {{- if hasKey . "ingress" }}
   ingress:
-    {{- toYaml . | nindent 4 }}
+    {{- if .ingress }}
+    {{- toYaml .ingress | nindent 4 }}
+    {{- else }} []
+    {{- end }}
   {{- end }}
-  {{- with .egress }}
+  {{- if hasKey . "egress" }}
   egress:
-    {{- toYaml . | nindent 4 }}
+    {{- if .egress }}
+    {{- toYaml .egress | nindent 4 }}
+    {{- else }} []
+    {{- end }}
   {{- end }}
 {{- end }}
+{{- end }}
diff --git a/valkey/templates/poddisruptionbudget.yaml b/valkey/templates/poddisruptionbudget.yaml
index ff123525..00430f71 100644
--- a/valkey/templates/poddisruptionbudget.yaml
+++ b/valkey/templates/poddisruptionbudget.yaml
@@ -1,4 +1,4 @@
-{{- if and .Values.podDisruptionBudget.enabled .Values.replica.enabled }}
+{{- if and .Values.podDisruptionBudget.enabled (or .Values.replica.enabled .Values.cluster.enabled) }}
 apiVersion: policy/v1
 kind: PodDisruptionBudget
 metadata:
diff --git a/valkey/templates/pvc.yaml b/valkey/templates/pvc.yaml
index aa20859b..9f25edf8 100644
--- a/valkey/templates/pvc.yaml
+++ b/valkey/templates/pvc.yaml
@@ -1,4 +1,4 @@
-{{- if and .Values.dataStorage.enabled (not .Values.replica.enabled) (not (empty .Values.dataStorage.requestedSize)) (empty .Values.dataStorage.persistentVolumeClaimName) }}
+{{- if and .Values.dataStorage.enabled (not .Values.replica.enabled) (not .Values.cluster.enabled) (not (empty .Values.dataStorage.requestedSize)) (empty .Values.dataStorage.persistentVolumeClaimName) }}
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
diff --git a/valkey/templates/service-headless.yaml b/valkey/templates/service-headless.yaml
index 733ca683..796ccd90 100644
--- a/valkey/templates/service-headless.yaml
+++ b/valkey/templates/service-headless.yaml
@@ -1,4 +1,4 @@
-{{- if .Values.replica.enabled }}
+{{- if or .Values.replica.enabled .Values.cluster.enabled }}
 apiVersion: v1
 kind: Service
 metadata:
@@ -15,6 +15,12 @@ spec:
       port: {{ .Values.service.port }}
       targetPort: tcp
       protocol: TCP
+    {{- if .Values.cluster.enabled }}
+    - name: tcp-bus
+      port: {{ .Values.cluster.busPort }}
+      targetPort: tcp-bus
+      protocol: TCP
+    {{- end }}
   selector:
     {{- include "valkey.selectorLabels" . | nindent 4 }}
 {{- end }}
diff --git a/valkey/templates/service-read.yaml b/valkey/templates/service-read.yaml
index 49ec54e7..de84466d 100644
--- a/valkey/templates/service-read.yaml
+++ b/valkey/templates/service-read.yaml
@@ -18,8 +18,9 @@ spec:
   {{- if .Values.replica.service.loadBalancerClass }}
   loadBalancerClass: {{ .Values.replica.service.loadBalancerClass }}
   {{- end }}
-  {{- if .Values.replica.service.loadBalancerSourceRanges }}
-  loadBalancerSourceRanges: {{ .Values.replica.service.loadBalancerSourceRanges }}
+  {{- with .Values.replica.service.loadBalancerSourceRanges }}
+  loadBalancerSourceRanges:
+    {{- toYaml . | nindent 4 }}
   {{- end }}
   ports:
     - name: tcp
diff --git a/valkey/templates/service.yaml b/valkey/templates/service.yaml
index 1e786826..353375c2 100644
--- a/valkey/templates/service.yaml
+++ b/valkey/templates/service.yaml
@@ -17,8 +17,9 @@ spec:
   {{- if .Values.service.loadBalancerClass }}
   loadBalancerClass: {{ .Values.service.loadBalancerClass }}
   {{- end }}
-  {{- if .Values.service.loadBalancerSourceRanges }}
-  loadBalancerSourceRanges: {{ .Values.service.loadBalancerSourceRanges }}
+  {{- with .Values.service.loadBalancerSourceRanges }}
+  loadBalancerSourceRanges:
+    {{- toYaml . | nindent 4 }}
   {{- end }}
   ports:
     - port: {{ .Values.service.port }}
@@ -36,3 +37,4 @@ spec:
     {{- if .Values.replica.enabled }}
     statefulset.kubernetes.io/pod-name: {{ include "valkey.fullname" . }}-0
     {{- end }}
+    {{- /* In cluster mode, the service routes to all nodes; clients handle redirections */}}
diff --git a/valkey/templates/statefulset.yaml b/valkey/templates/statefulset.yaml
index 4a8d4caa..b7bd1ff6 100644
--- a/valkey/templates/statefulset.yaml
+++ b/valkey/templates/statefulset.yaml
@@ -2,6 +2,7 @@
 {{- include "valkey.validateAuthConfig" . }}
 {{- include "valkey.validateReplicaPersistence" . }}
 {{- include "valkey.validateReplicaAuth" . }}
+{{- include "valkey.validateIstioConfig" . }}
 apiVersion: apps/v1
 kind: StatefulSet
 metadata:
@@ -37,13 +38,14 @@ spec:
   template:
     metadata:
       labels:
-        {{- include "valkey.selectorLabels" . | nindent 8 }}
-        {{- with .Values.commonLabels }}
-        {{- toYaml . | nindent 8 }}
-        {{- end }}
-        {{- with .Values.podLabels }}
-        {{- toYaml . | nindent 8 }}
-        {{- end }}
+        {{- /*
+        Single merged label set: selector + commonLabels + chart-computed
+        mesh labels + user podLabels (user wins on collision). Keeps the
+        rendered YAML free of duplicate keys when e.g. a user sets
+        sidecar.istio.io/inject=false via podLabels alongside
+        istio.enabled=true.
+        */}}
+        {{- include "valkey.podLabels" . | nindent 8 }}
       annotations:
       {{- with .Values.podAnnotations }}
         {{- toYaml . | nindent 8 }}
@@ -85,10 +87,6 @@ spec:
               mountPath: /usr/local/etc/valkey/valkey.conf
               subPath: valkey.conf
             {{- end }}
-            {{- if .Values.extraSecretValkeyConfigs }}
-            - name: extravalkeyconfigs-volume
-              mountPath: /extravalkeyconfigs
-            {{- end }}
             {{- if .Values.auth.enabled }}
             - name: valkey-acl
               mountPath: /etc/valkey
@@ -119,10 +117,6 @@ spec:
           securityContext:
             {{- toYaml .Values.securityContext | nindent 12 }}
           env:
-            - name: POD_INDEX
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.labels['apps.kubernetes.io/pod-index']
             {{- range $key, $val := .Values.env }}
             - name: {{ $key }}
               value: "{{ $val }}"
@@ -133,20 +127,26 @@ spec:
             - name: tcp
               containerPort: {{ .Values.service.port }}
               protocol: TCP
+          {{- $strictCmd := include "valkey.probeShellCommand" (dict "ctx" . "acceptLoading" false) }}
+          {{- $loadCmd   := include "valkey.probeShellCommand" (dict "ctx" . "acceptLoading" true)  }}
           startupProbe:
             exec:
-              {{- if .Values.tls.enabled }}
-              command: [ "sh", "-c", "valkey-cli --cacert /tls/{{ .Values.tls.caPublicKey }} --tls ping" ]
-              {{- else }}
-              command: [ "sh", "-c", "valkey-cli ping" ]
-              {{- end }}
+              command: [ "sh", "-c", {{ $strictCmd | quote }} ]
+            periodSeconds: {{ .Values.replica.startupProbe.periodSeconds }}
+            timeoutSeconds: {{ .Values.replica.startupProbe.timeoutSeconds }}
+            failureThreshold: {{ .Values.replica.startupProbe.failureThreshold }}
           livenessProbe:
             exec:
-              {{- if .Values.tls.enabled }}
-              command: [ "sh", "-c", "valkey-cli --cacert /tls/{{ .Values.tls.caPublicKey }} --tls ping" ]
-              {{- else }}
-              command: [ "sh", "-c", "valkey-cli ping" ]
-              {{- end }}
+              command: [ "sh", "-c", {{ $loadCmd | quote }} ]
+            periodSeconds: {{ .Values.replica.livenessProbe.periodSeconds }}
+            timeoutSeconds: {{ .Values.replica.livenessProbe.timeoutSeconds }}
+            failureThreshold: {{ .Values.replica.livenessProbe.failureThreshold }}
+          readinessProbe:
+            exec:
+              command: [ "sh", "-c", {{ $strictCmd | quote }} ]
+            periodSeconds: {{ .Values.replica.readinessProbe.periodSeconds }}
+            timeoutSeconds: {{ .Values.replica.readinessProbe.timeoutSeconds }}
+            failureThreshold: {{ .Values.replica.readinessProbe.failureThreshold }}
           resources:
             {{- toYaml .Values.resources | nindent 12 }}
           volumeMounts:
@@ -212,18 +212,21 @@ spec:
             - name: REDIS_ALIAS
               value: {{ include "valkey.fullname" . }}
             {{- if .Values.auth.enabled }}
+            {{- $defaultUser := get (.Values.auth.aclUsers | default dict) "default" | default dict }}
+            {{- $hasInlineDefaultPassword := hasKey $defaultUser "password" }}
+            {{- if .Values.auth.usersExistingSecret }}
             - name: REDIS_PASSWORD
               valueFrom:
                 secretKeyRef:
-                  {{- if .Values.auth.usersExistingSecret }}
-                  {{- $defaultUser := index .Values.auth.aclUsers "default" | default dict }}
-                  {{- $passwordKey := $defaultUser.passwordKey | default "default" }}
                   name: {{ .Values.auth.usersExistingSecret }}
-                  key: {{ $passwordKey }}
-                  {{- else }}
+                  key: {{ $defaultUser.passwordKey | default "default" }}
+            {{- else if $hasInlineDefaultPassword }}
+            - name: REDIS_PASSWORD
+              valueFrom:
+                secretKeyRef:
                   name: {{ include "valkey.fullname" . }}-auth
                   key: default-password
-                  {{- end }}
+            {{- end }}
             {{- end }}
             {{- range $key, $val := .Values.metrics.exporter.extraEnvs }}
             - name: {{ $key }}
diff --git a/valkey/templates/tests/auth.yaml b/valkey/templates/tests/auth.yaml
index b289bb98..833d365a 100644
--- a/valkey/templates/tests/auth.yaml
+++ b/valkey/templates/tests/auth.yaml
@@ -19,9 +19,13 @@ metadata:
     "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
 spec:
   restartPolicy: Never
+  securityContext:
+    {{- toYaml .Values.podSecurityContext | nindent 4 }}
   containers:
     - name: test-auth
       image: {{ include "valkey.image" . | quote }}
+      securityContext:
+        {{- toYaml .Values.securityContext | nindent 8 }}
       command:
         - sh
         - -c
@@ -95,9 +99,13 @@ metadata:
     "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
 spec:
   restartPolicy: Never
+  securityContext:
+    {{- toYaml .Values.podSecurityContext | nindent 4 }}
   containers:
     - name: test-auth
       image: {{ include "valkey.image" . | quote }}
+      securityContext:
+        {{- toYaml .Values.securityContext | nindent 8 }}
       command:
         - sh
         - -c
diff --git a/valkey/tests/cluster_isolation_netpol_test.yaml b/valkey/tests/cluster_isolation_netpol_test.yaml
new file mode 100644
index 00000000..52aa8903
--- /dev/null
+++ b/valkey/tests/cluster_isolation_netpol_test.yaml
@@ -0,0 +1,167 @@
+suite: cluster isolation network policy
+templates:
+  - templates/cluster-isolation-netpol.yaml
+tests:
+  - it: should not render when cluster mode is disabled
+    set:
+      cluster.enabled: false
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should not render in replica mode
+    set:
+      replica.enabled: true
+      replica.persistence.size: "5Gi"
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should not render when isolation is explicitly disabled
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      cluster.isolation.enabled: false
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should render by default in cluster mode
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    asserts:
+      - isKind:
+          of: NetworkPolicy
+      - equal:
+          path: metadata.name
+          value: RELEASE-NAME-valkey-cluster-isolation
+
+  - it: should select only pods of this release
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    asserts:
+      - equal:
+          path: spec.podSelector.matchLabels["app.kubernetes.io/name"]
+          value: valkey
+      - equal:
+          path: spec.podSelector.matchLabels["app.kubernetes.io/instance"]
+          value: RELEASE-NAME
+
+  # Egress is intentionally NOT restricted here — locking it down breaks
+  # Istio sidecar bootstrap (xDS to istiod) and any DNS-heavy flow. Users
+  # who want egress isolation should add their own NetworkPolicy on top.
+  - it: should restrict ingress only
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    asserts:
+      - equal:
+          path: spec.policyTypes
+          value:
+            - Ingress
+      - notExists:
+          path: spec.egress
+
+  # The core guarantee: the bus port inbound is scoped to same-instance pods.
+  - it: bus port ingress must be scoped to same-release pods
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      cluster.busPort: 16379
+    asserts:
+      - contains:
+          path: spec.ingress
+          content:
+            from:
+              - podSelector:
+                  matchLabels:
+                    app.kubernetes.io/name: valkey
+                    app.kubernetes.io/instance: RELEASE-NAME
+            ports:
+              - protocol: TCP
+                port: 16379
+
+  # The client port must NOT be scoped — arbitrary clients need to reach it.
+  # If a future change accidentally restricts it to same-release pods, every
+  # client outside the chart will lose access.
+  - it: client port ingress must not require the same-release selector
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    asserts:
+      - contains:
+          path: spec.ingress
+          content:
+            ports:
+              - protocol: TCP
+                port: 6379
+
+  - it: should include metrics port ingress only when metrics enabled
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      metrics.enabled: true
+      metrics.exporter.port: 9121
+    asserts:
+      - contains:
+          path: spec.ingress
+          content:
+            ports:
+              - protocol: TCP
+                port: 9121
+
+  - it: should not include metrics port ingress when metrics disabled
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      metrics.enabled: false
+    asserts:
+      - notContains:
+          path: spec.ingress
+          content:
+            ports:
+              - protocol: TCP
+                port: 9121
+
+  # --- Istio ambient mesh interaction ---
+  # In ambient mode, ztunnel wraps all pod-to-pod hops in HBONE (port 15008)
+  # then re-delivers to the pod-local port. A NetworkPolicy that only lists
+  # 6379/16379/9121 drops the inbound HBONE and every connection breaks with
+  # "Connection reset by peer". The chart-owned AuthorizationPolicy gives
+  # equivalent (and cryptographically stronger) isolation at the ztunnel
+  # layer, so we render NO NetworkPolicy when ambient is on. Bus-port
+  # protection still exists — just at a different layer.
+  - it: should not render in ambient mode
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      istio.enabled: true
+      istio.mode: ambient
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should still render in sidecar mode (Envoy's iptables capture is
+      per-pod and leaves the chart's pod-selector-based netpol correct)
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      istio.enabled: true
+      istio.mode: sidecar
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: NetworkPolicy
+
+  - it: should still render when istio is disabled
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      istio.enabled: false
+    asserts:
+      - hasDocuments:
+          count: 1
+
diff --git a/valkey/tests/cluster_test.yaml b/valkey/tests/cluster_test.yaml
new file mode 100644
index 00000000..07a2d720
--- /dev/null
+++ b/valkey/tests/cluster_test.yaml
@@ -0,0 +1,1496 @@
+suite: cluster configuration
+templates:
+  - templates/cluster-statefulset.yaml
+  - templates/cluster-script.yaml
+  - templates/cluster-init-job.yaml
+  - templates/service-headless.yaml
+  - templates/service.yaml
+  - templates/init_config.yaml
+tests:
+  # Validation tests
+  - it: should fail when cluster enabled but no persistence size provided
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: ""
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - failedTemplate:
+          errorPattern: "Cluster mode requires persistent storage.*"
+
+  - it: should fail when cluster enabled with less than 3 shards
+    set:
+      cluster.enabled: true
+      cluster.shards: 2
+      cluster.persistence.size: "5Gi"
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - failedTemplate:
+          errorPattern: "Cluster mode requires at least 3 shards.*"
+
+  - it: should fail when both cluster and replica are enabled
+    set:
+      cluster.enabled: true
+      replica.enabled: true
+      cluster.persistence.size: "5Gi"
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - failedTemplate:
+          errorPattern: "cluster.enabled and replica.enabled are mutually exclusive.*"
+
+  # StatefulSet tests
+  - it: should create StatefulSet when cluster is enabled
+    set:
+      cluster.enabled: true
+      cluster.shards: 3
+      cluster.replicasPerShard: 1
+      cluster.persistence.size: "5Gi"
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - isKind:
+          of: StatefulSet
+      - equal:
+          path: spec.replicas
+          value: 6  # 3 shards * (1 + 1 replica) = 6 nodes
+
+  - it: should create StatefulSet with 3 shards and 0 replicas (3 nodes total)
+    set:
+      cluster.enabled: true
+      cluster.shards: 3
+      cluster.replicasPerShard: 0
+      cluster.persistence.size: "5Gi"
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - isKind:
+          of: StatefulSet
+      - equal:
+          path: spec.replicas
+          value: 3
+
+  - it: should create StatefulSet with 5 shards and 2 replicas (15 nodes total)
+    set:
+      cluster.enabled: true
+      cluster.shards: 5
+      cluster.replicasPerShard: 2
+      cluster.persistence.size: "5Gi"
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - isKind:
+          of: StatefulSet
+      - equal:
+          path: spec.replicas
+          value: 15  # 5 shards * (1 + 2 replicas) = 15 nodes
+
+  - it: should use Parallel pod management policy for cluster mode
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - equal:
+          path: spec.podManagementPolicy
+          value: Parallel
+
+  - it: should configure PVC with correct storage settings
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "10Gi"
+      cluster.persistence.storageClass: "fast-ssd"
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - equal:
+          path: spec.volumeClaimTemplates[0].spec.resources.requests.storage
+          value: "10Gi"
+      - equal:
+          path: spec.volumeClaimTemplates[0].spec.storageClassName
+          value: "fast-ssd"
+
+  - it: should expose both tcp and tcp-bus ports in cluster mode
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      cluster.busPort: 16379
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].ports
+          content:
+            name: tcp
+            containerPort: 6379
+            protocol: TCP
+      - contains:
+          path: spec.template.spec.containers[0].ports
+          content:
+            name: tcp-bus
+            containerPort: 16379
+            protocol: TCP
+
+  # StatefulSet runs valkey-server directly (no background init script)
+  - it: should run valkey-server directly without background init script
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - equal:
+          path: spec.template.spec.containers[0].command
+          value: [ "valkey-server" ]
+      - equal:
+          path: spec.template.spec.containers[0].args
+          value: [ "/data/conf/valkey.conf" ]
+
+  # Cluster-script is consumed on the STS side exclusively by the preStop
+  # CLUSTER FAILOVER hook: no replicas in the shard ⇒ no failover target
+  # ⇒ no need for the script on the main container.
+  - it: should not mount cluster-script volume in StatefulSet container when no replicas
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      cluster.replicasPerShard: 0
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - notContains:
+          path: spec.template.spec.containers[0].volumeMounts
+          content:
+            name: cluster-script
+            mountPath: /cluster-script
+
+  - it: should not define cluster-script volume in StatefulSet when no replicas
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      cluster.replicasPerShard: 0
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - notContains:
+          path: spec.template.spec.volumes
+          content:
+            name: cluster-script
+          any: true
+
+  - it: should not mount cluster-script volume when preStopFailover disabled
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      cluster.replicasPerShard: 1
+      cluster.preStopFailover.enabled: false
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - notContains:
+          path: spec.template.spec.containers[0].volumeMounts
+          content:
+            name: cluster-script
+            mountPath: /cluster-script
+      - notContains:
+          path: spec.template.spec.volumes
+          content:
+            name: cluster-script
+          any: true
+
+  - it: should mount cluster-script volume in StatefulSet container when replicas>=1 (default)
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      cluster.replicasPerShard: 1
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].volumeMounts
+          content:
+            name: cluster-script
+            mountPath: /cluster-script
+      - contains:
+          path: spec.template.spec.volumes
+          content:
+            name: cluster-script
+            configMap:
+              name: RELEASE-NAME-valkey-cluster-script
+              defaultMode: 365  # 0555
+
+  # --- preStop hook (CLUSTER FAILOVER) and termination grace period ---
+  - it: should render preStop CLUSTER FAILOVER hook when replicas>=1 (default)
+    template: templates/cluster-statefulset.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      cluster.replicasPerShard: 1
+    asserts:
+      # The hook runs the prestop.sh script from the /cluster-script mount.
+      - equal:
+          path: spec.template.spec.containers[0].lifecycle.preStop.exec.command
+          value:
+            - "/bin/sh"
+            - "/cluster-script/prestop.sh"
+
+  - it: should NOT render preStop hook when replicasPerShard=0
+    # Without replicas there is no node to fail over to, so the hook would
+    # do nothing except consume part of the termination grace period.
+    template: templates/cluster-statefulset.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      cluster.replicasPerShard: 0
+    asserts:
+      - notExists:
+          path: spec.template.spec.containers[0].lifecycle
+
+  - it: should NOT render preStop hook when preStopFailover explicitly disabled
+    # Replicas are present here; only the explicit opt-out removes the hook.
+    template: templates/cluster-statefulset.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      cluster.replicasPerShard: 1
+      cluster.preStopFailover.enabled: false
+    asserts:
+      - notExists:
+          path: spec.template.spec.containers[0].lifecycle
+
+  - it: should set terminationGracePeriodSeconds from cluster.terminationGracePeriodSeconds
+    template: templates/cluster-statefulset.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      cluster.terminationGracePeriodSeconds: 120
+    asserts:
+      - equal:
+          path: spec.template.spec.terminationGracePeriodSeconds
+          value: 120
+
+  - it: should default terminationGracePeriodSeconds to 60 (enough for default preStop timeout of 40s + SIGTERM flush)
+    # No override supplied: the chart default is expected to render.
+    template: templates/cluster-statefulset.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+    asserts:
+      - equal:
+          path: spec.template.spec.terminationGracePeriodSeconds
+          value: 60
+
+  # --- prestop.sh content rendered into the cluster-script ConfigMap ---
+  - it: cluster-script ConfigMap should contain prestop.sh with CLUSTER FAILOVER
+    template: templates/cluster-script.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+    asserts:
+      - isNotNull:
+          path: data["prestop.sh"]
+      # The regex is case-sensitive: it looks for the lower-case command.
+      - matchRegex:
+          path: data["prestop.sh"]
+          pattern: "cluster failover"
+
+  - it: prestop.sh should inline TLS args when tls.enabled
+    template: templates/cluster-script.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      tls.enabled: true
+      tls.existingSecret: valkey-tls
+    asserts:
+      - matchRegex:
+          path: data["prestop.sh"]
+          pattern: "--tls --cacert"
+
+  - it: prestop.sh should NOT inline TLS args when tls disabled
+    # Mirror of the test above with TLS left unset.
+    template: templates/cluster-script.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+    asserts:
+      - notMatchRegex:
+          path: data["prestop.sh"]
+          pattern: "--tls --cacert"
+
+  - it: prestop.sh should source REDISCLI_AUTH when auth.enabled
+    template: templates/cluster-script.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      auth.enabled: true
+      auth.aclUsers:
+        default:
+          permissions: "~* &* +@all"
+          password: "secretpass"
+    asserts:
+      - matchRegex:
+          path: data["prestop.sh"]
+          pattern: "REDISCLI_AUTH"
+
+  - it: prestop.sh timeout should follow cluster.preStopFailover.timeoutSeconds
+    # The configured value must appear as the script's TIMEOUT assignment.
+    template: templates/cluster-script.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      cluster.preStopFailover.timeoutSeconds: 25
+    asserts:
+      - matchRegex:
+          path: data["prestop.sh"]
+          pattern: "TIMEOUT=25"
+
+  # Init container: topology values are passed in as environment variables
+  - it: should have init container with cluster environment variables
+    template: templates/cluster-statefulset.yaml
+    set:
+      cluster.enabled: true
+      cluster.shards: 4
+      cluster.replicasPerShard: 2
+      cluster.persistence.size: '5Gi'
+    asserts:
+      # Env values are strings, hence the quoted numbers.
+      - contains:
+          path: spec.template.spec.initContainers[0].env
+          content:
+            name: CLUSTER_SHARDS
+            value: "4"
+      - contains:
+          path: spec.template.spec.initContainers[0].env
+          content:
+            name: CLUSTER_REPLICAS_PER_SHARD
+            value: "2"
+
+  # Headless service: carries both the client port and the cluster bus port
+  - it: should create headless service with bus port in cluster mode
+    template: templates/service-headless.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      cluster.busPort: 16379
+    asserts:
+      - isKind:
+          of: Service
+      # clusterIP "None" is what makes the Service headless.
+      - equal:
+          path: spec.clusterIP
+          value: None
+      # Client port.
+      - contains:
+          path: spec.ports
+          content:
+            name: tcp
+            port: 6379
+            targetPort: tcp
+            protocol: TCP
+      # Cluster bus port, taken from cluster.busPort.
+      - contains:
+          path: spec.ports
+          content:
+            name: tcp-bus
+            port: 16379
+            targetPort: tcp-bus
+            protocol: TCP
+
+  # Main service tests
+  - it: should not expose the bus port on the frontend service in cluster mode
+    # Bus port is pod-to-pod only; clients reach nodes via the headless service.
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      cluster.busPort: 16379
+    template: templates/service.yaml
+    asserts:
+      - isKind:
+          of: Service
+      # `any: true` turns notContains into a partial match. Without it the
+      # assertion compares whole list elements, so it would only fail on a
+      # port entry that is exactly `{name: tcp-bus}` (or `{port: 16379}`).
+      # A Service port named tcp-bus always carries at least `port` as well,
+      # so the exact-match form passes even when the bus port IS exposed.
+      - notContains:
+          path: spec.ports
+          content:
+            name: tcp-bus
+          any: true
+      - notContains:
+          path: spec.ports
+          content:
+            port: 16379
+          any: true
+
+  # cluster-script ConfigMap: existence, name, and init-cluster.sh content
+  - it: should create cluster-script ConfigMap when cluster is enabled
+    template: templates/cluster-script.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+    asserts:
+      - isKind:
+          of: ConfigMap
+      - equal:
+          path: metadata.name
+          value: RELEASE-NAME-valkey-cluster-script
+
+  - it: cluster-script ConfigMap should contain init-cluster.sh
+    template: templates/cluster-script.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+    asserts:
+      - isNotNull:
+          path: data["init-cluster.sh"]
+      # The script is expected to contain the upper-case CLUSTER MEET text.
+      - matchRegex:
+          path: data["init-cluster.sh"]
+          pattern: "CLUSTER MEET"
+
+  - it: cluster-script should contain cluster create logic
+    template: templates/cluster-script.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+    asserts:
+      - matchRegex:
+          path: data["init-cluster.sh"]
+          pattern: "--cluster create"
+
+  # --- cluster-init Job: identity, hooks, retry policy, script wiring ---
+  - it: should create cluster-init Job when cluster is enabled
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+    asserts:
+      - isKind:
+          of: Job
+      - equal:
+          path: metadata.name
+          value: RELEASE-NAME-valkey-cluster-init
+
+  - it: Job should have Helm hook annotations for post-install and post-upgrade
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+    asserts:
+      - equal:
+          path: metadata.annotations["helm.sh/hook"]
+          value: "post-install,post-upgrade"
+      # Annotation values are strings, so the weight is the quoted "0".
+      - equal:
+          path: metadata.annotations["helm.sh/hook-weight"]
+          value: "0"
+      - equal:
+          path: metadata.annotations["helm.sh/hook-delete-policy"]
+          value: "before-hook-creation"
+
+  - it: Job should have backoffLimit of 6
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+    asserts:
+      - equal:
+          path: spec.backoffLimit
+          value: 6
+
+  - it: Job should use OnFailure restart policy
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+    asserts:
+      - equal:
+          path: spec.template.spec.restartPolicy
+          value: OnFailure
+
+  - it: Job should run init-cluster.sh from cluster-script ConfigMap
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+    asserts:
+      - equal:
+          path: spec.template.spec.containers[0].command
+          value:
+            - "/bin/sh"
+            - "/cluster-script/init-cluster.sh"
+
+  - it: Job should mount cluster-script volume
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].volumeMounts
+          content:
+            name: cluster-script
+            mountPath: /cluster-script
+      - contains:
+          path: spec.template.spec.volumes
+          content:
+            name: cluster-script
+            configMap:
+              name: RELEASE-NAME-valkey-cluster-script
+              defaultMode: 365  # decimal form of octal 0555
+
+  - it: Job should have CLUSTER_NODE_COUNT and CLUSTER_REPLICAS_PER_SHARD env vars
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.shards: 4
+      cluster.replicasPerShard: 2
+      cluster.persistence.size: '5Gi'
+    asserts:
+      # 12 is consistent with 4 shards x (1 primary + 2 replicas);
+      # presumably that is how the chart derives it.
+      - contains:
+          path: spec.template.spec.containers[0].env
+          content:
+            name: CLUSTER_NODE_COUNT
+            value: "12"
+      - contains:
+          path: spec.template.spec.containers[0].env
+          content:
+            name: CLUSTER_REPLICAS_PER_SHARD
+            value: "2"
+
+  # --- cluster-init Job: values shared with the data pods ---
+  - it: Job should use same image as StatefulSet
+    # The image reference is assembled as registry/repository:tag.
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      image.registry: "myregistry.io"
+      image.repository: "valkey/valkey"
+      image.tag: "7.0.0"
+    asserts:
+      - equal:
+          path: spec.template.spec.containers[0].image
+          value: "myregistry.io/valkey/valkey:7.0.0"
+
+  - it: Job should use pod security context
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      podSecurityContext:
+        fsGroup: 1000
+        runAsUser: 1000
+    asserts:
+      - equal:
+          path: spec.template.spec.securityContext.fsGroup
+          value: 1000
+      - equal:
+          path: spec.template.spec.securityContext.runAsUser
+          value: 1000
+
+  - it: Job should use container security context
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      securityContext:
+        allowPrivilegeEscalation: false
+        runAsNonRoot: true
+    asserts:
+      - equal:
+          path: spec.template.spec.containers[0].securityContext.allowPrivilegeEscalation
+          value: false
+      - equal:
+          path: spec.template.spec.containers[0].securityContext.runAsNonRoot
+          value: true
+
+  - it: Job should use initResources when set
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      initResources:
+        limits:
+          cpu: 200m
+          memory: 256Mi
+        requests:
+          cpu: 100m
+          memory: 128Mi
+    asserts:
+      # One value from each of limits and requests is spot-checked.
+      - equal:
+          path: spec.template.spec.containers[0].resources.limits.cpu
+          value: 200m
+      - equal:
+          path: spec.template.spec.containers[0].resources.requests.memory
+          value: 128Mi
+
+  - it: Job should use service account
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      serviceAccount.create: true
+      serviceAccount.name: "my-sa"
+    asserts:
+      - equal:
+          path: spec.template.spec.serviceAccountName
+          value: "my-sa"
+
+  - it: Job should not automount service account token
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+    asserts:
+      - equal:
+          path: spec.template.spec.automountServiceAccountToken
+          value: false
+
+  - it: Job should include common labels
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+    asserts:
+      # Presence only; the label values are not pinned here.
+      - isNotNull:
+          path: metadata.labels["helm.sh/chart"]
+      - isNotNull:
+          path: metadata.labels["app.kubernetes.io/name"]
+
+  - it: Job should include pod labels and annotations when set
+    # Global podLabels / podAnnotations are expected on the Job pod template.
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      podLabels:
+        custom-label: my-value
+      podAnnotations:
+        custom-annotation: my-annotation
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels["custom-label"]
+          value: my-value
+      - equal:
+          path: spec.template.metadata.annotations["custom-annotation"]
+          value: my-annotation
+
+  # --- cluster.initJob.podLabels / .podAnnotations overrides ---
+  # These knobs let an operator opt the short-lived cluster-init Job out of
+  # a globally injected sidecar (metrics agent, mesh proxy enabled by a
+  # namespace label, policy webhook, ...) while leaving the long-running
+  # data pods untouched.
+
+  - it: Job should NOT have annotations key when none are set
+    # For Helm `with` blocks an empty map and a missing key are different
+    # things. Keeping the key absent means the manifest renders exactly as
+    # it did before this feature whenever no overrides are configured.
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+    asserts:
+      - notExists:
+          path: spec.template.metadata.annotations
+
+  - it: cluster.initJob.podLabels should land on the Job pod
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      cluster.initJob.podLabels:
+        my-injector/skip: "true"
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels["my-injector/skip"]
+          value: "true"
+
+  - it: cluster.initJob.podAnnotations should land on the Job pod
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      cluster.initJob.podAnnotations:
+        sidecar-injector.example.com/skip: "true"
+    asserts:
+      - equal:
+          path: spec.template.metadata.annotations["sidecar-injector.example.com/skip"]
+          value: "true"
+
+  - it: cluster.initJob.podLabels should NOT leak onto the data StatefulSet
+    # Same values as the Job test, rendered against the StatefulSet template.
+    template: templates/cluster-statefulset.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      cluster.initJob.podLabels:
+        my-injector/skip: "true"
+    asserts:
+      - notExists:
+          path: spec.template.metadata.labels["my-injector/skip"]
+
+  - it: cluster.initJob.podAnnotations should NOT leak onto the data StatefulSet
+    template: templates/cluster-statefulset.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      cluster.initJob.podAnnotations:
+        sidecar-injector.example.com/skip: "true"
+    asserts:
+      - notExists:
+          path: spec.template.metadata.annotations["sidecar-injector.example.com/skip"]
+
+  - it: cluster.initJob.podLabels should win over global podLabels on key collision
+    # One key, two conflicting values. The Job-scoped value has to take
+    # precedence so that an opt-out can be expressed for the Job alone.
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      podLabels:
+        observe.example.com/inject: "true"
+      cluster.initJob.podLabels:
+        observe.example.com/inject: "false"
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels["observe.example.com/inject"]
+          value: "false"
+
+  - it: cluster.initJob.podAnnotations should win over global podAnnotations on key collision
+    # Annotation counterpart of the label-collision test above.
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      podAnnotations:
+        sidecar.example.com/inject: "true"
+      cluster.initJob.podAnnotations:
+        sidecar.example.com/inject: "false"
+    asserts:
+      - equal:
+          path: spec.template.metadata.annotations["sidecar.example.com/inject"]
+          value: "false"
+
+  - it: cluster.initJob.podLabels should be able to veto chart-emitted istio sidecar inject label
+    # Real-world case: in sidecar mode the chart puts
+    # sidecar.istio.io/inject=true on every pod. An operator may want the
+    # Job to skip injection (for instance their own scrape-only inspector
+    # mid-rollout, or to dodge the Job-hangs-on-sidecar problem). The
+    # override therefore has to be applied LAST in the merge, so it can
+    # replace a label the chart computed, not only one the user supplied.
+    template: templates/cluster-init-job.yaml
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      cluster.initJob.podLabels:
+        sidecar.istio.io/inject: "false"
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels["sidecar.istio.io/inject"]
+          value: "false"
+
+  - it: cluster.initJob overrides do NOT veto the chart-emitted mesh label on the data StatefulSet
+    # Counterpart of the previous test: shows the veto is scoped to the
+    # Job, so setting it cannot silently pull the data pods out of the mesh.
+    template: templates/cluster-statefulset.yaml
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      cluster.initJob.podLabels:
+        sidecar.istio.io/inject: "false"
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels["sidecar.istio.io/inject"]
+          value: "true"
+
+  # --- cluster-init Job: ttlSecondsAfterFinished ---
+  - it: Job should default ttlSecondsAfterFinished to 300 (auto-cleanup window)
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+    asserts:
+      - equal:
+          path: spec.ttlSecondsAfterFinished
+          value: 300
+
+  - it: cluster.initJob.ttlSecondsAfterFinished should be configurable
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      cluster.initJob.ttlSecondsAfterFinished: 60
+    asserts:
+      - equal:
+          path: spec.ttlSecondsAfterFinished
+          value: 60
+
+  - it: cluster.initJob.ttlSecondsAfterFinished=0 deletes immediately on completion
+    # An explicit 0 must render as 0 rather than fall back to the 300 default.
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      cluster.initJob.ttlSecondsAfterFinished: 0
+    asserts:
+      - equal:
+          path: spec.ttlSecondsAfterFinished
+          value: 0
+
+  - it: global podLabels still flow through to the Job when initJob.podLabels does not collide
+    # Pre-feature behaviour must survive: global podLabels still reach the
+    # Job pod, alongside any non-conflicting Job-scoped labels.
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      podLabels:
+        team: platform
+      cluster.initJob.podLabels:
+        my-injector/skip: "true"
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels.team
+          value: platform
+      - equal:
+          path: spec.template.metadata.labels["my-injector/skip"]
+          value: "true"
+
+  # --- cluster-init Job: scheduling knobs taken from the global values ---
+  - it: Job should include node selector when set
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      nodeSelector:
+        kubernetes.io/os: linux
+    asserts:
+      - equal:
+          path: spec.template.spec.nodeSelector["kubernetes.io/os"]
+          value: linux
+
+  - it: Job should include tolerations when set
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      tolerations:
+        - key: "dedicated"
+          operator: "Equal"
+          value: "valkey"
+          effect: "NoSchedule"
+    asserts:
+      # The toleration must come through unchanged, field for field.
+      - contains:
+          path: spec.template.spec.tolerations
+          content:
+            key: "dedicated"
+            operator: "Equal"
+            value: "valkey"
+            effect: "NoSchedule"
+
+  - it: Job should include affinity when set
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node-type
+                    operator: In
+                    values:
+                      - cache
+    asserts:
+      # Presence check only; the nested rule contents are not compared.
+      - isNotNull:
+          path: spec.template.spec.affinity.nodeAffinity
+
+  - it: Job should include priority class name when set
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      priorityClassName: "high-priority"
+    asserts:
+      - equal:
+          path: spec.template.spec.priorityClassName
+          value: "high-priority"
+
+  # --- cluster-init Job: TLS material ---
+  - it: Job should mount TLS volume when TLS is enabled
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      tls.enabled: true
+      tls.existingSecret: "valkey-tls-secret"
+    asserts:
+      # Container side: mounted at /tls.
+      - contains:
+          path: spec.template.spec.containers[0].volumeMounts
+          content:
+            name: RELEASE-NAME-valkey-tls
+            mountPath: /tls
+      # Pod side: backed by the user-supplied existing secret.
+      - contains:
+          path: spec.template.spec.volumes
+          content:
+            name: RELEASE-NAME-valkey-tls
+            secret:
+              secretName: valkey-tls-secret
+              defaultMode: 256  # decimal form of octal 0400
+
+  - it: Job should not mount TLS volume when TLS is disabled
+    # Negative counterpart of the TLS-enabled Job test: neither the container
+    # mount nor the pod-level volume may reference the TLS secret.
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      tls.enabled: false
+    template: templates/cluster-init-job.yaml
+    asserts:
+      # `any: true` makes this a partial match on the name alone. An exact
+      # match on {name, mountPath} would pass vacuously as soon as the mount
+      # carried any additional key (e.g. readOnly) or a different path.
+      - notContains:
+          path: spec.template.spec.containers[0].volumeMounts
+          content:
+            name: RELEASE-NAME-valkey-tls
+          any: true
+      # The volume definition itself must be absent too, not just the mount.
+      - notContains:
+          path: spec.template.spec.volumes
+          content:
+            name: RELEASE-NAME-valkey-tls
+          any: true
+
+  # --- cluster-init Job: auth secret mounts ---
+  - it: Job should mount valkey-users-secret when usersExistingSecret is set
+    # The default user has no inline password here; only the existing
+    # secret is configured.
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      auth.enabled: true
+      auth.usersExistingSecret: "my-valkey-users"
+      auth.aclUsers:
+        default:
+          permissions: "~* &* +@all"
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].volumeMounts
+          content:
+            name: valkey-users-secret
+            mountPath: /valkey-users-secret
+            readOnly: true
+      - contains:
+          path: spec.template.spec.volumes
+          content:
+            name: valkey-users-secret
+            secret:
+              secretName: my-valkey-users
+              defaultMode: 256  # decimal form of octal 0400
+
+  - it: Job should mount valkey-auth-secret when inline passwords are used
+    # Inline password only: the release-named auth secret is expected.
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      auth.enabled: true
+      auth.aclUsers:
+        default:
+          password: "testpass"
+          permissions: "~* &* +@all"
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].volumeMounts
+          content:
+            name: valkey-auth-secret
+            mountPath: /valkey-auth-secret
+            readOnly: true
+      - contains:
+          path: spec.template.spec.volumes
+          content:
+            name: valkey-auth-secret
+            secret:
+              secretName: RELEASE-NAME-valkey-auth
+              defaultMode: 256  # decimal form of octal 0400
+
+  - it: Job should mount both auth secrets when both are configured
+    # Existing secret plus an inline password: both mounts must appear.
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      auth.enabled: true
+      auth.usersExistingSecret: "my-valkey-users"
+      auth.aclUsers:
+        default:
+          permissions: "~* &* +@all"
+          password: "fallback"
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].volumeMounts
+          content:
+            name: valkey-users-secret
+            mountPath: /valkey-users-secret
+            readOnly: true
+      - contains:
+          path: spec.template.spec.containers[0].volumeMounts
+          content:
+            name: valkey-auth-secret
+            mountPath: /valkey-auth-secret
+            readOnly: true
+
+  - it: Job should not mount auth secrets when auth is disabled
+    template: templates/cluster-init-job.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      auth.enabled: false
+    asserts:
+      # `any: true` matches on the name alone, whatever other keys the
+      # mount entry carries.
+      - notContains:
+          path: spec.template.spec.containers[0].volumeMounts
+          content:
+            name: valkey-users-secret
+          any: true
+      - notContains:
+          path: spec.template.spec.containers[0].volumeMounts
+          content:
+            name: valkey-auth-secret
+          any: true
+
+  # --- init-cluster.sh: where the password is read from ---
+  - it: cluster-script should read password from valkey-users-secret when usersExistingSecret is set
+    template: templates/cluster-script.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      auth.enabled: true
+      auth.usersExistingSecret: "my-valkey-users"
+      auth.aclUsers:
+        default:
+          permissions: "~* &* +@all"
+    asserts:
+      # Must reference the secret mount and must not touch the ACL file.
+      - matchRegex:
+          path: data["init-cluster.sh"]
+          pattern: '/valkey-users-secret/'
+      - notMatchRegex:
+          path: data["init-cluster.sh"]
+          pattern: '/etc/valkey/users.acl'
+
+  - it: cluster-script should read password from valkey-auth-secret when inline passwords are used
+    template: templates/cluster-script.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      auth.enabled: true
+      auth.aclUsers:
+        default:
+          password: "testpass"
+          permissions: "~* &* +@all"
+    asserts:
+      # The file name under the mount is "<user>-password".
+      - matchRegex:
+          path: data["init-cluster.sh"]
+          pattern: '/valkey-auth-secret/default-password'
+      - notMatchRegex:
+          path: data["init-cluster.sh"]
+          pattern: '/etc/valkey/users.acl'
+
+  - it: cluster-script should use custom passwordKey when configured
+    # passwordKey replaces the file name looked up in the users secret.
+    template: templates/cluster-script.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      auth.enabled: true
+      auth.usersExistingSecret: "my-valkey-users"
+      auth.aclUsers:
+        default:
+          permissions: "~* &* +@all"
+          passwordKey: "default-pwd"
+    asserts:
+      - matchRegex:
+          path: data["init-cluster.sh"]
+          pattern: '/valkey-users-secret/default-pwd'
+
+  - it: cluster-script should use custom replicationUser for auth
+    # Two users are defined; the script must pick the replication user's
+    # password file.
+    template: templates/cluster-script.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      cluster.replicationUser: "clusteruser"
+      auth.enabled: true
+      auth.aclUsers:
+        default:
+          password: "defaultpass"
+          permissions: "~* &* +@all"
+        clusteruser:
+          password: "clusterpass"
+          permissions: "~* &* +@all"
+    asserts:
+      - matchRegex:
+          path: data["init-cluster.sh"]
+          pattern: '/valkey-auth-secret/clusteruser-password'
+
+  - it: cluster-script should NOT parse password hash from ACL file
+    template: templates/cluster-script.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      auth.enabled: true
+      auth.aclUsers:
+        default:
+          password: "testpass"
+          permissions: "~* &* +@all"
+    asserts:
+      # No grep over users.acl: the hash is never extracted from the ACL file.
+      - notMatchRegex:
+          path: data["init-cluster.sh"]
+          pattern: 'grep.*users\.acl'
+
+  # StatefulSet: replication-user validation
+  - it: should fail when cluster auth enabled but replication user not in aclUsers
+    # "clusteruser" is named as replication user but only "default" is
+    # defined, so rendering has to abort with a descriptive error.
+    template: templates/cluster-statefulset.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      auth.enabled: true
+      cluster.replicationUser: "clusteruser"
+      auth.aclUsers:
+        default:
+          password: "test"
+          permissions: "~* &* +@all"
+    asserts:
+      - failedTemplate:
+          errorPattern: "Cluster replication user 'clusteruser'.*must be defined in auth.aclUsers.*"
+
+  - it: should succeed when cluster auth is properly configured
+    # Same shape as above, but the replication user exists in aclUsers.
+    template: templates/cluster-statefulset.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      auth.enabled: true
+      cluster.replicationUser: "default"
+      auth.aclUsers:
+        default:
+          password: "testpass"
+          permissions: "~* &* +@all"
+    asserts:
+      - isKind:
+          of: StatefulSet
+
+  # StatefulSet: TLS mount
+  - it: should configure TLS volume mount in cluster mode
+    template: templates/cluster-statefulset.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      tls.enabled: true
+      tls.existingSecret: "valkey-tls-secret"
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].volumeMounts
+          content:
+            name: RELEASE-NAME-valkey-tls
+            mountPath: /tls
+
+  # init.sh: cluster-mode configuration directives
+  - it: should generate cluster config in init script
+    template: templates/init_config.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      cluster.nodeTimeout: 20000
+    asserts:
+      - matchRegex:
+          path: data["init.sh"]
+          pattern: "cluster-enabled yes"
+      - matchRegex:
+          path: data["init.sh"]
+          pattern: "cluster-config-file /data/nodes.conf"
+      # cluster.nodeTimeout feeds the cluster-node-timeout directive.
+      - matchRegex:
+          path: data["init.sh"]
+          pattern: "cluster-node-timeout 20000"
+
+  - it: should configure cluster-require-full-coverage when disabled
+    # A false value must be rendered as an explicit "no".
+    template: templates/init_config.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      cluster.requireFullCoverage: false
+    asserts:
+      - matchRegex:
+          path: data["init.sh"]
+          pattern: "cluster-require-full-coverage no"
+
+  - it: should configure cluster-allow-reads-when-down when enabled
+    template: templates/init_config.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      cluster.allowReadsWhenDown: true
+    asserts:
+      - matchRegex:
+          path: data["init.sh"]
+          pattern: "cluster-allow-reads-when-down yes"
+
+  # StatefulSet auth secret mounts (the main container still needs the ACL
+  # material for the Valkey server itself)
+  - it: should mount valkey-users-secret to main container when auth.usersExistingSecret is set
+    template: templates/cluster-statefulset.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      auth.enabled: true
+      auth.usersExistingSecret: "my-valkey-users"
+      auth.aclUsers:
+        default:
+          permissions: "~* &* +@all"
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].volumeMounts
+          content:
+            name: valkey-users-secret
+            mountPath: /valkey-users-secret
+            readOnly: true
+
+  - it: should mount valkey-auth-secret to main container when inline passwords are used
+    template: templates/cluster-statefulset.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      auth.enabled: true
+      auth.aclUsers:
+        default:
+          password: "testpass"
+          permissions: "~* &* +@all"
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].volumeMounts
+          content:
+            name: valkey-auth-secret
+            mountPath: /valkey-auth-secret
+            readOnly: true
+
+  - it: should mount both auth secrets to main container when both are configured
+    # Existing secret and inline password together: both mounts are expected.
+    template: templates/cluster-statefulset.yaml
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: '5Gi'
+      auth.enabled: true
+      auth.usersExistingSecret: "my-valkey-users"
+      auth.aclUsers:
+        default:
+          permissions: "~* &* +@all"
+          password: "fallback"
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[0].volumeMounts
+          content:
+            name: valkey-users-secret
+            mountPath: /valkey-users-secret
+            readOnly: true
+      - contains:
+          path: spec.template.spec.containers[0].volumeMounts
+          content:
+            name: valkey-auth-secret
+            mountPath: /valkey-auth-secret
+            readOnly: true
+
+  # Regression: probes must accept NOAUTH as proof of liveness.
+  - it: should use a PONG|NOAUTH-tolerant probe on the valkey container
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      auth.enabled: true
+      auth.aclUsers:
+        default:
+          permissions: "~* &* +@all"
+          password: "p"
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - matchRegex:
+          path: spec.template.spec.containers[0].startupProbe.exec.command[2]
+          pattern: "PONG\\|NOAUTH"
+      - matchRegex:
+          path: spec.template.spec.containers[0].livenessProbe.exec.command[2]
+          pattern: "PONG\\|NOAUTH"
+      - matchRegex:
+          path: spec.template.spec.containers[0].readinessProbe.exec.command[2]
+          pattern: "PONG\\|NOAUTH"
+
+  - it: should define a readiness probe
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - exists:
+          path: spec.template.spec.containers[0].readinessProbe
+
+  # --- Tri-state LOADING policy ---
+  # startupProbe: rejects LOADING — must keep the gate honest so a slow
+  #   RDB load doesn't pass-through to liveness, which would then kill
+  #   the still-loading container. Operators bump
+  #   cluster.startupProbe.failureThreshold for slow loaders.
+  # livenessProbe: accepts LOADING — after startup passes, LOADING means
+  #   a full-resync from primary is in progress; killing the pod loses
+  #   the in-flight download and triggers another full resync,
+  #   perpetuating the kill-loop.
+  # readinessProbe: rejects LOADING — a LOADING pod can't serve traffic
+  #   and must drop out of the Service endpoint set until ready.
+  - it: cluster startupProbe must reject LOADING
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - notMatchRegex:
+          path: spec.template.spec.containers[0].startupProbe.exec.command[2]
+          pattern: "LOADING"
+
+  - it: cluster livenessProbe must accept LOADING (full-resync tolerance)
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - matchRegex:
+          path: spec.template.spec.containers[0].livenessProbe.exec.command[2]
+          pattern: "PONG\\|NOAUTH\\|LOADING"
+
+  - it: cluster readinessProbe must reject LOADING
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - notMatchRegex:
+          path: spec.template.spec.containers[0].readinessProbe.exec.command[2]
+          pattern: "LOADING"
+
+  # Tuning knobs flow through to the rendered probe. Operators with
+  # large datasets bump cluster.startupProbe.failureThreshold to extend
+  # the load window without affecting steady-state probes.
+  - it: cluster.startupProbe overrides should land on the startupProbe
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      cluster.startupProbe.periodSeconds: 20
+      cluster.startupProbe.timeoutSeconds: 8
+      cluster.startupProbe.failureThreshold: 240
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - equal:
+          path: spec.template.spec.containers[0].startupProbe.periodSeconds
+          value: 20
+      - equal:
+          path: spec.template.spec.containers[0].startupProbe.timeoutSeconds
+          value: 8
+      - equal:
+          path: spec.template.spec.containers[0].startupProbe.failureThreshold
+          value: 240
+      # Probe-scoping check: the startup period override (20) differs from
+      # the liveness period (10), so a leaked override fails here.
+      - equal:
+          path: spec.template.spec.containers[0].livenessProbe.periodSeconds
+          value: 10
+      - equal:
+          path: spec.template.spec.containers[0].livenessProbe.failureThreshold
+          value: 6
+
+  - it: cluster.livenessProbe overrides should land on the livenessProbe only
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      cluster.livenessProbe.failureThreshold: 12  # the only probe knob overridden
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - equal:
+          path: spec.template.spec.containers[0].livenessProbe.failureThreshold
+          value: 12  # override applied
+      - equal:
+          path: spec.template.spec.containers[0].startupProbe.failureThreshold
+          value: 30  # startupProbe is not overridden in `set`, so must be unaffected
+      - equal:
+          path: spec.template.spec.containers[0].readinessProbe.failureThreshold
+          value: 3  # readinessProbe is not overridden in `set`, so must be unaffected
+
+  # Regression: extraContainers and extraVolumes were unwired in cluster mode.
+  - it: should wire extraContainers and extraVolumes through in cluster mode
+    template: templates/cluster-statefulset.yaml
+    set:
+      cluster.persistence.size: "5Gi"
+      cluster.enabled: true
+      extraVolumes:
+        - emptyDir: {}
+          name: extra
+      extraContainers:
+        - image: busybox:1.36
+          name: sidecar
+    asserts:
+      - contains:  # the extra volume is present in the pod's volume list
+          content:
+            emptyDir: {}
+            name: extra
+          path: spec.template.spec.volumes
+      - contains:  # the extra container is present in the pod's container list
+          content:
+            image: busybox:1.36
+            name: sidecar
+          path: spec.template.spec.containers
+
+  # Regression: REDIS_PASSWORD should be wired through in cluster mode too.
+  - it: should wire REDIS_PASSWORD to the metrics exporter from the generated auth secret
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      metrics.enabled: true
+      auth.enabled: true
+      auth.aclUsers:
+        default:
+          permissions: "~* &* +@all"
+          password: "p"
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - contains:
+          path: spec.template.spec.containers[1].env
+          content:
+            name: REDIS_PASSWORD
+            valueFrom:
+              secretKeyRef:
+                name: RELEASE-NAME-valkey-auth
+                key: default-password
+
+  # --- Istio ambient mode (cluster) ---
+  - it: should add ambient dataplane-mode label when istio.mode=ambient
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      istio.enabled: true
+      istio.mode: ambient
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels["istio.io/dataplane-mode"]
+          value: ambient
+
+  - it: should NOT emit traffic.sidecar.istio.io exclude annotations in ambient mode
+    # Ambient has no pod-local Envoy; the exclude* annotations are sidecar-only
+    # and meaningless to ztunnel. ztunnel leaves the bus port unproxied by
+    # default (nothing in the AuthorizationPolicy's ALLOW set binds it).
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      istio.enabled: true
+      istio.mode: ambient
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - notExists:
+          path: spec.template.metadata.annotations["traffic.sidecar.istio.io/excludeInboundPorts"]
+      - notExists:
+          path: spec.template.metadata.annotations["traffic.sidecar.istio.io/excludeOutboundPorts"]
+
+  - it: should emit traffic.sidecar.istio.io exclude annotations in sidecar mode
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      istio.enabled: true
+      istio.mode: sidecar
+      cluster.busPort: 16379
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - equal:
+          path: spec.template.metadata.annotations["traffic.sidecar.istio.io/excludeInboundPorts"]
+          value: "16379"
+      - equal:
+          path: spec.template.metadata.annotations["traffic.sidecar.istio.io/excludeOutboundPorts"]
+          value: "16379"
+
+  - it: should NOT emit traffic.sidecar.istio.io annotations when istio is disabled
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      istio.enabled: false
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - notExists:
+          path: spec.template.metadata.annotations["traffic.sidecar.istio.io/excludeInboundPorts"]
+
+  - it: should emit sidecar-mode mesh labels on the cluster statefulset
+    # Sidecar mode is self-sufficient now: we pin dataplane-mode=none AND
+    # sidecar.istio.io/inject=true on the pod, so injection works whether
+    # or not the namespace is labelled, and ztunnel stays out of the way
+    # on dual-mode clusters. Regression: we used to emit neither, leaning
+    # on namespace labels exclusively.
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      istio.enabled: true
+      istio.mode: sidecar
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels["istio.io/dataplane-mode"]
+          value: none
+      - equal:
+          path: spec.template.metadata.labels["sidecar.istio.io/inject"]
+          value: "true"
+
+  - it: should emit no mesh labels when istio is disabled (cluster)
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      istio.enabled: false
+    template: templates/cluster-statefulset.yaml
+    asserts:
+      - notExists:
+          path: spec.template.metadata.labels["istio.io/dataplane-mode"]
+      - notExists:
+          path: spec.template.metadata.labels["sidecar.istio.io/inject"]
+
diff --git a/valkey/tests/deployment_test.yaml b/valkey/tests/deployment_test.yaml
index 28c2653d..6368699b 100644
--- a/valkey/tests/deployment_test.yaml
+++ b/valkey/tests/deployment_test.yaml
@@ -3,6 +3,42 @@ templates:
   - templates/deploy_valkey.yaml
   - templates/init_config.yaml
 tests:
+  - it: should not create Deployment when replica.enabled is true
+    set:
+      replica.enabled: true
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should not create Deployment when cluster.enabled is true
+    set:
+      cluster.enabled: true
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should not create Deployment when both replica.enabled and cluster.enabled are true
+    set:
+      replica.enabled: true
+      cluster.enabled: true
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should create Deployment when both replica.enabled and cluster.enabled are false
+    set:
+      replica.enabled: false
+      cluster.enabled: false
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: Deployment
+
   - it: should not have auth volumes when auth disabled
     set:
       auth.enabled: false
@@ -428,3 +464,235 @@ tests:
               secretKeyRef:
                 name: my-custom-secret
                 key: my-password-key
+
+  # Regression: previously the exporter pointed REDIS_PASSWORD at a key
+  # (`default-password`) that is never created in aclConfig-only mode, so the
+  # container crash-looped on CreateContainerConfigError.
+  - it: should omit REDIS_PASSWORD when only auth.aclConfig is set
+    set:
+      auth.enabled: true
+      auth.aclConfig: "user default on >p ~* &* +@all"
+      metrics.enabled: true
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - isKind:
+          of: Deployment
+      - notContains:
+          path: spec.template.spec.containers[1].env
+          content:
+            name: REDIS_PASSWORD
+          any: true
+
+  # Regression: probes must accept NOAUTH as proof of liveness, otherwise every
+  # auth-enabled deployment's liveness probe silently passes on exit code 0
+  # while not actually checking anything meaningful.
+  - it: should use a PONG|NOAUTH-tolerant probe on the valkey container
+    set:
+      auth.enabled: true
+      auth.aclUsers:
+        default:
+          permissions: "~* &* +@all"
+          password: "p"
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - matchRegex:
+          path: spec.template.spec.containers[0].startupProbe.exec.command[2]
+          pattern: "PONG\\|NOAUTH"
+      - matchRegex:
+          path: spec.template.spec.containers[0].livenessProbe.exec.command[2]
+          pattern: "PONG\\|NOAUTH"
+      - matchRegex:
+          path: spec.template.spec.containers[0].readinessProbe.exec.command[2]
+          pattern: "PONG\\|NOAUTH"
+
+  # Regression: there should be a readiness probe at all — previously missing.
+  - it: should define a readiness probe
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - exists:
+          path: spec.template.spec.containers[0].readinessProbe
+
+  # --- Tri-state LOADING policy (see cluster_test.yaml for rationale) ---
+  - it: standalone startupProbe must reject LOADING
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - notMatchRegex:
+          path: spec.template.spec.containers[0].startupProbe.exec.command[2]
+          pattern: "LOADING"
+
+  - it: standalone livenessProbe must accept LOADING (full-resync tolerance)
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - matchRegex:
+          path: spec.template.spec.containers[0].livenessProbe.exec.command[2]
+          pattern: "PONG\\|NOAUTH\\|LOADING"
+
+  - it: standalone readinessProbe must reject LOADING
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - notMatchRegex:
+          path: spec.template.spec.containers[0].readinessProbe.exec.command[2]
+          pattern: "LOADING"
+
+  - it: top-level startupProbe overrides should land on the startupProbe only
+    set:
+      startupProbe.failureThreshold: 240
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - equal:
+          path: spec.template.spec.containers[0].startupProbe.failureThreshold
+          value: 240
+      - equal:
+          path: spec.template.spec.containers[0].livenessProbe.failureThreshold
+          value: 6
+      - equal:
+          path: spec.template.spec.containers[0].readinessProbe.failureThreshold
+          value: 3
+
+  # --- Istio ambient mode ---
+  - it: should add ambient dataplane-mode label when istio.mode=ambient
+    set:
+      istio.enabled: true
+      istio.mode: ambient
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels["istio.io/dataplane-mode"]
+          value: ambient
+
+  # In sidecar mode istio.io/dataplane-mode is set to "none" (not absent) so
+  # a dual-mode cluster running ambient too doesn't accidentally capture
+  # this pod via ztunnel on top of its Envoy sidecar.
+  - it: should set istio.io/dataplane-mode=none in sidecar mode
+    set:
+      istio.enabled: true
+      istio.mode: sidecar
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels["istio.io/dataplane-mode"]
+          value: none
+
+  # Namespaces labelled `istio-injection=enabled` would otherwise inject an
+  # Envoy sidecar AND the ambient capture label would direct traffic to
+  # ztunnel — the pod then gets double-redirected and every connection
+  # fails with "Connection reset by peer". Opting the pod out of injection
+  # explicitly is the only reliable way to make ambient work in that setup.
+  - it: should set sidecar.istio.io/inject=false in ambient mode
+    set:
+      istio.enabled: true
+      istio.mode: ambient
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels["sidecar.istio.io/inject"]
+          value: "false"
+
+  # Sidecar mode now forces injection on at the pod level so the chart
+  # doesn't silently depend on the namespace carrying
+  # istio-injection=enabled. Previously this was the user's problem to get
+  # right — half the "it's not working" issue reports in sidecar mode came
+  # down to "the namespace wasn't labelled".
+  - it: should set sidecar.istio.io/inject=true in sidecar mode
+    set:
+      istio.enabled: true
+      istio.mode: sidecar
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels["sidecar.istio.io/inject"]
+          value: "true"
+
+  - it: should NOT add ambient dataplane-mode label when istio is disabled
+    set:
+      istio.enabled: false
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - notExists:
+          path: spec.template.metadata.labels["istio.io/dataplane-mode"]
+
+  # Regression: the istioPodLabels helper used to emit nothing in sidecar
+  # mode AND still have its caller run `nindent 8`, which injects a blank
+  # line into the labels map. Valid YAML, but noisy to read in diffs. After
+  # the fix the labels map holds exactly the selector labels + the two
+  # mode-specific keys — no blank key, no stray whitespace.
+  - it: should produce exactly the sidecar label set in sidecar mode
+    set:
+      istio.enabled: true
+      istio.mode: sidecar
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels
+          value:
+            app.kubernetes.io/name: valkey
+            app.kubernetes.io/instance: RELEASE-NAME
+            sidecar.istio.io/inject: "true"
+            istio.io/dataplane-mode: none
+
+  # And with ambient, exactly the two ambient-specific keys on top of the
+  # selector labels — and nothing else.
+  - it: should produce exactly the ambient label set in ambient mode
+    set:
+      istio.enabled: true
+      istio.mode: ambient
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels
+          value:
+            app.kubernetes.io/name: valkey
+            app.kubernetes.io/instance: RELEASE-NAME
+            istio.io/dataplane-mode: ambient
+            sidecar.istio.io/inject: "false"
+
+  # And with istio disabled, no mesh labels at all — users can still opt
+  # into their own mesh labels via podLabels.
+  - it: should emit no mesh labels when istio.enabled is false
+    set:
+      istio.enabled: false
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels
+          value:
+            app.kubernetes.io/name: valkey
+            app.kubernetes.io/instance: RELEASE-NAME
+
+  # User override: a podLabels entry that collides with a chart-computed
+  # mesh label (e.g. sidecar.istio.io/inject) must win cleanly, with NO
+  # duplicate YAML keys in the rendered output. This lets operators run
+  # istio.enabled=true but force a specific release out of the sidecar
+  # mesh (rare but legitimate — e.g. pinning a canary pod to plain TCP).
+  - it: user podLabels must override chart mesh labels cleanly
+    set:
+      istio.enabled: true
+      istio.mode: sidecar
+      podLabels:
+        sidecar.istio.io/inject: "false"
+        custom-label: "custom-value"
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels["sidecar.istio.io/inject"]
+          value: "false"
+      - equal:
+          path: spec.template.metadata.labels["istio.io/dataplane-mode"]
+          value: none
+      - equal:
+          path: spec.template.metadata.labels["custom-label"]
+          value: "custom-value"
+
+  # Regression: installing istio.enabled=false with a typo'd istio.mode
+  # (istio.mode=ambiet) used to sail through because the validator gated
+  # on istio.enabled. With schema validation active it now fails at lint,
+  # before any template helper runs. This test pins that schema-layer
+  # rejection: the asserted message is the schema error, not a helper's.
+  - it: should reject typo'd istio.mode even when istio.enabled=false (at schema layer)
+    set:
+      istio.enabled: false
+      istio.mode: ambiet
+    template: templates/deploy_valkey.yaml
+    asserts:
+      - failedTemplate:
+          errorMessage: "values don't meet the specifications of the schema(s) in the following chart(s):\nvalkey:\n- at '/istio/mode': value must be one of 'sidecar', 'ambient'\n"
diff --git a/valkey/tests/init_config_test.yaml b/valkey/tests/init_config_test.yaml
index 0b1067de..8c6a8adc 100644
--- a/valkey/tests/init_config_test.yaml
+++ b/valkey/tests/init_config_test.yaml
@@ -145,3 +145,95 @@ tests:
       - equal:
           path: metadata.labels["app.kubernetes.io/name"]
           value: valkey
+
+  # Regression: `echo "$password"` in dash interprets \b, \t, \n, \\, etc.
+  # before writing them out. A password containing a backslash then gets a
+  # DIFFERENT sha256 than the bytes that were stored in the Secret — client
+  # auth with the real password fails WRONGPASS, since the stored ACL hash
+  # is of the mangled bytes. Must be `printf '%s'` (byte-safe).
+  - it: get_user_password must be byte-safe (no echo)
+    set:
+      auth.enabled: true
+      auth.aclUsers:
+        admin:
+          permissions: "~* &* +@all"
+          password: "admin-password"
+    asserts:
+      - notMatchRegex:
+          path: data["init.sh"]
+          pattern: 'echo "\$password"'
+      - matchRegex:
+          path: data["init.sh"]
+          pattern: "printf '%s' \"\\$password\""
+
+  # Regression: masterauth line in valkey.conf used to be unquoted, so a
+  # password with whitespace or `#` (valkey.conf comment char) would break
+  # the config parser. Must be double-quoted with backslash-escapes.
+  - it: masterauth must be written as a quoted+escaped string
+    set:
+      auth.enabled: true
+      replica.enabled: true
+      replica.persistence.size: "5Gi"
+      auth.aclUsers:
+        default:
+          permissions: "~* &* +@all"
+          password: "whatever"
+    asserts:
+      - matchRegex:
+          path: data["init.sh"]
+          pattern: "printf 'masterauth \"%s\"\\\\n' \"\\$REPL_PASSWORD_ESC\""
+      - matchRegex:
+          path: data["init.sh"]
+          # The escape pass: s/\\/\\\\/g; s/"/\\"/g
+          pattern: "REPL_PASSWORD_ESC=\\$\\(printf '%s' \"\\$REPL_PASSWORD\""
+
+  # --- nodes.conf IP refresh on cluster pod restart ---
+  # Cluster bus gossip dials by IP, even with cluster-preferred-endpoint-type=
+  # hostname. After a rolling restart pod IPs change, and a pod whose
+  # nodes.conf has only stale IPs becomes a stranded minority partition.
+  # The refresh block re-resolves each peer's announced FQDN and rewrites
+  # the IP before valkey-server starts.
+  - it: cluster mode should emit a nodes.conf IP refresh block
+    set:
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    asserts:
+      - matchRegex:
+          path: data["init.sh"]
+          pattern: "NODES_CONF=/data/nodes\\.conf"
+      - matchRegex:
+          path: data["init.sh"]
+          pattern: "getent hosts \"\\$fqdn\""
+      # Atomic swap so a kill mid-rewrite can't corrupt the file.
+      - matchRegex:
+          path: data["init.sh"]
+          pattern: 'mv "\$TMP" "\$NODES_CONF"'
+      # No-op when nodes.conf doesn't exist (first boot).
+      - matchRegex:
+          path: data["init.sh"]
+          pattern: "first boot, nothing to refresh"
+      # 'vars' line at EOF must be passed through verbatim — it's
+      # currentEpoch / lastVoteEpoch state and corrupting it would force
+      # a fresh cluster join. The case-pattern matches an empty line
+      # OR a line starting with 'vars '.
+      - matchRegex:
+          path: data["init.sh"]
+          pattern: "''\\|vars"
+
+  - it: non-cluster mode should NOT emit the nodes.conf refresh block
+    # Standalone and replicated modes have no nodes.conf — emitting the
+    # block in their init.sh would just be dead code and could log
+    # confusing "first boot" messages on every restart.
+    asserts:
+      - notMatchRegex:
+          path: data["init.sh"]
+          pattern: "NODES_CONF=/data/nodes\\.conf"
+
+  - it: replicated (non-cluster) mode should NOT emit the nodes.conf refresh block
+    set:
+      replica.enabled: true
+      replica.persistence.size: "5Gi"
+    asserts:
+      - notMatchRegex:
+          path: data["init.sh"]
+          pattern: "NODES_CONF=/data/nodes\\.conf"
diff --git a/valkey/tests/istio_authorization_policy_test.yaml b/valkey/tests/istio_authorization_policy_test.yaml
new file mode 100644
index 00000000..b47406a9
--- /dev/null
+++ b/valkey/tests/istio_authorization_policy_test.yaml
@@ -0,0 +1,427 @@
+suite: istio authorization policy (cluster-bus isolation)
+templates:
+  - templates/istio-authorization-policy.yaml
+tests:
+  # --- Feature flag tests ---
+  - it: should not render when istio is disabled
+    set:
+      istio.enabled: false
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should not render when cluster mode is disabled
+    # No bus port to protect in standalone/replica mode.
+    set:
+      istio.enabled: true
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should not render in replica (non-cluster) mode
+    set:
+      istio.enabled: true
+      replica.enabled: true
+      replica.persistence.size: "5Gi"
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should not render when authorizationPolicy is explicitly disabled
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      istio.authorizationPolicy.enabled: false
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should render in cluster mode with istio enabled (sidecar default)
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: AuthorizationPolicy
+      - isAPIVersion:
+          of: security.istio.io/v1
+
+  - it: should render in cluster mode with istio enabled (ambient)
+    # Ambient ztunnel enforces the same L4 AuthorizationPolicy shape.
+    set:
+      istio.enabled: true
+      istio.mode: ambient
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: AuthorizationPolicy
+
+  # --- Resource identity ---
+  - it: should have correct name
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    asserts:
+      - equal:
+          path: metadata.name
+          value: RELEASE-NAME-valkey-cluster-bus
+
+  - it: should use fullnameOverride in name
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      fullnameOverride: "my-valkey"
+    asserts:
+      - equal:
+          path: metadata.name
+          value: my-valkey-cluster-bus
+
+  - it: should include chart labels
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    asserts:
+      - isNotNull:
+          path: metadata.labels["helm.sh/chart"]
+      - isNotNull:
+          path: metadata.labels["app.kubernetes.io/name"]
+      - isNotNull:
+          path: metadata.labels["app.kubernetes.io/managed-by"]
+
+  - it: should include commonLabels
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      commonLabels:
+        env: production
+    asserts:
+      - equal:
+          path: metadata.labels.env
+          value: production
+
+  - it: should include custom labels on the policy
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      istio.authorizationPolicy.labels:
+        security.example.com/reviewed: "true"
+    asserts:
+      - equal:
+          path: metadata.labels["security.example.com/reviewed"]
+          value: "true"
+
+  - it: should include custom annotations on the policy
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      istio.authorizationPolicy.annotations:
+        security.example.com/reviewed: "yes"
+    asserts:
+      - equal:
+          path: metadata.annotations["security.example.com/reviewed"]
+          value: "yes"
+
+  - it: should not have annotations when none are set
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    asserts:
+      - notExists:
+          path: metadata.annotations
+
+  # --- Selector and action ---
+  - it: should target Valkey pods via selector labels
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    asserts:
+      - equal:
+          path: spec.selector.matchLabels["app.kubernetes.io/name"]
+          value: valkey
+      - equal:
+          path: spec.selector.matchLabels["app.kubernetes.io/instance"]
+          value: RELEASE-NAME
+
+  - it: should be an ALLOW policy
+    # An ALLOW AuthorizationPolicy attached to these pods triggers Istio's
+    # implicit default-deny: anything not matched by a rule is blocked.
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    asserts:
+      - equal:
+          path: spec.action
+          value: ALLOW
+
+  # --- Bus-port principal rule ---
+  - it: bus-port rule must be scoped to the release's SPIFFE principal
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      cluster.busPort: 16379
+    asserts:
+      - contains:
+          path: spec.rules
+          content:
+            from:
+              - source:
+                  principals:
+                    - "cluster.local/ns/NAMESPACE/sa/RELEASE-NAME-valkey"
+            to:
+              - operation:
+                  ports:
+                    - "16379"
+
+  - it: bus-port rule should follow custom busPort
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      cluster.busPort: 26379
+    asserts:
+      - contains:
+          path: spec.rules
+          content:
+            from:
+              - source:
+                  principals:
+                    - "cluster.local/ns/NAMESPACE/sa/RELEASE-NAME-valkey"
+            to:
+              - operation:
+                  ports:
+                    - "26379"
+
+  - it: bus-port rule should follow custom serviceAccount name
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      serviceAccount.name: "my-sa"
+    asserts:
+      - contains:
+          path: spec.rules
+          content:
+            from:
+              - source:
+                  principals:
+                    - "cluster.local/ns/NAMESPACE/sa/my-sa"
+            to:
+              - operation:
+                  ports:
+                    - "16379"
+
+  # --- Client / metrics open rules ---
+  - it: client port should be open (no principal restriction)
+    # ACL/TLS live above the mesh; locking the client port to the release's
+    # own principal would lock out every legitimate caller.
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    asserts:
+      - contains:
+          path: spec.rules
+          content:
+            to:
+              - operation:
+                  ports:
+                    - "6379"
+
+  - it: metrics port should be in the open rule when metrics enabled
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      metrics.enabled: true
+      metrics.exporter.port: 9121
+    asserts:
+      - contains:
+          path: spec.rules
+          content:
+            to:
+              - operation:
+                  ports:
+                    - "6379"
+                    - "9121"
+
+  - it: metrics port should NOT appear when metrics disabled
+    # A lone notContains passes vacuously if the rule shape ever drifts, so
+    # also pin the open rule to exactly the client port.
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      metrics.enabled: false
+    asserts:
+      - notContains:
+          path: spec.rules
+          content: {to: [{operation: {ports: ["6379", "9121"]}}]}
+      - contains:
+          path: spec.rules
+          content: {to: [{operation: {ports: ["6379"]}}]}
+
+  # Regression: the "open" rule (second element in spec.rules) must have
+  # NO `from` clause. If a principal filter is ever added there, the
+  # metrics port becomes same-release only, which silently breaks every
+  # Prometheus scrape from a different namespace (the shipped
+  # ServiceMonitor path). Keep this rule unconditional.
+  - it: the open port rule must have no principal restriction
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      metrics.enabled: true
+    asserts:
+      - notExists:
+          path: spec.rules[1].from
+
+  # --- Invalid mode ---
+  - it: should reject invalid istio.mode at the schema layer
+    set:
+      istio.enabled: true
+      istio.mode: "not-a-real-mode"
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    asserts:
+      - failedTemplate:
+          errorMessage: "values don't meet the specifications of the schema(s) in the following chart(s):\nvalkey:\n- at '/istio/mode': value must be one of 'sidecar', 'ambient'\n"
+
+  # --- Custom trust domain ---
+  # A cluster federated via multi-cluster mesh (or any install that
+  # overrides istiod's default) publishes identities under a non-default
+  # trust domain. The AuthorizationPolicy principal must use that same
+  # trust domain; otherwise even same-release peers fail the ALLOW match
+  # and the bus-port rule locks the release out of its own cluster bus.
+  - it: principal should honour istio.trustDomain override
+    set:
+      istio.enabled: true
+      istio.mode: ambient
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      istio.trustDomain: "my.mesh.example.com"
+    asserts:
+      - contains:
+          path: spec.rules
+          content:
+            from:
+              - source:
+                  principals:
+                    - "my.mesh.example.com/ns/NAMESPACE/sa/RELEASE-NAME-valkey"
+            to:
+              - operation:
+                  ports:
+                    - "16379"
+
+  - it: principal should default to cluster.local when trustDomain unset
+    # istio.trustDomain is deliberately left out of `set`.
+    set:
+      cluster.persistence.size: "5Gi"
+      cluster.enabled: true
+      istio.mode: ambient
+      istio.enabled: true
+    asserts:
+      - contains:
+          path: spec.rules
+          content:
+            to:
+              - operation:
+                  ports: ["16379"]
+            from:
+              - source:
+                  principals:
+                    - "cluster.local/ns/NAMESPACE/sa/RELEASE-NAME-valkey"
+
+  # --- Cross-release isolation footguns ---
+  # These two validators exist to make the feature's SECURITY GUARANTEE
+  # hold: ambient cross-release isolation relies on the SPIFFE principal
+  # being UNIQUE per release AND on SOMETHING enforcing it. Without the
+  # guards you can silently ship a chart install with zero bus-port
+  # protection.
+
+  - it: should refuse ambient+cluster when shared 'default' SA would be used
+    # serviceAccount.create=false with no explicit name collapses every
+    # release's AP principal to sa/default, so a cross-release MEET passes
+    # the identity check and the clusters silently merge (reproduced live
+    # during review). Rendering MUST fail at install time.
+    set:
+      istio.enabled: true
+      istio.mode: ambient
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      serviceAccount.create: false
+      serviceAccount.name: ""
+    asserts:
+      - failedTemplate:
+          errorPattern: "serviceAccount.create=false AND serviceAccount.name empty.*"
+
+  - it: should accept ambient+cluster with explicit (distinct) serviceAccount.name
+    # Opt-in for the advanced multi-release-shared-SA case — we can't tell
+    # whether the user picked a DIFFERENT name from a hypothetical other
+    # release, but at least the name is intentional.
+    set:
+      istio.enabled: true
+      istio.mode: ambient
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      serviceAccount.create: false
+      serviceAccount.name: "my-valkey-sa"
+    asserts:
+      - hasDocuments:
+          count: 1
+      - contains:
+          path: spec.rules
+          content:
+            from:
+              - source:
+                  principals:
+                    - "cluster.local/ns/NAMESPACE/sa/my-valkey-sa"
+            to:
+              - operation:
+                  ports:
+                    - "16379"
+
+  # Note: the "refuse ambient+cluster when AuthorizationPolicy AND
+  # NetworkPolicy are both off" assertion lives in istio_test.yaml — that
+  # suite renders the PeerAuthentication template, which is what carries
+  # the validator (the AP template correctly renders NOTHING when the AP
+  # is disabled, so it's the wrong place to prove the guard fires).
+
+  - it: should allow authorizationPolicy.enabled=false in sidecar mode (NetworkPolicy still guards)
+    set:
+      istio.enabled: true
+      istio.mode: sidecar
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      istio.authorizationPolicy.enabled: false
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should allow authorizationPolicy.enabled=false when istio is off
+    set:
+      istio.enabled: false
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      istio.authorizationPolicy.enabled: false
+    asserts:
+      - hasDocuments:
+          count: 0
diff --git a/valkey/tests/istio_test.yaml b/valkey/tests/istio_test.yaml
new file mode 100644
index 00000000..8c821bd2
--- /dev/null
+++ b/valkey/tests/istio_test.yaml
@@ -0,0 +1,640 @@
+suite: istio service mesh integration
+templates:
+  - templates/istio-peer-authentication.yaml
+  - templates/istio-destination-rule.yaml
+tests:
+  # --- Feature flag tests ---
+  - it: should not create PeerAuthentication when istio is disabled
+    set:
+      istio.enabled: false
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should not create DestinationRule when istio is disabled
+    set:
+      istio.enabled: false
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should create PeerAuthentication when istio is enabled
+    set:
+      istio.enabled: true
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: PeerAuthentication
+      - isAPIVersion:
+          of: security.istio.io/v1
+
+  - it: should create DestinationRule when istio is enabled
+    set:
+      istio.enabled: true
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: DestinationRule
+      - isAPIVersion:
+          of: networking.istio.io/v1
+
+  # --- PeerAuthentication tests ---
+  - it: PeerAuthentication should target Valkey pods via selector labels
+    template: templates/istio-peer-authentication.yaml
+    set:
+      istio.enabled: true
+    asserts:
+      - equal:
+          value: RELEASE-NAME
+          path: spec.selector.matchLabels["app.kubernetes.io/instance"]
+      - equal:
+          value: valkey
+          path: spec.selector.matchLabels["app.kubernetes.io/name"]
+
+  - it: PeerAuthentication should default to STRICT mTLS mode
+    set:
+      istio.enabled: true
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - equal:
+          path: spec.mtls.mode
+          value: STRICT
+
+  - it: PeerAuthentication should allow overriding mTLS mode to PERMISSIVE
+    set:
+      istio.enabled: true
+      istio.peerAuthentication.mode: PERMISSIVE
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - equal:
+          path: spec.mtls.mode
+          value: PERMISSIVE
+
+  - it: PeerAuthentication should allow overriding mTLS mode to DISABLE
+    set:
+      istio.enabled: true
+      istio.peerAuthentication.mode: DISABLE
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - equal:
+          path: spec.mtls.mode
+          value: DISABLE
+
+  - it: PeerAuthentication should allow overriding mTLS mode to UNSET
+    set:
+      istio.enabled: true
+      istio.peerAuthentication.mode: UNSET
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - equal:
+          path: spec.mtls.mode
+          value: UNSET
+
+  - it: PeerAuthentication should have correct name
+    set:
+      istio.enabled: true
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - equal:
+          path: metadata.name
+          value: RELEASE-NAME-valkey
+
+  - it: PeerAuthentication should include chart labels
+    set:
+      istio.enabled: true
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - isNotNull:
+          path: metadata.labels["helm.sh/chart"]
+      - isNotNull:
+          path: metadata.labels["app.kubernetes.io/name"]
+      - isNotNull:
+          path: metadata.labels["app.kubernetes.io/managed-by"]
+
+  - it: PeerAuthentication should include custom labels
+    set:
+      istio.enabled: true
+      istio.peerAuthentication.labels:
+        security.example.com/policy: strict
+        team: platform
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - equal:
+          path: metadata.labels["security.example.com/policy"]
+          value: strict
+      - equal:
+          path: metadata.labels["team"]
+          value: platform
+
+  - it: PeerAuthentication should include custom annotations
+    set:
+      istio.enabled: true
+      istio.peerAuthentication.annotations:
+        security.example.com/reviewed: "true"
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - equal:
+          path: metadata.annotations["security.example.com/reviewed"]
+          value: "true"
+
+  - it: PeerAuthentication should not have annotations when none are set
+    set:
+      istio.enabled: true
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - notExists:
+          path: metadata.annotations
+
+  - it: PeerAuthentication should include common labels when set
+    set:
+      istio.enabled: true
+      commonLabels:
+        env: production
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - equal:
+          path: metadata.labels.env
+          value: production
+
+  # --- DestinationRule tests (main service) ---
+  - it: DestinationRule should target the main service host in standalone mode
+    set:
+      istio.enabled: true
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - equal:
+          path: spec.host
+          value: RELEASE-NAME-valkey.NAMESPACE.svc.cluster.local
+
+  - it: DestinationRule should default to ISTIO_MUTUAL TLS mode
+    set:
+      istio.enabled: true
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - equal:
+          path: spec.trafficPolicy.tls.mode
+          value: ISTIO_MUTUAL
+
+  - it: DestinationRule should allow overriding TLS mode
+    set:
+      istio.enabled: true
+      istio.destinationRule.mode: MUTUAL
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - equal:
+          path: spec.trafficPolicy.tls.mode
+          value: MUTUAL
+
+  - it: DestinationRule should allow SIMPLE TLS mode
+    set:
+      istio.enabled: true
+      istio.destinationRule.mode: SIMPLE
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - equal:
+          path: spec.trafficPolicy.tls.mode
+          value: SIMPLE
+
+  - it: DestinationRule should allow DISABLE TLS mode
+    set:
+      istio.enabled: true
+      istio.destinationRule.mode: DISABLE
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - equal:
+          path: spec.trafficPolicy.tls.mode
+          value: DISABLE
+
+  - it: DestinationRule should have correct name
+    set:
+      istio.enabled: true
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - equal:
+          path: metadata.name
+          value: RELEASE-NAME-valkey
+
+  - it: DestinationRule should include chart labels
+    set:
+      istio.enabled: true
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - isNotNull:
+          path: metadata.labels["helm.sh/chart"]
+      - isNotNull:
+          path: metadata.labels["app.kubernetes.io/name"]
+
+  - it: DestinationRule should include custom labels
+    set:
+      istio.enabled: true
+      istio.destinationRule.labels:
+        networking.example.com/managed: "true"
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - equal:
+          path: metadata.labels["networking.example.com/managed"]
+          value: "true"
+
+  - it: DestinationRule should include custom annotations
+    set:
+      istio.enabled: true
+      istio.destinationRule.annotations:
+        networking.example.com/reviewed: "true"
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - equal:
+          path: metadata.annotations["networking.example.com/reviewed"]
+          value: "true"
+
+  - it: DestinationRule should not have annotations when none are set
+    set:
+      istio.enabled: true
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - notExists:
+          path: metadata.annotations
+
+  - it: DestinationRule should use custom cluster domain
+    set:
+      istio.enabled: true
+      clusterDomain: my.custom.domain
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - equal:
+          path: spec.host
+          value: RELEASE-NAME-valkey.NAMESPACE.svc.my.custom.domain
+
+  # --- DestinationRule headless service tests (cluster mode) ---
+  - it: should create headless DestinationRule when cluster mode is enabled
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - hasDocuments:
+          count: 2
+
+  - it: headless DestinationRule should target the headless service host
+    template: templates/istio-destination-rule.yaml
+    documentIndex: 1  # second rendered document, i.e. the headless rule
+    set:
+      cluster.persistence.size: "5Gi"
+      cluster.enabled: true
+      istio.enabled: true
+    asserts:
+      - isKind:
+          of: DestinationRule
+      - equal:
+          path: spec.host
+          value: RELEASE-NAME-valkey-headless.NAMESPACE.svc.cluster.local
+      - equal:
+          path: metadata.name
+          value: RELEASE-NAME-valkey-headless
+
+  - it: headless DestinationRule should use same TLS mode as main
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    template: templates/istio-destination-rule.yaml
+    documentIndex: 1
+    asserts:
+      - equal:
+          path: spec.trafficPolicy.tls.mode
+          value: ISTIO_MUTUAL
+
+  - it: headless DestinationRule should respect overridden TLS mode
+    set:
+      istio.enabled: true
+      istio.destinationRule.mode: MUTUAL
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    template: templates/istio-destination-rule.yaml
+    documentIndex: 1
+    asserts:
+      - equal:
+          path: spec.trafficPolicy.tls.mode
+          value: MUTUAL
+
+  - it: headless DestinationRule should include custom labels
+    set:
+      istio.enabled: true
+      istio.destinationRule.labels:
+        networking.example.com/managed: "true"
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    template: templates/istio-destination-rule.yaml
+    documentIndex: 1
+    asserts:
+      - equal:
+          path: metadata.labels["networking.example.com/managed"]
+          value: "true"
+
+  - it: headless DestinationRule should include custom annotations
+    set:
+      istio.enabled: true
+      istio.destinationRule.annotations:
+        networking.example.com/reviewed: "true"
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    template: templates/istio-destination-rule.yaml
+    documentIndex: 1
+    asserts:
+      - equal:
+          path: metadata.annotations["networking.example.com/reviewed"]
+          value: "true"
+
+  - it: headless DestinationRule should include chart labels
+    set:
+      istio.enabled: true
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    template: templates/istio-destination-rule.yaml
+    documentIndex: 1
+    asserts:
+      - isNotNull:
+          path: metadata.labels["helm.sh/chart"]
+      - isNotNull:
+          path: metadata.labels["app.kubernetes.io/name"]
+
+  # --- DestinationRule headless service tests (replica mode) ---
+  - it: should create headless DestinationRule when replica mode is enabled
+    set:
+      istio.enabled: true
+      replica.enabled: true
+      replica.persistence.size: "5Gi"
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - hasDocuments:
+          count: 2
+
+  - it: headless DestinationRule should target headless service in replica mode
+    set:
+      istio.enabled: true
+      replica.enabled: true
+      replica.persistence.size: "5Gi"
+    template: templates/istio-destination-rule.yaml
+    documentIndex: 1
+    asserts:
+      - isKind:
+          of: DestinationRule
+      - equal:
+          path: metadata.name
+          value: RELEASE-NAME-valkey-headless
+      - equal:
+          path: spec.host
+          value: RELEASE-NAME-valkey-headless.NAMESPACE.svc.cluster.local
+
+  # --- Standalone mode tests ---
+  - it: should only create main DestinationRule in standalone mode (no headless)
+    template: templates/istio-destination-rule.yaml
+    set:
+      replica.enabled: false
+      cluster.enabled: false
+      istio.enabled: true
+    asserts:
+      - equal:
+          value: RELEASE-NAME-valkey
+          path: metadata.name
+      - hasDocuments:
+          count: 1
+
+  # --- Name override tests ---
+  - it: should use fullnameOverride in PeerAuthentication
+    set:
+      istio.enabled: true
+      fullnameOverride: "my-valkey"
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - equal:
+          path: metadata.name
+          value: my-valkey
+
+  - it: should use fullnameOverride in DestinationRule
+    set:
+      istio.enabled: true
+      fullnameOverride: "my-valkey"
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - equal:
+          path: metadata.name
+          value: my-valkey
+      - equal:
+          path: spec.host
+          value: my-valkey.NAMESPACE.svc.cluster.local
+
+  - it: should use fullnameOverride in headless DestinationRule
+    set:
+      istio.enabled: true
+      fullnameOverride: "my-valkey"
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    template: templates/istio-destination-rule.yaml
+    documentIndex: 1
+    asserts:
+      - equal:
+          path: metadata.name
+          value: my-valkey-headless
+      - equal:
+          path: spec.host
+          value: my-valkey-headless.NAMESPACE.svc.cluster.local
+
+  # --- Mode validation ---
+  # The schema catches typos in istio.mode at install time (before any
+  # template renders). This keeps errors fast and mode-neutral, unlike the
+  # old regime where only templates that happened to render in the chosen
+  # mode would fail — a bogus mode with istio.enabled=false would silently
+  # sail through.
+  - it: should reject invalid istio.mode at the schema layer
+    set:
+      istio.enabled: true
+      istio.mode: waypoint   # plausible-looking but unsupported; only sidecar|ambient pass the schema
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - failedTemplate:
+          errorMessage: "values don't meet the specifications of the schema(s) in the following chart(s):\nvalkey:\n- at '/istio/mode': value must be one of 'sidecar', 'ambient'\n"
+
+  - it: should accept istio.mode=sidecar
+    set:
+      istio.enabled: true
+      istio.mode: sidecar
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - hasDocuments:
+          count: 1
+
+  - it: should accept istio.mode=ambient
+    set:
+      istio.enabled: true
+      istio.mode: ambient
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - hasDocuments:
+          count: 1
+
+  # --- Ambient mode: DestinationRule must NOT render ---
+  # ztunnel already wraps pod-to-pod hops in HBONE mTLS; a DR on top would
+  # layer Envoy mTLS inside ztunnel mTLS (double crypto) and would need a
+  # waypoint proxy to take effect at all. Keep it off.
+  - it: should not render DestinationRule in ambient mode (standalone)
+    set:
+      istio.enabled: true
+      istio.mode: ambient
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should not render DestinationRule in ambient mode (cluster)
+    set:
+      istio.enabled: true
+      istio.mode: ambient
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should not render DestinationRule in ambient mode (replica)
+    set:
+      istio.enabled: true
+      istio.mode: ambient
+      replica.enabled: true
+      replica.persistence.size: "5Gi"
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should render DestinationRule in sidecar mode (default)
+    set:
+      istio.enabled: true
+    template: templates/istio-destination-rule.yaml
+    asserts:
+      - hasDocuments:
+          count: 1
+
+  # --- Ambient mode: PeerAuthentication still applies ---
+  # Enforced by ztunnel instead of Envoy, but the CRD shape is identical.
+  - it: should render PeerAuthentication in ambient mode
+    set:
+      istio.enabled: true
+      istio.mode: ambient
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - hasDocuments:
+          count: 1
+      - equal:
+          path: spec.mtls.mode
+          value: STRICT
+
+  - it: PeerAuthentication should honour mTLS mode override in ambient mode
+    set:
+      istio.enabled: true
+      istio.mode: ambient
+      istio.peerAuthentication.mode: PERMISSIVE  # non-default (default is STRICT)
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - equal:
+          path: spec.mtls.mode
+          value: PERMISSIVE
+
+  # --- Cross-release isolation guards ---
+  # These assert the chart refuses to silently ship an unprotected cluster
+  # bus. The validator is called from the PeerAuthentication template so it
+  # fires regardless of whether the AuthorizationPolicy itself renders.
+
+  - it: should refuse ambient+cluster when AuthorizationPolicy is off
+    # Ambient mode skips the bus-port NetworkPolicy (it would drop HBONE);
+    # disabling the AP on top leaves the port completely unprotected across
+    # releases. The chart must fail closed.
+    set:
+      istio.enabled: true
+      istio.mode: ambient
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      istio.authorizationPolicy.enabled: false
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - failedTemplate:
+          errorPattern: "istio.authorizationPolicy.enabled=false in ambient mode.*cluster-bus port unprotected.*"
+
+  - it: should refuse ambient+cluster when serviceAccount collapses to default
+    # serviceAccount.create=false AND serviceAccount.name="" produces the
+    # shared `default` SA. Two such releases in the same namespace generate
+    # identical AP principals; cross-release MEET succeeds. Repro'd live.
+    set:
+      istio.enabled: true
+      istio.mode: ambient
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      serviceAccount.create: false
+      serviceAccount.name: ""
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - failedTemplate:
+          errorPattern: "serviceAccount.create=false AND serviceAccount.name empty.*"
+
+  - it: should accept ambient+cluster with serviceAccount.create=true (default)
+    # Per-release SA (default): distinct SPIFFE principal per release,
+    # AP correctly isolates.
+    set:
+      istio.enabled: true
+      istio.mode: ambient
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      serviceAccount.create: true
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - hasDocuments:
+          count: 1
+
+  - it: should accept ambient+cluster with explicit serviceAccount.name override
+    # User takes responsibility for distinct naming.
+    set:
+      istio.enabled: true
+      istio.mode: ambient
+      cluster.enabled: true
+      cluster.persistence.size: "5Gi"
+      serviceAccount.create: false
+      serviceAccount.name: "my-valkey-sa"
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - hasDocuments:
+          count: 1
+
+  # --- Custom trust domain ---
+  - it: PeerAuthentication unaffected by custom trustDomain
+    # PA doesn't reference principals, so trustDomain is a no-op here. Just
+    # prove nothing breaks when the value is set.
+    set:
+      istio.enabled: true
+      istio.mode: ambient
+      istio.trustDomain: "my.mesh.example.com"
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - hasDocuments:
+          count: 1
+      - equal:
+          path: spec.mtls.mode
+          value: STRICT
+
+  # --- Typo defence ---
+  # Schema check fires even when istio.enabled=false so typos surface at
+  # GitOps-commit time, not after someone flips the toggle in production.
+  - it: should reject typo'd istio.mode even with istio.enabled=false
+    set:
+      istio.enabled: false
+      istio.mode: ambiet
+    template: templates/istio-peer-authentication.yaml
+    asserts:
+      - failedTemplate:
+          errorMessage: "values don't meet the specifications of the schema(s) in the following chart(s):\nvalkey:\n- at '/istio/mode': value must be one of 'sidecar', 'ambient'\n"
diff --git a/valkey/tests/netpolicy_test.yaml b/valkey/tests/netpolicy_test.yaml
new file mode 100644
index 00000000..43081f23
--- /dev/null
+++ b/valkey/tests/netpolicy_test.yaml
@@ -0,0 +1,49 @@
+suite: network policy configuration
+templates:
+  - templates/netpolicy.yaml
+tests:
+  - it: should render nothing when networkPolicy is empty
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  # Default-deny ingress is expressed as an empty list; the chart must keep
+  # that distinct from "never declared" — otherwise the opt-in is lost.
+  - it: should render a default-deny ingress policy when ingress is an empty list
+    # Only `ingress` is declared, and it is declared empty.
+    set:
+      networkPolicy: {ingress: []}
+    asserts:
+      - isKind:
+          of: NetworkPolicy
+      # Egress was never declared, so it must be absent entirely.
+      - notExists:
+          path: spec.egress
+      - equal:
+          path: spec.ingress
+          value: []
+      - equal:
+          path: spec.policyTypes
+          value: [Ingress]
+
+  - it: should render both ingress and egress when both declared
+    # A single rule with an empty podSelector per direction is enough to
+    # make both sections non-empty.
+    set:
+      networkPolicy:
+        egress:
+          - to: [{podSelector: {}}]
+        ingress:
+          - from: [{podSelector: {}}]
+    asserts:
+      - isKind:
+          of: NetworkPolicy
+      - isNotEmpty:
+          path: spec.egress
+      - isNotEmpty:
+          path: spec.ingress
+      - equal:
+          path: spec.policyTypes
+          value:
+            - Ingress
+            - Egress
diff --git a/valkey/tests/poddisruptionbudget_test.yaml b/valkey/tests/poddisruptionbudget_test.yaml
index dd7c6079..6999ea22 100644
--- a/valkey/tests/poddisruptionbudget_test.yaml
+++ b/valkey/tests/poddisruptionbudget_test.yaml
@@ -11,14 +11,31 @@ tests:
       - hasDocuments:
           count: 0
 
-  - it: should not create PDB when replica is disabled
+  - it: should not create PDB when neither replica nor cluster is enabled
     set:
       replica.enabled: false
+      cluster.enabled: false
       podDisruptionBudget.enabled: true
     asserts:
       - hasDocuments:
           count: 0
 
+  # Regression: the PDB was previously gated on replica mode alone, which
+  # left cluster-mode installs silently without one.
+  - it: should create PDB when enabled with cluster mode
+    set:
+      podDisruptionBudget.enabled: true
+      cluster.persistence.size: "5Gi"
+      cluster.enabled: true
+    asserts:
+      - isKind:
+          of: PodDisruptionBudget
+      - equal:
+          value: 1
+          path: spec.maxUnavailable
+      - hasDocuments:
+          count: 1
+
   - it: should create PDB when enabled with replica
     set:
       replica.enabled: true
diff --git a/valkey/tests/pvc_test.yaml b/valkey/tests/pvc_test.yaml
new file mode 100644
index 00000000..003939f8
--- /dev/null
+++ b/valkey/tests/pvc_test.yaml
@@ -0,0 +1,165 @@
+suite: pvc configuration
+templates:
+  - templates/pvc.yaml
+tests:
+  - it: should not create PVC when replica.enabled is true
+    set:
+      dataStorage.enabled: true
+      dataStorage.requestedSize: "8Gi"
+      replica.enabled: true
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should not create PVC when cluster.enabled is true
+    set:
+      dataStorage.enabled: true
+      dataStorage.requestedSize: "8Gi"
+      cluster.enabled: true
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should not create PVC when both replica.enabled and cluster.enabled are true
+    set:
+      dataStorage.enabled: true
+      dataStorage.requestedSize: "8Gi"
+      replica.enabled: true
+      cluster.enabled: true
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should create PVC when both replica.enabled and cluster.enabled are false and conditions are met
+    set:
+      replica.enabled: false
+      cluster.enabled: false
+      dataStorage.enabled: true
+      dataStorage.requestedSize: "8Gi"
+      dataStorage.persistentVolumeClaimName: ""
+    asserts:
+      - hasDocuments:
+          count: 1
+      - isKind:
+          of: PersistentVolumeClaim
+
+  - it: should not create PVC when dataStorage.enabled is false
+    set:
+      dataStorage.enabled: false
+      replica.enabled: false
+      cluster.enabled: false
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should not create PVC when dataStorage.requestedSize is empty
+    set:
+      dataStorage.enabled: true
+      dataStorage.requestedSize: ""
+      replica.enabled: false
+      cluster.enabled: false
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should not create PVC when dataStorage.persistentVolumeClaimName is set
+    set:
+      dataStorage.enabled: true
+      dataStorage.requestedSize: "8Gi"
+      dataStorage.persistentVolumeClaimName: "existing-pvc"
+      replica.enabled: false
+      cluster.enabled: false
+    asserts:
+      - hasDocuments:
+          count: 0
+
+  - it: should have correct storage size
+    set:
+      dataStorage.enabled: true
+      dataStorage.requestedSize: "16Gi"
+      replica.enabled: false
+      cluster.enabled: false
+    asserts:
+      - isKind:
+          of: PersistentVolumeClaim
+      - equal:
+          path: spec.resources.requests.storage
+          value: "16Gi"
+
+  - it: should have keepPvc annotation when enabled
+    set:
+      dataStorage.enabled: true
+      dataStorage.requestedSize: "8Gi"
+      dataStorage.keepPvc: true
+      replica.enabled: false
+      cluster.enabled: false
+    asserts:
+      - isKind:
+          of: PersistentVolumeClaim
+      - equal:
+          path: metadata.annotations["helm.sh/resource-policy"]
+          value: keep
+
+  - it: should have custom storage class when specified
+    set:
+      dataStorage.enabled: true
+      dataStorage.requestedSize: "8Gi"
+      dataStorage.className: "fast-ssd"
+      replica.enabled: false
+      cluster.enabled: false
+    asserts:
+      - isKind:
+          of: PersistentVolumeClaim
+      - equal:
+          path: spec.storageClassName
+          value: fast-ssd
+
+  - it: should have custom labels when specified
+    set:
+      dataStorage.enabled: true
+      dataStorage.requestedSize: "8Gi"
+      dataStorage.labels:
+        custom.label: "value"
+        another.label: "test"
+      replica.enabled: false
+      cluster.enabled: false
+    asserts:
+      - isKind:
+          of: PersistentVolumeClaim
+      - equal:
+          path: metadata.labels["custom.label"]
+          value: value
+      - equal:
+          path: metadata.labels["another.label"]
+          value: test
+
+  - it: should have custom annotations when specified
+    set:
+      dataStorage.enabled: true
+      dataStorage.requestedSize: "8Gi"
+      dataStorage.annotations:
+        custom.annotation: "value"
+      replica.enabled: false
+      cluster.enabled: false
+    asserts:
+      - isKind:
+          of: PersistentVolumeClaim
+      - equal:
+          path: metadata.annotations["custom.annotation"]
+          value: value
+
+  - it: should have correct access modes
+    set:
+      dataStorage.enabled: true
+      dataStorage.requestedSize: "8Gi"
+      dataStorage.accessModes:
+        - ReadWriteOnce
+      replica.enabled: false
+      cluster.enabled: false
+    asserts:
+      - isKind:
+          of: PersistentVolumeClaim
+      - equal:
+          path: spec.accessModes
+          value:
+            - ReadWriteOnce
diff --git a/valkey/tests/service_test.yaml b/valkey/tests/service_test.yaml
index 115c137a..364fa083 100644
--- a/valkey/tests/service_test.yaml
+++ b/valkey/tests/service_test.yaml
@@ -86,3 +86,67 @@ tests:
           content:
             app.kubernetes.io/instance: RELEASE-NAME
             app.kubernetes.io/name: valkey
+  - it: should pin to pod-0 when replica.enabled is true
+    set:
+      replica.enabled: true
+    template: templates/service.yaml
+    asserts:
+      - isKind:
+          of: Service
+      - equal:
+          path: spec.selector["statefulset.kubernetes.io/pod-name"]
+          value: RELEASE-NAME-valkey-0
+  - it: should not pin to pod-0 when cluster.enabled is true
+    set:
+      cluster.enabled: true
+    template: templates/service.yaml
+    asserts:
+      - isKind:
+          of: Service
+      - notExists:
+          path: spec.selector["statefulset.kubernetes.io/pod-name"]
+  - it: should not pin to pod-0 when both replica.enabled and cluster.enabled are false
+    set:
+      replica.enabled: false
+      cluster.enabled: false
+    template: templates/service.yaml
+    asserts:
+      - isKind:
+          of: Service
+      - notExists:
+          path: spec.selector["statefulset.kubernetes.io/pod-name"]
+  - it: should never expose the cluster bus port on the frontend service
+    # The bus port carries gossip + failover traffic between nodes; it's
+    # reached via the headless service, not the round-robin frontend.
+    set:
+      cluster.enabled: true
+      cluster.busPort: 16379
+    template: templates/service.yaml
+    asserts:
+      - isKind:
+          of: Service
+      - notContains:
+          path: spec.ports
+          content: {name: tcp-bus}
+          any: true  # partial match: a whole-entry comparison would never fire
+      - notContains:
+          path: spec.ports
+          content: {port: 16379}
+          any: true  # partial match: catches the port whatever its other fields
+
+  # Regression: loadBalancerSourceRanges used to render via Go's default
+  # slice-to-string pipeline ([a b]), which the API server rejects as invalid
+  # CIDR. The fix is to emit a proper YAML list.
+  - it: should render loadBalancerSourceRanges as a YAML list
+    set:
+      service.type: LoadBalancer
+      service.loadBalancerSourceRanges:
+        - "1.2.3.4/32"
+        - "5.6.7.8/32"
+    template: templates/service.yaml
+    asserts:
+      - equal:
+          path: spec.loadBalancerSourceRanges
+          value:
+            - "1.2.3.4/32"
+            - "5.6.7.8/32"
diff --git a/valkey/tests/statefulset_test.yaml b/valkey/tests/statefulset_test.yaml
index 6deb88ab..05efddde 100644
--- a/valkey/tests/statefulset_test.yaml
+++ b/valkey/tests/statefulset_test.yaml
@@ -371,3 +371,113 @@ tests:
               secretKeyRef:
                 name: my-custom-secret
                 key: my-password-key
+
+  # Regression: probes must accept NOAUTH as proof of liveness.
+  - it: should use a PONG|NOAUTH-tolerant probe on the valkey container
+    set:
+      replica.enabled: true
+      replica.persistence.size: "5Gi"
+      auth.enabled: true
+      auth.aclUsers:
+        default:
+          permissions: "~* &* +@all"
+          password: "p"
+    template: templates/statefulset.yaml
+    asserts:
+      - matchRegex:
+          path: spec.template.spec.containers[0].startupProbe.exec.command[2]
+          pattern: "PONG\\|NOAUTH"
+      - matchRegex:
+          path: spec.template.spec.containers[0].livenessProbe.exec.command[2]
+          pattern: "PONG\\|NOAUTH"
+      - matchRegex:
+          path: spec.template.spec.containers[0].readinessProbe.exec.command[2]
+          pattern: "PONG\\|NOAUTH"
+
+  - it: should define a readiness probe
+    set:
+      replica.enabled: true
+      replica.persistence.size: "5Gi"
+    template: templates/statefulset.yaml
+    asserts:
+      - exists:
+          path: spec.template.spec.containers[0].readinessProbe
+
+  # --- Tri-state LOADING policy (see cluster_test.yaml for rationale) ---
+  - it: replicated startupProbe must reject LOADING
+    set:
+      replica.enabled: true
+      replica.persistence.size: "5Gi"
+    template: templates/statefulset.yaml
+    asserts:
+      - notMatchRegex:
+          path: spec.template.spec.containers[0].startupProbe.exec.command[2]
+          pattern: "LOADING"
+
+  - it: replicated livenessProbe must accept LOADING (full-resync tolerance)
+    set:
+      replica.enabled: true
+      replica.persistence.size: "5Gi"
+    template: templates/statefulset.yaml
+    asserts:
+      - matchRegex:
+          path: spec.template.spec.containers[0].livenessProbe.exec.command[2]
+          pattern: "PONG\\|NOAUTH\\|LOADING"
+
+  - it: replicated readinessProbe must reject LOADING
+    set:
+      replica.enabled: true
+      replica.persistence.size: "5Gi"
+    template: templates/statefulset.yaml
+    asserts:
+      - notMatchRegex:
+          path: spec.template.spec.containers[0].readinessProbe.exec.command[2]
+          pattern: "LOADING"
+
+  - it: replica.startupProbe overrides should land on the startupProbe only
+    set:
+      replica.enabled: true
+      replica.persistence.size: "5Gi"
+      replica.startupProbe.failureThreshold: 240
+    template: templates/statefulset.yaml
+    asserts:
+      - equal:
+          path: spec.template.spec.containers[0].startupProbe.failureThreshold
+          value: 240
+      - equal:
+          path: spec.template.spec.containers[0].livenessProbe.failureThreshold
+          value: 6
+      - equal:
+          path: spec.template.spec.containers[0].readinessProbe.failureThreshold
+          value: 3
+
+  # --- Istio ambient mode ---
+  - it: should add ambient dataplane-mode label when istio.mode=ambient
+    set:
+      replica.enabled: true
+      replica.persistence.size: "5Gi"
+      istio.enabled: true
+      istio.mode: ambient
+    template: templates/statefulset.yaml
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels["istio.io/dataplane-mode"]
+          value: ambient
+
+  # Sidecar mode emits istio.io/dataplane-mode=none (veto ambient capture)
+  # and sidecar.istio.io/inject=true (force injection), so the chart is
+  # self-sufficient on clusters that run both data planes side-by-side.
+  - it: should set sidecar-mode mesh labels in sidecar mode
+    set:
+      replica.enabled: true
+      replica.persistence.size: "5Gi"
+      istio.enabled: true
+      istio.mode: sidecar
+    template: templates/statefulset.yaml
+    asserts:
+      - equal:
+          path: spec.template.metadata.labels["istio.io/dataplane-mode"]
+          value: none
+      - equal:
+          path: spec.template.metadata.labels["sidecar.istio.io/inject"]
+          value: "true"
diff --git a/valkey/values.schema.json b/valkey/values.schema.json
index 5db0f4ac..a02c2884 100644
--- a/valkey/values.schema.json
+++ b/valkey/values.schema.json
@@ -1,5 +1,24 @@
 {
     "$schema": "https://json-schema.org/draft/2020-12/schema",
+    "$defs": {
+        "probeTuning": {
+            "type": "object",
+            "properties": {
+                "periodSeconds": {
+                    "type": "integer",
+                    "minimum": 1
+                },
+                "timeoutSeconds": {
+                    "type": "integer",
+                    "minimum": 1
+                },
+                "failureThreshold": {
+                    "type": "integer",
+                    "minimum": 1
+                }
+            }
+        }
+    },
     "type": "object",
     "properties": {
         "affinity": {
@@ -22,6 +41,103 @@
                 }
             }
         },
+        "cluster": {
+            "type": "object",
+            "properties": {
+                "enabled": {
+                    "type": "boolean"
+                },
+                "shards": {
+                    "type": "integer"
+                },
+                "replicasPerShard": {
+                    "type": "integer"
+                },
+                "replicationUser": {
+                    "type": "string"
+                },
+                "nodeTimeout": {
+                    "type": "integer"
+                },
+                "requireFullCoverage": {
+                    "type": "boolean"
+                },
+                "allowReadsWhenDown": {
+                    "type": "boolean"
+                },
+                "persistence": {
+                    "type": "object",
+                    "properties": {
+                        "size": {
+                            "type": "string"
+                        },
+                        "storageClass": {
+                            "type": "string"
+                        },
+                        "accessModes": {
+                            "type": "array",
+                            "items": {
+                                "type": "string"
+                            }
+                        }
+                    }
+                },
+                "busPort": {
+                    "type": "integer"
+                },
+                "isolation": {
+                    "type": "object",
+                    "properties": {
+                        "enabled": {
+                            "type": "boolean"
+                        }
+                    }
+                },
+                "startupProbe": {
+                    "$ref": "#/$defs/probeTuning"
+                },
+                "livenessProbe": {
+                    "$ref": "#/$defs/probeTuning"
+                },
+                "readinessProbe": {
+                    "$ref": "#/$defs/probeTuning"
+                },
+                "preStopFailover": {
+                    "type": "object",
+                    "properties": {
+                        "enabled": {
+                            "type": "boolean"
+                        },
+                        "timeoutSeconds": {
+                            "type": "integer",
+                            "minimum": 1
+                        }
+                    }
+                },
+                "terminationGracePeriodSeconds": {
+                    "type": "integer",
+                    "minimum": 1
+                },
+                "initJob": {
+                    "type": "object",
+                    "properties": {
+                        "podLabels": {
+                            "type": "object"
+                        },
+                        "podAnnotations": {
+                            "type": "object"
+                        },
+                        "ttlSecondsAfterFinished": {
+                            "type": "integer",
+                            "minimum": 0
+                        }
+                    }
+                },
+                "persistentVolumeClaimRetentionPolicy": {
+                    "type": "object"
+                }
+            }
+        },
         "clusterDomain": {
             "type": "string"
         },
@@ -82,9 +198,6 @@
         "extraInitContainers": {
             "type": "array"
         },
-        "extraSecretValkeyConfigs": {
-            "type": "boolean"
-        },
         "extraValkeyConfigs": {
             "type": "array"
         },
@@ -134,6 +247,78 @@
         "initResources": {
             "type": "object"
         },
+        "istio": {
+            "type": "object",
+            "properties": {
+                "enabled": {
+                    "type": "boolean"
+                },
+                "mode": {
+                    "type": "string",
+                    "enum": [
+                        "sidecar",
+                        "ambient"
+                    ]
+                },
+                "trustDomain": {
+                    "type": "string"
+                },
+                "peerAuthentication": {
+                    "type": "object",
+                    "properties": {
+                        "mode": {
+                            "type": "string",
+                            "enum": [
+                                "STRICT",
+                                "PERMISSIVE",
+                                "DISABLE",
+                                "UNSET"
+                            ]
+                        },
+                        "labels": {
+                            "type": "object"
+                        },
+                        "annotations": {
+                            "type": "object"
+                        }
+                    }
+                },
+                "destinationRule": {
+                    "type": "object",
+                    "properties": {
+                        "mode": {
+                            "type": "string",
+                            "enum": [
+                                "DISABLE",
+                                "SIMPLE",
+                                "MUTUAL",
+                                "ISTIO_MUTUAL"
+                            ]
+                        },
+                        "labels": {
+                            "type": "object"
+                        },
+                        "annotations": {
+                            "type": "object"
+                        }
+                    }
+                },
+                "authorizationPolicy": {
+                    "type": "object",
+                    "properties": {
+                        "enabled": {
+                            "type": "boolean"
+                        },
+                        "labels": {
+                            "type": "object"
+                        },
+                        "annotations": {
+                            "type": "object"
+                        }
+                    }
+                }
+            }
+        },
         "metrics": {
             "type": "object",
             "properties": {
@@ -361,6 +546,15 @@
         "priorityClassName": {
             "type": "string"
         },
+        "startupProbe": {
+            "$ref": "#/$defs/probeTuning"
+        },
+        "livenessProbe": {
+            "$ref": "#/$defs/probeTuning"
+        },
+        "readinessProbe": {
+            "$ref": "#/$defs/probeTuning"
+        },
         "replica": {
             "type": "object",
             "properties": {
@@ -396,6 +590,15 @@
                 "persistentVolumeClaimRetentionPolicy": {
                     "type": "object"
                 },
+                "startupProbe": {
+                    "$ref": "#/$defs/probeTuning"
+                },
+                "livenessProbe": {
+                    "$ref": "#/$defs/probeTuning"
+                },
+                "readinessProbe": {
+                    "$ref": "#/$defs/probeTuning"
+                },
                 "replicas": {
                     "type": "integer"
                 },
diff --git a/valkey/values.yaml b/valkey/values.yaml
index 20e00b62..7517705e 100644
--- a/valkey/values.yaml
+++ b/valkey/values.yaml
@@ -98,6 +98,41 @@ resources: {}
   #   cpu: 100m
   #   memory: 128Mi
 
+# Probe tuning. The probe command is `valkey-cli ping` (with TLS args
+# when tls.enabled). Only the timing knobs are configurable.
+#
+# The three probes have different LOADING policies:
+#
+#   startupProbe — does NOT accept LOADING, so it actually gates
+#     liveness/readiness during initial RDB load. On a slow loader, bump
+#     failureThreshold (or periodSeconds) so failureThreshold *
+#     periodSeconds comfortably exceeds your worst-case RDB load time.
+#     Default 30 * 5s = 150s suits small dev datasets only; a 44 GB
+#     primary needs roughly 240 * 5s = 20 minutes.
+#
+#   livenessProbe — DOES accept LOADING. After startup passes, LOADING
+#     almost always means a full-resync from primary is in progress
+#     (replica fell behind, replication backlog overflowed, etc.).
+#     Killing the pod here loses the in-flight download and forces yet
+#     another full resync, perpetuating the kill-loop. A pod stuck
+#     loading forever is rare and harmless compared to the kill-loop.
+#
+#   readinessProbe — does NOT accept LOADING. A LOADING pod can't serve
+#     traffic, so it must be removed from the Service's endpoints until
+#     it can. The pod sits 'Running 0/1' during full-resync — as intended.
+startupProbe:
+  periodSeconds: 5
+  timeoutSeconds: 5
+  failureThreshold: 30
+livenessProbe:
+  periodSeconds: 10
+  timeoutSeconds: 5
+  failureThreshold: 6
+readinessProbe:
+  periodSeconds: 5
+  timeoutSeconds: 3
+  failureThreshold: 3
+
 # Resource limits/requests for init containers
 initResources: {}
   # Example:
@@ -156,9 +191,6 @@ extraValkeySecrets: []
 # Mount additional configMaps into the Valkey container
 extraValkeyConfigs: []
 
-# Mount extra secrets as volume to init container (deprecated, use extraValkeySecrets)
-extraSecretValkeyConfigs: false
-
 # Mount additional emptyDir or hostPath volumes (advanced use)
 extraVolumes: []
 #  - name: hostpath-volume
@@ -264,6 +296,172 @@ replica:
   # More info: https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#persistentvolumeclaim-retention
   persistentVolumeClaimRetentionPolicy: {}
 
+  # Probe tuning for replicated (non-cluster) StatefulSet pods. Same
+  # rationale as the top-level startupProbe block — bump failureThreshold
+  # for large RDBs.
+  startupProbe:
+    periodSeconds: 5
+    timeoutSeconds: 5
+    failureThreshold: 30
+  livenessProbe:
+    periodSeconds: 10
+    timeoutSeconds: 5
+    failureThreshold: 6
+  readinessProbe:
+    periodSeconds: 5
+    timeoutSeconds: 3
+    failureThreshold: 3
+
+# Cluster mode configuration for Valkey Cluster (sharded deployment)
+# Note: cluster.enabled and replica.enabled are mutually exclusive
+cluster:
+  # Enable cluster mode (creates a sharded Valkey cluster)
+  enabled: false
+
+  # Number of shards (primary nodes). Minimum recommended is 3 for cluster mode.
+  # Each shard handles a portion of the hash slot range (16384 slots total).
+  shards: 3
+
+  # Number of replicas per shard (for high availability within each shard)
+  # Total nodes = shards × (1 + replicasPerShard)
+  # For example: 3 shards with 1 replica each = 6 nodes total
+  replicasPerShard: 1
+
+  # Username for cluster replication authentication, ignored if auth.enabled is false.
+  # IMPORTANT: When auth.enabled is true, this user MUST be defined in auth.aclUsers.
+  # The user must have appropriate replication permissions: +psync +replconf +ping
+  replicationUser: "default"
+
+  # Cluster node timeout in milliseconds (how long before a node is considered failed)
+  nodeTimeout: 15000
+
+  # Require all hash slots to be covered for the cluster to accept writes
+  # Set to false to allow partial cluster operation
+  requireFullCoverage: true
+
+  # Allow cluster to serve read requests when in down state
+  allowReadsWhenDown: false
+
+  # Persistence configuration (required for cluster mode)
+  persistence:
+    # Size of the PVC for each node (required when cluster.enabled is true)
+    size: ""
+    # Storage class name (empty = use default storage class)
+    storageClass: ""
+    # Access modes for the PVC
+    accessModes:
+      - ReadWriteOnce
+
+  # Bus port for cluster communication (default: service.port + 10000)
+  # This port is used for node-to-node communication in the cluster
+  busPort: 16379
+
+  # Orderly handover on primary pod shutdown. A `kubectl rollout restart`
+  # (or any voluntary eviction) sends SIGTERM to Valkey directly; without a
+  # preStop hook, the primary dies with open client connections and the
+  # cluster takes up to nodeTimeout to promote a replica. During that
+  # window, connection pools fill with dead sockets and the app errors out
+  # on every pooled command.
+  #
+  # With this enabled, each pod's preStop checks whether it is a primary
+  # and (if so) asks one of its own replicas to run `CLUSTER FAILOVER` —
+  # Valkey's canonical graceful-handover command. The primary demotes to
+  # replica in the same pass, so by the time SIGTERM arrives the shard
+  # already has a new primary and existing connections close cleanly.
+  #
+  # No-op when:
+  #   * the pod is already a replica (nothing to fail over);
+  #   * the shard has no replicas (replicasPerShard=0 — nothing to fail
+  #     over TO);
+  #   * the pod has no healthy replica of its own (FAILOVER would target
+  #     nothing).
+  # The hook is strictly best-effort — any error path falls through to the
+  # normal SIGTERM, which is the pre-existing (suboptimal) behaviour.
+  preStopFailover:
+    # Enable the preStop CLUSTER FAILOVER hook.
+    enabled: true
+    # How long to wait for the ex-primary to observe its own demotion
+    # before giving up and letting kubelet send SIGTERM. Keep comfortably
+    # below terminationGracePeriodSeconds — kubelet counts the preStop
+    # time against that total grace period.
+    timeoutSeconds: 40
+
+  # Grace period for pod shutdown. Needs to accommodate the preStop
+  # CLUSTER FAILOVER handshake (default timeoutSeconds: 40) with headroom
+  # for SIGTERM + flush. The K8s default (30s) is too short for an orderly
+  # cluster rollout — a clipped preStop forces the abrupt close this
+  # feature is trying to avoid.
+  terminationGracePeriodSeconds: 60
+
+  # Isolate this Valkey cluster's gossip bus from any other release in the
+  # Kubernetes cluster. Valkey's CLUSTER MEET has no authentication, so
+  # without this, any pod that can open a TCP connection to a node's bus
+  # port can merge its owner's cluster into this one — regardless of
+  # namespace, since pod-to-pod traffic crosses namespaces freely by
+  # default.
+  #
+  # The generated NetworkPolicy restricts the bus port to same-release pods
+  # only; client and metrics ports remain open.
+  #
+  # Requires a NetworkPolicy-enforcing CNI (Calico, Cilium, kindnet ≥ 0.20,
+  # Antrea, etc.). On a non-enforcing CNI (plain Flannel, in-tree kubenet)
+  # the rendered policy is a no-op and the chart cannot provide isolation
+  # on its own — an Istio AuthorizationPolicy or a separate Kubernetes
+  # cluster per Valkey cluster is the only remaining option. Namespaces
+  # alone do NOT provide isolation.
+  isolation:
+    enabled: true
+
+  # Probe tuning for cluster-mode StatefulSet pods. Same rationale as the
+  # top-level startupProbe block — bump failureThreshold for large RDBs.
+  # In cluster mode, replicas catch up via PSYNC + RDB transfer on
+  # (re)connect, so the post-restart load window can be just as long as
+  # for standalone replicated mode.
+  startupProbe:
+    periodSeconds: 5
+    timeoutSeconds: 5
+    failureThreshold: 30
+  livenessProbe:
+    periodSeconds: 10
+    timeoutSeconds: 5
+    failureThreshold: 6
+  readinessProbe:
+    periodSeconds: 5
+    timeoutSeconds: 3
+    failureThreshold: 3
+
+  # PersistentVolumeClaim retention policy for StatefulSet
+  # Controls when PVCs are deleted (requires Kubernetes 1.23+)
+  # More info: https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#persistentvolumeclaim-retention
+  persistentVolumeClaimRetentionPolicy: {}
+
+  # Override surface for the cluster-init Job pod only. Layered on top of
+  # the chart-wide podLabels / podAnnotations (initJob values win on key collision)
+  # and merged with the chart-computed mesh labels, so the same opt-out
+  # mechanics that apply to the data pods are still in force unless you
+  # explicitly veto a key here.
+  #
+  # The intended use case is excluding the short-lived bootstrap Job from
+  # cluster-wide pod admission webhooks — observability sidecars, mesh
+  # proxies pulled in by namespace labels, policy agents, etc. — that are
+  # appropriate for long-running data pods but turn the Job into a
+  # never-completing batch task. Because the Job is a Helm post-install
+  # hook with backoffLimit=6, an injected sidecar that never exits means
+  # `helm install --wait` blocks until timeout and the chart fails to
+  # converge.
+  #
+  # Both maps default empty; with no overrides set the Job pod inherits
+  # the chart-wide values exactly as it did before this surface existed.
+  initJob:
+    podLabels: {}
+    podAnnotations: {}
+    # Auto-cleanup window for the completed Job. Kubernetes deletes the
+    # Job (and its pod) this many seconds after it transitions to
+    # Complete or Failed via the TTL-after-finished controller. Set
+    # short enough to keep `kubectl get pods` clean across upgrades,
+    # long enough to grab `kubectl logs` for post-mortem if needed.
+    ttlSecondsAfterFinished: 300
+
 tls:
   # Enable TLS
   enabled: false
@@ -280,6 +478,101 @@ tls:
   # Require that clients authenticate with a certificate
   requireClientCertificate: false
 
+istio:
+  # Enable Istio
+  enabled: false
+
+  # Data-plane mode. Two very different shapes:
+  #
+  #   sidecar  — Classic Istio. An Envoy sidecar is injected into every pod.
+  #              Requires the namespace/pod to be labelled for injection
+  #              (`istio-injection=enabled` or `sidecar.istio.io/inject=true`).
+  #              The chart adds `traffic.sidecar.istio.io/exclude*Ports` so
+  #              the cluster-bus port bypasses Envoy (gossip needs raw TCP).
+  #
+  #   ambient  — Istio Ambient Mesh. No sidecar; a node-local ztunnel wraps
+  #              pod traffic in HBONE mTLS transparently. The chart adds
+  #              `istio.io/dataplane-mode: ambient` to every workload pod so
+  #              ztunnel captures their traffic (also works when the whole
+  #              namespace is labelled; the pod label is additive and lets
+  #              operators opt in per-release). The DestinationRule that
+  #              wraps outbound connections in ISTIO_MUTUAL is skipped —
+  #              ztunnel already handles mTLS, and a DR would layer a second
+  #              Envoy mTLS on top, doubling crypto overhead.
+  #
+  # Both modes still render the PeerAuthentication (enforced by ztunnel in
+  # ambient, by Envoy in sidecar) and the chart's AuthorizationPolicy that
+  # pins the cluster-bus port to same-release principals.
+  mode: sidecar  # @schema enum:[sidecar,ambient]
+
+  # SPIFFE trust domain this mesh issues identities under. Used by the
+  # AuthorizationPolicy to build the principal string
+  # `<trustDomain>/ns/<ns>/sa/<sa>`. Must match the `trustDomain` value in
+  # the mesh ConfigMap; check it with:
+  #   kubectl -n istio-system get cm istio -o jsonpath='{.data.mesh}' | grep trustDomain
+  # A mismatch means the rule matches nothing, so same-release pods get
+  # default-denied on the bus port too, not just cross-release ones.
+  trustDomain: cluster.local
+
+  # PeerAuthentication controls mTLS enforcement on inbound connections.
+  # Applies to both sidecar and ambient modes.
+  #
+  # A note on Prometheus scraping: STRICT requires every inbound connection
+  # to present a mesh-issued mTLS cert. An out-of-mesh Prometheus (e.g. a
+  # Prometheus running in a namespace without injection, or on a cluster
+  # without Istio) cannot present such a cert, so scraping the metrics
+  # service fails with "Connection reset by peer". Fix either by putting
+  # the scraper in the mesh (ambient capture or sidecar injection), or by
+  # setting this mode to PERMISSIVE so the mesh accepts plaintext too.
+  # The chart's AuthorizationPolicy keeps the metrics port (9121) wide open at
+  # the identity layer regardless — PeerAuthentication alone decides this.
+  peerAuthentication:
+    # mTLS mode for inbound traffic (STRICT, PERMISSIVE, DISABLE, UNSET)
+    # STRICT: Require mTLS on all ports; DISABLE: Accept plaintext only
+    # PERMISSIVE: Accept both plaintext and mTLS; UNSET: Inherit the parent policy's mode
+    mode: STRICT
+    # Additional labels for the PeerAuthentication resource
+    labels: {}
+    # Additional annotations for the PeerAuthentication resource
+    annotations: {}
+
+  # DestinationRule configures mTLS for outbound connections to Valkey services.
+  # Rendered only when istio.mode is "sidecar" — in ambient mode, ztunnel
+  # handles mTLS transparently and a DR would double-encrypt.
+  destinationRule:
+    # TLS mode for outbound traffic (DISABLE, SIMPLE, MUTUAL, ISTIO_MUTUAL)
+    # ISTIO_MUTUAL: Use Istio-managed certificates for mTLS.
+    # NOTE: When tls.enabled is true the Valkey pods already terminate TLS
+    # themselves. Keeping mode=ISTIO_MUTUAL wraps app-level TLS in Envoy mTLS
+    # (double encryption); it still works, but doubles crypto overhead. If you
+    # only want mesh-level mTLS, set the top-level tls.enabled=false and rely
+    # on Istio. If you prefer app-level TLS only, set mode=DISABLE here so
+    # Envoy passes the TLS bytes through untouched.
+    mode: ISTIO_MUTUAL
+    # Additional labels for the DestinationRule resource
+    labels: {}
+    # Additional annotations for the DestinationRule resource
+    annotations: {}
+
+  # AuthorizationPolicy restricts the cluster-bus port to same-release
+  # principals. In sidecar mode, Envoy enforces; in ambient mode, ztunnel
+  # enforces. Unlike the NetworkPolicy (IP-based, requires a policy-enforcing
+  # CNI), this is cryptographic — it requires the caller to hold a SPIFFE
+  # identity matching the Valkey pods' ServiceAccount, so a pod in a different
+  # release cannot forge the check even if it can open a TCP connection.
+  #
+  # Only active in cluster mode (no bus port otherwise).
+  authorizationPolicy:
+    # Enable the chart-owned AuthorizationPolicy.
+    # Defaults to true; only takes effect when istio.enabled is true — the
+    # whole point of wiring Istio in is to get mesh-level enforcement, and this
+    # is the piece that prevents cross-release CLUSTER MEET attacks at L4.
+    enabled: true
+    # Additional labels for the AuthorizationPolicy resource
+    labels: {}
+    # Additional annotations for the AuthorizationPolicy resource
+    annotations: {}
+
 # Node selector for pod assignment
 nodeSelector: {}