diff --git a/roles/cifmw_backup_restore/README.md b/roles/cifmw_backup_restore/README.md index cd4171df1..8340f3e05 100644 --- a/roles/cifmw_backup_restore/README.md +++ b/roles/cifmw_backup_restore/README.md @@ -9,8 +9,9 @@ actions: **backup**, **restore**, and **cleanup**. (via CSI snapshots) and cluster resources. - **restore** — performs an ordered Velero restore sequence (PVCs, foundation, infrastructure, control plane, Galera, optional OVN file restore, - full control plane resume, dataplane, EDPM), then Neutron–OVN verification and - sync (**log** mode, then **repair**, matching the backup-restore user guide Step 12). + full control plane resume, Neutron–OVN sync, dataplane, EDPM), with + Neutron–OVN verification and sync (**log** mode, then **repair**) run before + the EDPM deployment to prevent data plane outage (matching user guide Step 10). - **cleanup** — tears down dataplane and control-plane resources so the namespace is ready for a fresh restore. @@ -27,7 +28,7 @@ OpenShift cluster. * `cifmw_backup_restore_namespace`: (String) Target OpenStack namespace. Defaults to `openstack`. * `cifmw_backup_restore_oadp_namespace`: (String) Namespace where Velero/OADP is running. Defaults to `openshift-adp`. * `cifmw_backup_restore_auto_ack`: (Boolean) Skip interactive pause prompts when `true`. Defaults to `false`. -* `cifmw_backup_restore_ovn_db`: (Boolean) When `true` (default), the **backup** path labels OVN NB/SB PVCs and runs `ovsdb-client` backup before the OADP PVC backup, and the **restore** path runs OVN NB/SB file restore after Galera (when timestamped files exist on the PVC) before resuming the full control plane. Set to `false` to skip both; post-EDPM `neutron-ovn-db-sync` still runs when OVN files were not backed up. 
+* `cifmw_backup_restore_ovn_db`: (Boolean) When `true` (default), the **backup** path labels OVN NB/SB PVCs and runs `ovsdb-client` backup before the OADP PVC backup, and the **restore** path runs OVN NB/SB file restore after Galera (when timestamped files exist on the PVC) before resuming the full control plane. Set to `false` to skip both; `neutron-ovn-db-sync` still runs (before EDPM) when OVN files were not backed up. * `cifmw_backup_restore_ovn_db_ready_timeout`: (String) Timeout for `oc wait` on OVN database pods during OVN backup/restore. Defaults to `5m`. ### Backup @@ -51,7 +52,7 @@ OpenShift cluster. * `cifmw_backup_restore_restore_content`: (String) Content flag passed to `restore_galera` (`--content`). Defaults to `data`. * `cifmw_backup_restore_edpm_deploy_timeout`: (String) Timeout for `oc wait` on the post-restore EDPM deployment. Defaults to `40m`. * `cifmw_backup_restore_pin_pvcs`: (Boolean) Enable PVC-to-node pinning during restore for WaitForFirstConsumer storage classes. Defaults to `false`. -* Post-EDPM **Neutron–OVN** steps follow [user guide Step 12](https://github.com/openstack-k8s-operators/dev-docs/blob/main/backup-restore/user-guide.md#step-12-verify-and-sync-neutron-to-ovn): run `neutron-ovn-db-sync-util` in `log` mode first (`neutron-dist.conf`, `neutron.conf`, `neutron.conf.d`). **Repair** runs if `cifmw_backup_restore_ovn_db` is `false` (no OVN NB/SB file backup was taken), or if log-mode stdout/stderr contains a `WARNING` line—Neutron reports drift that way while still exiting 0. If OVN file backup/restore was enabled and log output has no `WARNING` lines, repair is skipped as redundant. 
+* **Neutron–OVN** sync runs after the control plane is ready but before EDPM deployment, following [user guide Step 10](https://github.com/openstack-k8s-operators/dev-docs/blob/main/backup-restore/user-guide.md#step-10-verify-and-sync-neutron-to-ovn): `neutron-ovn-db-sync-util` runs in `log` mode first (`neutron-dist.conf`, `neutron.conf`, `neutron.conf.d`). **Repair** runs if `cifmw_backup_restore_ovn_db` is `false` (no OVN NB/SB file backup was taken), or if log-mode stdout/stderr contains a `WARNING` line—Neutron reports drift that way while still exiting 0. If OVN file backup/restore was enabled and log output has no `WARNING` lines, repair is skipped as redundant. ### End-to-end orchestration (e2e.yml) diff --git a/roles/cifmw_backup_restore/tasks/restore.yml b/roles/cifmw_backup_restore/tasks/restore.yml index 2eed66b72..d25955aa0 100644 --- a/roles/cifmw_backup_restore/tasks/restore.yml +++ b/roles/cifmw_backup_restore/tasks/restore.yml @@ -18,7 +18,7 @@ # # Restores an OpenStack control plane from OADP backups using ordered # Velero Restore CRs, Galera restore, optional OVN NB/SB file restore, -# then staged ControlPlane resume, dataplane, EDPM, and Neutron–OVN sync. +# then staged ControlPlane resume, Neutron–OVN sync, dataplane, and EDPM. - name: Validate backup_timestamp parameter ansible.builtin.fail: @@ -395,7 +395,69 @@ changed_when: false # ======================================== -# Step 10: Restore DataPlane (Order 60) +# Step 10: Verify and sync Neutron to OVN +# ======================================== +# Run after neutron is ready but before EDPM deployment so that +# ovn-controller on dataplane nodes reconnects to populated OVN DBs, +# preventing data plane outage from stale/empty databases. +# Log mode does not change exit code for drift (Neutron logs drift as WARNING lines). +# Repair runs if OVN file backup was skipped, or if log output contains WARNING drift. 
+# Ref: https://github.com/openstack-k8s-operators/dev-docs/blob/main/backup-restore/user-guide.md#step-10-verify-and-sync-neutron-to-ovn +- name: Verify Neutron vs OVN (neutron-ovn-db-sync-util log mode) + ansible.builtin.shell: | + set -o pipefail + oc exec -n {{ cifmw_backup_restore_namespace }} -c neutron-api deploy/neutron -- \ + neutron-ovn-db-sync-util \ + --config-file /usr/share/neutron/neutron-dist.conf \ + --config-file /etc/neutron/neutron.conf \ + --config-dir /etc/neutron/neutron.conf.d \ + --ovn-neutron_sync_mode=log \ + --debug + register: _neutron_ovn_sync_log + changed_when: false + +- name: Decide whether Neutron–OVN repair sync is required + ansible.builtin.set_fact: + _neutron_ovn_needs_repair: >- # backslashes in the regex are doubled on purpose: Jinja2 unescapes string literals, and a single-backslash b would reach the regex engine as a backspace character instead of a word boundary + {{ + (not (cifmw_backup_restore_ovn_db | bool)) + or ( + (cifmw_backup_restore_ovn_db | bool) + and ( + ((_neutron_ovn_sync_log.stdout | default('')) ~ (_neutron_ovn_sync_log.stderr | default(''))) + | regex_search('(?i)\\bWARNING\\b') is not none + ) + ) + }} + +- name: Report Neutron–OVN repair decision + ansible.builtin.debug: + msg: >- + neutron-ovn-db-sync repair: + {{ 'running' if _neutron_ovn_needs_repair | bool else 'skipped' }}. + {% if not (cifmw_backup_restore_ovn_db | bool) %} + Reason: cifmw_backup_restore_ovn_db is false (no OVN NB/SB file backup; OVN must be repopulated from Neutron). + {% elif _neutron_ovn_needs_repair | bool %} + Reason: log-mode output contained WARNING lines (Neutron-reported drift vs OVN). + {% else %} + Reason: OVN file backup/restore was used and log-mode output had no WARNING lines. 
+ {% endif %} + +- name: Sync Neutron state to OVN database (repair mode) + ansible.builtin.shell: | + set -o pipefail + oc exec -n {{ cifmw_backup_restore_namespace }} -c neutron-api deploy/neutron -- \ + neutron-ovn-db-sync-util \ + --config-file /usr/share/neutron/neutron-dist.conf \ + --config-file /etc/neutron/neutron.conf \ + --config-dir /etc/neutron/neutron.conf.d \ + --ovn-neutron_sync_mode=repair \ + --debug + when: _neutron_ovn_needs_repair | bool + changed_when: true + +# ======================================== +# Step 11: Restore DataPlane (Order 60) # ======================================== - name: Render dataplane restore ansible.builtin.template: @@ -415,10 +477,10 @@ ansible.builtin.include_tasks: wait_for_restore.yml vars: _restore_name: "openstack-restore-60-dataplane-{{ _restore_suffix }}" - _step_name: "Step 10 (DataPlane restore)" + _step_name: "Step 11 (DataPlane restore)" # ======================================== -# Step 11: EDPM Deployment +# Step 12: EDPM Deployment # ======================================== - name: Get DataPlaneNodeSet names ansible.builtin.shell: | @@ -462,66 +524,6 @@ --timeout={{ cifmw_backup_restore_edpm_deploy_timeout }} when: _nodeset_names.stdout != "" -# ======================================== -# Step 12: Verify and sync Neutron to OVN (user-guide backup-restore Step 12) -# ======================================== -# Run after EDPM so compute ovn-controller agents reconnect to the SB DB first. -# Log mode does not change exit code for drift (Neutron logs drift as WARNING lines). -# Repair runs if OVN file backup was skipped, or if log output contains WARNING drift. 
-# Ref: https://github.com/openstack-k8s-operators/dev-docs/blob/main/backup-restore/user-guide.md#step-12-verify-and-sync-neutron-to-ovn -- name: Verify Neutron vs OVN (neutron-ovn-db-sync-util log mode) - ansible.builtin.shell: | - set -o pipefail - oc exec -n {{ cifmw_backup_restore_namespace }} -c neutron-api deploy/neutron -- \ - neutron-ovn-db-sync-util \ - --config-file /usr/share/neutron/neutron-dist.conf \ - --config-file /etc/neutron/neutron.conf \ - --config-dir /etc/neutron/neutron.conf.d \ - --ovn-neutron_sync_mode=log \ - --debug - register: _neutron_ovn_sync_log - changed_when: false - -- name: Decide whether Neutron–OVN repair sync is required - ansible.builtin.set_fact: - _neutron_ovn_needs_repair: >- - {{ - (not (cifmw_backup_restore_ovn_db | bool)) - or ( - (cifmw_backup_restore_ovn_db | bool) - and ( - ((_neutron_ovn_sync_log.stdout | default('')) ~ (_neutron_ovn_sync_log.stderr | default(''))) - | regex_search('(?i)\bWARNING\b') is not none - ) - ) - }} - -- name: Report Neutron–OVN repair decision - ansible.builtin.debug: - msg: >- - neutron-ovn-db-sync repair: - {{ 'running' if _neutron_ovn_needs_repair | bool else 'skipped' }}. - {% if not (cifmw_backup_restore_ovn_db | bool) %} - Reason: cifmw_backup_restore_ovn_db is false (no OVN NB/SB file backup; OVN must be repopulated from Neutron). - {% elif _neutron_ovn_needs_repair | bool %} - Reason: log-mode output contained WARNING lines (Neutron-reported drift vs OVN). - {% else %} - Reason: OVN file backup/restore was used and log-mode output had no WARNING lines. 
- {% endif %} - -- name: Sync Neutron state to OVN database (repair mode) - ansible.builtin.shell: | - set -o pipefail - oc exec -n {{ cifmw_backup_restore_namespace }} -c neutron-api deploy/neutron -- \ - neutron-ovn-db-sync-util \ - --config-file /usr/share/neutron/neutron-dist.conf \ - --config-file /etc/neutron/neutron.conf \ - --config-dir /etc/neutron/neutron.conf.d \ - --ovn-neutron_sync_mode=repair \ - --debug - when: _neutron_ovn_needs_repair | bool - changed_when: true - # ======================================== # Cleanup and Summary # ======================================== diff --git a/roles/openshift_adp/tasks/main.yml b/roles/openshift_adp/tasks/main.yml index b6c1e3d77..f470165ed 100644 --- a/roles/openshift_adp/tasks/main.yml +++ b/roles/openshift_adp/tasks/main.yml @@ -176,7 +176,7 @@ kind: Pod namespace: "{{ cifmw_openshift_adp_namespace }}" label_selectors: - - app.kubernetes.io/name=node-agent + - role=node-agent wait: true wait_timeout: 300 wait_condition: