From 762eb425c04460d140f090bbb353e9aecd88fc24 Mon Sep 17 00:00:00 2001 From: Edgard Castro Date: Mon, 15 Jun 2026 16:29:21 +0200 Subject: [PATCH 1/2] Add DR restore metadata and backup tasks --- README.md | 7 + Taskfile.yaml | 64 ++++ ansible/playbooks/backup-check.yml | 9 + ansible/playbooks/backup-snapshots.yml | 9 + ansible/playbooks/dr-drill.yml | 9 + ansible/playbooks/restore-all-appdata.yml | 9 + ansible/playbooks/restore-app.yml | 9 + ansible/roles/backup/defaults/main.yml | 20 ++ ansible/roles/backup/tasks/build_job.yml | 47 +++ ansible/roles/backup/tasks/check.yml | 17 + .../backup/tasks/collect_restore_all_app.yml | 45 +++ ansible/roles/backup/tasks/drill.yml | 62 ++++ ansible/roles/backup/tasks/load_app.yml | 68 ++++ .../backup/tasks/restore_all_appdata.yml | 293 ++++++++++++++++++ ansible/roles/backup/tasks/restore_app.yml | 247 +++++++++++++++ ansible/roles/backup/tasks/run_job.yml | 85 +++++ ansible/roles/backup/tasks/snapshots.yml | 17 + ansible/tests/backup.yml | 235 ++++++++++++++ ansible/tests/layout.yml | 15 + ansible/tests/native-conventions.yml | 2 + apps/argocd/argocd/app.yaml | 4 + apps/home-automation/homeassistant/app.yaml | 4 + apps/home-automation/scrypted/app.yaml | 4 + apps/kube-system/coredns/app.yaml | 4 + apps/kube-system/k8s-gateway/app.yaml | 4 + apps/kube-system/k8tz/app.yaml | 4 + apps/kube-system/metrics-server/app.yaml | 4 + apps/kube-system/multus/app.yaml | 4 + apps/kube-system/nfs-provisioner/app.yaml | 4 + apps/media/bazarr/app.yaml | 4 + apps/media/flaresolverr/app.yaml | 4 + apps/media/plex/app.yaml | 4 + apps/media/plextraktsync/app.yaml | 4 + apps/media/prowlarr/app.yaml | 4 + apps/media/qbittorrent/app.yaml | 4 + apps/media/radarr/app.yaml | 4 + apps/media/recyclarr/app.yaml | 4 + apps/media/sonarr/app.yaml | 4 + apps/media/unpackerr/app.yaml | 4 + apps/platform-system/cert-manager/app.yaml | 4 + apps/platform-system/external-dns/app.yaml | 4 + .../platform-system/external-secrets/app.yaml | 4 + 
apps/platform-system/gateway-api/app.yaml | 4 + apps/platform-system/istio-base/app.yaml | 4 + apps/platform-system/istio/app.yaml | 4 + apps/platform-system/reloader/app.yaml | 4 + .../platform-system/tailscale-router/app.yaml | 4 + apps/platform-system/tuppr/app.yaml | 4 + apps/selfhosted/atuin/app.yaml | 4 + apps/selfhosted/bambuddy/app.yaml | 4 + apps/selfhosted/changedetection/app.yaml | 4 + apps/selfhosted/echo/app.yaml | 4 + apps/selfhosted/gatus/app.yaml | 4 + apps/selfhosted/homepage/app.yaml | 4 + apps/selfhosted/karakeep/app.yaml | 4 + apps/selfhosted/paperless/app.yaml | 4 + apps/selfhosted/renovate-operator/app.yaml | 4 + apps/selfhosted/restic/app.yaml | 4 + apps/selfhosted/restic/values.yaml | 7 +- apps/selfhosted/twitch-drops-miner/app.yaml | 4 + docs/dr/drill.md | 22 ++ docs/dr/restore.md | 81 +++++ policy/metadata/app_metadata.rego | 89 ++++++ policy/metadata/app_metadata_test.rego | 93 ++++++ scripts/validate-kubernetes.sh | 30 +- 65 files changed, 1744 insertions(+), 3 deletions(-) create mode 100644 ansible/playbooks/backup-check.yml create mode 100644 ansible/playbooks/backup-snapshots.yml create mode 100644 ansible/playbooks/dr-drill.yml create mode 100644 ansible/playbooks/restore-all-appdata.yml create mode 100644 ansible/playbooks/restore-app.yml create mode 100644 ansible/roles/backup/defaults/main.yml create mode 100644 ansible/roles/backup/tasks/build_job.yml create mode 100644 ansible/roles/backup/tasks/check.yml create mode 100644 ansible/roles/backup/tasks/collect_restore_all_app.yml create mode 100644 ansible/roles/backup/tasks/drill.yml create mode 100644 ansible/roles/backup/tasks/load_app.yml create mode 100644 ansible/roles/backup/tasks/restore_all_appdata.yml create mode 100644 ansible/roles/backup/tasks/restore_app.yml create mode 100644 ansible/roles/backup/tasks/run_job.yml create mode 100644 ansible/roles/backup/tasks/snapshots.yml create mode 100644 ansible/tests/backup.yml create mode 100644 docs/dr/drill.md create mode 
100644 docs/dr/restore.md diff --git a/README.md b/README.md index f5e6ee394..1ded75c7f 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,13 @@ task platform:destroy # Uninstall platform bootstrap components task argo:sync # Refresh every Argo CD application task argo:sync app=plex # Refresh one Argo CD application +# Backup / DR +task backup:snapshots # List local restic appdata snapshots +task backup:check # Check the local restic repository +task restore:app app=paperless snapshot=latest confirm=RESTORE +task restore:all-appdata snapshot=latest confirm=RESTORE_ALL +task dr:drill app=atuin snapshot=latest + # Vault task vault:edit-talos # Edit encrypted Talos bootstrap secrets diff --git a/Taskfile.yaml b/Taskfile.yaml index 97b62379e..71924e90e 100644 --- a/Taskfile.yaml +++ b/Taskfile.yaml @@ -228,6 +228,70 @@ tasks: cmds: - '"{{.ANSIBLE_PLAYBOOK}}" ansible/playbooks/argo-refresh.yml {{if .app}}-e "app={{.app}}"{{end}}' + # Backup and disaster recovery operations + backup:snapshots: + desc: List restic appdata backup snapshots + dir: "{{.TASKFILE_DIR}}" + deps: + - task: ansible:check + cmds: + - '"{{.ANSIBLE_PLAYBOOK}}" ansible/playbooks/backup-snapshots.yml' + + backup:check: + desc: Check the local restic backup repository + dir: "{{.TASKFILE_DIR}}" + deps: + - task: ansible:check + cmds: + - '"{{.ANSIBLE_PLAYBOOK}}" ansible/playbooks/backup-check.yml' + + restore:app: + desc: Restore one restic-backed app from a snapshot + interactive: true + dir: "{{.TASKFILE_DIR}}" + deps: + - task: ansible:check + vars: + app: '{{.app | default ""}}' + snapshot: '{{.snapshot | default "latest"}}' + confirm: '{{.confirm | default ""}}' + requires: + vars: + - app + - confirm + cmds: + - '"{{.ANSIBLE_PLAYBOOK}}" ansible/playbooks/restore-app.yml -e "backup_app={{.app}} backup_snapshot={{.snapshot}} backup_confirm={{.confirm}}"' + + restore:all-appdata: + desc: Restore all restic-backed appdata apps from a snapshot; requires confirm=RESTORE_ALL + interactive: true + dir: 
"{{.TASKFILE_DIR}}" + deps: + - task: ansible:check + vars: + snapshot: '{{.snapshot | default "latest"}}' + confirm: '{{.confirm | default ""}}' + requires: + vars: + - confirm + cmds: + - '"{{.ANSIBLE_PLAYBOOK}}" ansible/playbooks/restore-all-appdata.yml -e "backup_snapshot={{.snapshot}} backup_confirm={{.confirm}}"' + + dr:drill: + desc: Restore one app into a temporary drill path and verify restored content + interactive: true + dir: "{{.TASKFILE_DIR}}" + deps: + - task: ansible:check + vars: + app: '{{.app | default ""}}' + snapshot: '{{.snapshot | default "latest"}}' + requires: + vars: + - app + cmds: + - '"{{.ANSIBLE_PLAYBOOK}}" ansible/playbooks/dr-drill.yml -e "backup_app={{.app}} backup_snapshot={{.snapshot}}"' + # Terraform operations tf:plan: desc: Terraform plan diff --git a/ansible/playbooks/backup-check.yml b/ansible/playbooks/backup-check.yml new file mode 100644 index 000000000..5b93d5dc7 --- /dev/null +++ b/ansible/playbooks/backup-check.yml @@ -0,0 +1,9 @@ +--- +- name: Check restic backup repository + hosts: localhost + gather_facts: false + tasks: + - name: Check restic repository + ansible.builtin.include_role: + name: backup + tasks_from: check diff --git a/ansible/playbooks/backup-snapshots.yml b/ansible/playbooks/backup-snapshots.yml new file mode 100644 index 000000000..abb05f3af --- /dev/null +++ b/ansible/playbooks/backup-snapshots.yml @@ -0,0 +1,9 @@ +--- +- name: List restic backup snapshots + hosts: localhost + gather_facts: false + tasks: + - name: List restic snapshots + ansible.builtin.include_role: + name: backup + tasks_from: snapshots diff --git a/ansible/playbooks/dr-drill.yml b/ansible/playbooks/dr-drill.yml new file mode 100644 index 000000000..5c7e96efc --- /dev/null +++ b/ansible/playbooks/dr-drill.yml @@ -0,0 +1,9 @@ +--- +- name: Run app restore drill + hosts: localhost + gather_facts: false + tasks: + - name: Run restore drill + ansible.builtin.include_role: + name: backup + tasks_from: drill diff --git 
a/ansible/playbooks/restore-all-appdata.yml b/ansible/playbooks/restore-all-appdata.yml new file mode 100644 index 000000000..7dfd4f9c1 --- /dev/null +++ b/ansible/playbooks/restore-all-appdata.yml @@ -0,0 +1,9 @@ +--- +- name: Restore all restic appdata from restic + hosts: localhost + gather_facts: false + tasks: + - name: Restore all appdata + ansible.builtin.include_role: + name: backup + tasks_from: restore_all_appdata diff --git a/ansible/playbooks/restore-app.yml b/ansible/playbooks/restore-app.yml new file mode 100644 index 000000000..331d09e1a --- /dev/null +++ b/ansible/playbooks/restore-app.yml @@ -0,0 +1,9 @@ +--- +- name: Restore app data from restic + hosts: localhost + gather_facts: false + tasks: + - name: Restore app data + ansible.builtin.include_role: + name: backup + tasks_from: restore_app diff --git a/ansible/roles/backup/defaults/main.yml b/ansible/roles/backup/defaults/main.yml new file mode 100644 index 000000000..724ba103e --- /dev/null +++ b/ansible/roles/backup/defaults/main.yml @@ -0,0 +1,20 @@ +--- +backup_repo_root: "{{ playbook_dir }}/../.." 
+backup_namespace: selfhosted +backup_argocd_namespace: argocd +backup_appdata_claim: restic-appdata +backup_restic_image: restic/restic:0.19.0 +backup_restic_password_secret: restic-credentials +backup_restic_password_key: RESTIC_PASSWORD +backup_restic_repository: rest:http://restic:$(RESTIC_PASSWORD)@restic.selfhosted.svc.cluster.local:8000/ +backup_restic_host: homelab +backup_restic_tag: appdata +backup_retry_lock: 30m +backup_job_ttl_seconds: 3600 +backup_job_wait_sleep: 10 +backup_job_wait_timeout: 3600 +backup_restore_root: /restore-root/data/appdata +backup_restore_staging: /restore-staging +backup_snapshot: latest +backup_confirm: "" +backup_test_mode: false diff --git a/ansible/roles/backup/tasks/build_job.yml b/ansible/roles/backup/tasks/build_job.yml new file mode 100644 index 000000000..a3b1aa51c --- /dev/null +++ b/ansible/roles/backup/tasks/build_job.yml @@ -0,0 +1,47 @@ +--- +- name: Build restic job definition + ansible.builtin.set_fact: + backup_job_definition: + apiVersion: batch/v1 + kind: Job + metadata: + name: "{{ backup_job_name }}" + namespace: "{{ backup_namespace }}" + spec: + backoffLimit: 0 + ttlSecondsAfterFinished: "{{ backup_job_ttl_seconds }}" + template: + metadata: + labels: + app.kubernetes.io/name: restic + app.kubernetes.io/component: "{{ backup_job_component }}" + spec: + restartPolicy: Never + containers: + - name: restic + image: "{{ backup_restic_image }}" + imagePullPolicy: IfNotPresent + command: + - /bin/sh + - -c + - "{{ backup_job_script }}" + env: >- + {{ + [ + { + 'name': 'RESTIC_PASSWORD', + 'valueFrom': { + 'secretKeyRef': { + 'name': backup_restic_password_secret, + 'key': backup_restic_password_key + } + } + }, + { + 'name': 'RESTIC_REPOSITORY', + 'value': backup_restic_repository + } + ] + (backup_job_env_extra | default([])) + }} + volumeMounts: "{{ backup_job_volume_mounts | default([]) }}" + volumes: "{{ backup_job_volumes | default([]) }}" diff --git a/ansible/roles/backup/tasks/check.yml 
b/ansible/roles/backup/tasks/check.yml new file mode 100644 index 000000000..29706fea0 --- /dev/null +++ b/ansible/roles/backup/tasks/check.yml @@ -0,0 +1,17 @@ +--- +- name: Build check job + ansible.builtin.include_tasks: build_job.yml + vars: + backup_job_name: "restic-check-{{ lookup('ansible.builtin.pipe', 'date -u +%Y%m%d%H%M%S') }}" + backup_job_component: check + backup_job_script: | + set -eu + restic check --retry-lock {{ backup_retry_lock }} + +- name: Record check job + ansible.builtin.set_fact: + backup_check_job: "{{ backup_job_definition }}" + +- name: Run check job + when: not (backup_test_mode | bool) + ansible.builtin.include_tasks: run_job.yml diff --git a/ansible/roles/backup/tasks/collect_restore_all_app.yml b/ansible/roles/backup/tasks/collect_restore_all_app.yml new file mode 100644 index 000000000..e5923a957 --- /dev/null +++ b/ansible/roles/backup/tasks/collect_restore_all_app.yml @@ -0,0 +1,45 @@ +--- +- name: Load candidate app metadata + ansible.builtin.include_vars: + file: "{{ backup_restore_all_candidate }}" + name: backup_restore_all_candidate_metadata + +- name: Load candidate app values + ansible.builtin.include_vars: + file: "{{ backup_restore_all_candidate | dirname }}/values.yaml" + name: backup_restore_all_candidate_values + +- name: Reset candidate local PVC inventory + ansible.builtin.set_fact: + backup_restore_all_candidate_local_pvc_names: [] + +- name: Collect candidate local PVC persistence entries + ansible.builtin.set_fact: + backup_restore_all_candidate_local_pvc_names: "{{ backup_restore_all_candidate_local_pvc_names + [item.key] }}" + loop: "{{ backup_restore_all_candidate_values.persistence | default({}) | dict2items }}" + loop_control: + label: "{{ item.key }}" + when: + - (item.value.type | default('persistentVolumeClaim')) == 'persistentVolumeClaim' + - (item.value.existingClaim | default('')) == '' + +- name: Validate restic appdata candidate + when: backup_restore_all_candidate_metadata.dr.restore.mode | 
default('') == "restic-appdata" + ansible.builtin.assert: + that: + - backup_restore_all_candidate_local_pvc_names | length == 1 + - backup_restore_all_candidate_values.nameOverride is not defined + - backup_restore_all_candidate_values.fullnameOverride is not defined + fail_msg: "App {{ backup_restore_all_candidate | dirname | basename }} must define exactly one local PVC and avoid nameOverride/fullnameOverride to use restore:all-appdata." + +- name: Add restic appdata candidate + when: backup_restore_all_candidate_metadata.dr.restore.mode | default('') == "restic-appdata" + ansible.builtin.set_fact: + backup_restore_all_apps: "{{ backup_restore_all_apps + [backup_restore_all_candidate_app] }}" + vars: + backup_restore_all_candidate_app: + app_name: "{{ backup_restore_all_candidate | dirname | basename }}" + namespace: "{{ backup_restore_all_candidate | dirname | dirname | basename }}" + application_name: "{{ backup_restore_all_candidate | dirname | basename }}" + restore_path: "{{ backup_restore_all_candidate | dirname | dirname | basename }}/{{ backup_restore_all_candidate | dirname | basename }}" + local_pvc_name: "{{ backup_restore_all_candidate_local_pvc_names[0] }}" diff --git a/ansible/roles/backup/tasks/drill.yml b/ansible/roles/backup/tasks/drill.yml new file mode 100644 index 000000000..881a8c009 --- /dev/null +++ b/ansible/roles/backup/tasks/drill.yml @@ -0,0 +1,62 @@ +--- +- name: Load app drill context + ansible.builtin.include_tasks: load_app.yml + +- name: Assert app uses restic appdata restore + ansible.builtin.assert: + that: + - backup_app_restore_mode == "restic-appdata" + - backup_app_local_pvc_names | length == 1 + - backup_app_restore_paths | length > 0 + - not (backup_app_has_name_override | bool) + - not (backup_app_has_fullname_override | bool) + fail_msg: "App {{ backup_app }} must use dr.restore.mode restic-appdata, define exactly one local PVC, and avoid nameOverride/fullnameOverride to use dr:drill." 
+ +- name: Set drill timestamp + ansible.builtin.set_fact: + backup_restore_timestamp: "{{ lookup('ansible.builtin.pipe', 'date -u +%Y%m%dT%H%M%SZ') }}" + +- name: Build drill job + ansible.builtin.include_tasks: build_job.yml + vars: + backup_job_name: "restic-drill-{{ backup_app_name }}-{{ lookup('ansible.builtin.pipe', 'date -u +%Y%m%d%H%M%S') }}" + backup_job_component: drill + backup_job_volumes: + - name: appdata + persistentVolumeClaim: + claimName: "{{ backup_appdata_claim }}" + backup_job_volume_mounts: + - name: appdata + mountPath: "{{ backup_restore_root }}" + backup_job_script: | + set -eu + DRILL_PATH="{{ backup_restore_root }}/.drill/{{ backup_app_name }}-{{ backup_restore_timestamp }}" + trap 'rm -rf "$DRILL_PATH"' EXIT + mkdir -p "$DRILL_PATH" + for RESTORE_INCLUDE in {% for restore_path in backup_app_restore_paths %}"/data/appdata/{{ restore_path }}" {% endfor %}; do + RESTORE_PATH="${RESTORE_INCLUDE#/data/appdata/}" + case "$RESTORE_PATH" in + /*|*..*) echo "Refusing unsafe drill path: $RESTORE_PATH"; exit 1 ;; + esac + restic restore "$RESTIC_SNAPSHOT" \ + --host {{ backup_restic_host }} \ + --tag {{ backup_restic_tag }} \ + --include "$RESTORE_INCLUDE" \ + --target "$DRILL_PATH" + RESTORED="$DRILL_PATH/data/appdata/${RESTORE_PATH}" + test -d "$RESTORED" + find "$RESTORED" -mindepth 1 -print -quit | grep -q . 
+ done + backup_job_env_extra: + - name: RESTIC_SNAPSHOT + value: "{{ backup_snapshot }}" + +- name: Record drill job + ansible.builtin.set_fact: + backup_drill_job: "{{ backup_job_definition }}" + +- name: Run drill job + when: not (backup_test_mode | bool) + ansible.builtin.include_tasks: run_job.yml + vars: + backup_job_definition: "{{ backup_drill_job }}" diff --git a/ansible/roles/backup/tasks/load_app.yml b/ansible/roles/backup/tasks/load_app.yml new file mode 100644 index 000000000..432bd188c --- /dev/null +++ b/ansible/roles/backup/tasks/load_app.yml @@ -0,0 +1,68 @@ +--- +- name: Require backup app name + ansible.builtin.assert: + that: + - backup_app is defined + - backup_app | length > 0 + fail_msg: backup_app is required. + +- name: Find app metadata files + ansible.builtin.find: + paths: "{{ backup_repo_root }}/apps" + patterns: + - app.yaml + recurse: true + file_type: file + register: backup_app_metadata_files + +- name: Locate app metadata file + ansible.builtin.set_fact: + backup_app_files: >- + {{ + backup_app_metadata_files.files + | map(attribute='path') + | select('search', '/' + backup_app + '/app.yaml$') + | list + }} + +- name: Assert app metadata file is unique + ansible.builtin.assert: + that: + - backup_app_files | length == 1 + fail_msg: "DR app metadata not found or not unique: {{ backup_app }}" + +- name: Load app metadata + ansible.builtin.include_vars: + file: "{{ backup_app_files[0] }}" + name: backup_app_metadata + +- name: Load app values + ansible.builtin.include_vars: + file: "{{ backup_app_files[0] | dirname }}/values.yaml" + name: backup_app_values + +- name: Reset app local PVC inventory + ansible.builtin.set_fact: + backup_app_local_pvc_names: [] + +- name: Collect app local PVC persistence entries + ansible.builtin.set_fact: + backup_app_local_pvc_names: "{{ backup_app_local_pvc_names + [item.key] }}" + loop: "{{ backup_app_values.persistence | default({}) | dict2items }}" + loop_control: + label: "{{ item.key }}" + when: + 
- (item.value.type | default('persistentVolumeClaim')) == 'persistentVolumeClaim' + - (item.value.existingClaim | default('')) == '' + +- name: Record app DR context + ansible.builtin.set_fact: + backup_app_name: "{{ backup_app_files[0] | dirname | basename }}" + backup_app_namespace: "{{ backup_app_files[0] | dirname | dirname | basename }}" + backup_app_tier: "{{ backup_app_metadata.dr.tier | default('') }}" + backup_app_restore_mode: "{{ backup_app_metadata.dr.restore.mode | default('') }}" + backup_app_restore_paths: + - "{{ backup_app_files[0] | dirname | dirname | basename }}/{{ backup_app_files[0] | dirname | basename }}" + backup_app_has_name_override: "{{ backup_app_values.nameOverride is defined }}" + backup_app_has_fullname_override: "{{ backup_app_values.fullnameOverride is defined }}" + backup_application_name: "{{ backup_app_files[0] | dirname | basename }}" diff --git a/ansible/roles/backup/tasks/restore_all_appdata.yml b/ansible/roles/backup/tasks/restore_all_appdata.yml new file mode 100644 index 000000000..909e0de5e --- /dev/null +++ b/ansible/roles/backup/tasks/restore_all_appdata.yml @@ -0,0 +1,293 @@ +--- +- name: Require destructive bulk restore confirmation + ansible.builtin.assert: + that: + - backup_confirm == "RESTORE_ALL" + fail_msg: "Set confirm=RESTORE_ALL to run restore:all-appdata." 
+ +- name: Find app metadata files for bulk restore + ansible.builtin.find: + paths: "{{ backup_repo_root }}/apps" + patterns: + - app.yaml + recurse: true + file_type: file + register: backup_restore_all_metadata_files + +- name: Reset bulk restore app inventory + ansible.builtin.set_fact: + backup_restore_all_apps: [] + +- name: Collect restic appdata apps + ansible.builtin.include_tasks: collect_restore_all_app.yml + loop: "{{ backup_restore_all_metadata_files.files | map(attribute='path') | sort }}" + loop_control: + loop_var: backup_restore_all_candidate + label: "{{ backup_restore_all_candidate | dirname | basename }}" + +- name: Record bulk restore paths + ansible.builtin.set_fact: + backup_restore_all_paths: "{{ backup_restore_all_apps | map(attribute='restore_path') | list }}" + +- name: Assert bulk restore has appdata targets + ansible.builtin.assert: + that: + - backup_restore_all_apps | length > 0 + - backup_restore_all_paths | length == backup_restore_all_apps | length + fail_msg: "No restic-appdata apps were found for restore:all-appdata." 
+ +- name: Set bulk restore timestamp + ansible.builtin.set_fact: + backup_restore_timestamp: "{{ lookup('ansible.builtin.pipe', 'date -u +%Y%m%dT%H%M%SZ') }}" + +- name: Reset bulk Argo CD patch lists + ansible.builtin.set_fact: + backup_restore_all_argocd_pause_patches: [] + backup_restore_all_argocd_resume_patches: [] + +- name: Build Argo CD pause and resume patches for bulk restore + ansible.builtin.set_fact: + backup_restore_all_argocd_pause_patches: "{{ backup_restore_all_argocd_pause_patches + [backup_restore_all_argocd_pause_patch] }}" + backup_restore_all_argocd_resume_patches: "{{ backup_restore_all_argocd_resume_patches + [backup_restore_all_argocd_resume_patch] }}" + loop: "{{ backup_restore_all_apps }}" + loop_control: + label: "{{ item.application_name }}" + vars: + backup_restore_all_argocd_pause_patch: + apiVersion: argoproj.io/v1alpha1 + kind: Application + metadata: + name: "{{ item.application_name }}" + namespace: "{{ backup_argocd_namespace }}" + annotations: + argocd.argoproj.io/skip-reconcile: "true" + backup_restore_all_argocd_resume_patch: + apiVersion: argoproj.io/v1alpha1 + kind: Application + metadata: + name: "{{ item.application_name }}" + namespace: "{{ backup_argocd_namespace }}" + annotations: + argocd.argoproj.io/skip-reconcile: + +- name: Build bulk restore job + ansible.builtin.include_tasks: build_job.yml + vars: + backup_job_name: "restic-restore-all-appdata-{{ lookup('ansible.builtin.pipe', 'date -u +%Y%m%d%H%M%S') }}" + backup_job_component: restore-all-appdata + backup_job_volumes: + - name: appdata + persistentVolumeClaim: + claimName: "{{ backup_appdata_claim }}" + backup_job_volume_mounts: + - name: appdata + mountPath: "{{ backup_restore_root }}" + backup_job_script: | + set -eu + RESTORE_TIMESTAMP="{{ backup_restore_timestamp }}" + ARCHIVE_INDEX="{{ backup_restore_staging }}/archive-index" + + restore_rollback() { + restore_status=$? 
+ if [ "$restore_status" -ne 0 ] && [ -f "$ARCHIVE_INDEX" ]; then + while IFS='|' read -r restore_target restore_archive; do + if [ -n "$restore_target" ] && [ ! -e "$restore_target" ] && [ -e "$restore_archive" ]; then + mv "$restore_archive" "$restore_target" + fi + done < "$ARCHIVE_INDEX" + fi + exit "$restore_status" + } + trap restore_rollback EXIT + + rm -rf {{ backup_restore_staging }} + mkdir -p {{ backup_restore_staging }} + : > "$ARCHIVE_INDEX" + + for RESTORE_INCLUDE in {% for restore_path in backup_restore_all_paths %}"/data/appdata/{{ restore_path }}" {% endfor %}; do + RESTORE_PATH="${RESTORE_INCLUDE#/data/appdata/}" + case "$RESTORE_PATH" in + /*|*..*) echo "Refusing unsafe restore path: $RESTORE_PATH"; exit 1 ;; + esac + + restic restore "$RESTIC_SNAPSHOT" \ + --host {{ backup_restic_host }} \ + --tag {{ backup_restic_tag }} \ + --include "$RESTORE_INCLUDE" \ + --target {{ backup_restore_staging }} + + STAGED="{{ backup_restore_staging }}/data/appdata/${RESTORE_PATH}" + test -d "$STAGED" + find "$STAGED" -mindepth 1 -print -quit | grep -q . 
+ done + + for RESTORE_PATH in {% for restore_path in backup_restore_all_paths %}"{{ restore_path }}" {% endfor %}; do + STAGED="{{ backup_restore_staging }}/data/appdata/${RESTORE_PATH}" + TARGET="{{ backup_restore_root }}/${RESTORE_PATH}" + SAFE_PATH="$(printf '%s' "$RESTORE_PATH" | tr '/' '-')" + ARCHIVE="{{ backup_restore_root }}/.pre-restore/all-${RESTORE_TIMESTAMP}-${SAFE_PATH}" + + mkdir -p "$(dirname "$TARGET")" + if [ -e "$TARGET" ]; then + mkdir -p "$(dirname "$ARCHIVE")" + mv "$TARGET" "$ARCHIVE" + printf '%s|%s\n' "$TARGET" "$ARCHIVE" >> "$ARCHIVE_INDEX" + fi + mv "$STAGED" "$TARGET" + done + rm -rf {{ backup_restore_staging }} + backup_job_env_extra: + - name: RESTIC_SNAPSHOT + value: "{{ backup_snapshot }}" + +- name: Record bulk restore job + ansible.builtin.set_fact: + backup_restore_all_job: "{{ backup_job_definition }}" + +- name: Run bulk appdata restore + when: not (backup_test_mode | bool) + block: + - name: Pause Argo CD reconciliation for appdata apps + kubernetes.core.k8s: + kubeconfig: "{{ kubeconfig_path }}" + context: "{{ kube_context }}" + state: patched + definition: "{{ item }}" + loop: "{{ backup_restore_all_argocd_pause_patches }}" + loop_control: + label: "{{ item.metadata.name }}" + + - name: Read app Deployments + kubernetes.core.k8s_info: + kubeconfig: "{{ kubeconfig_path }}" + context: "{{ kube_context }}" + api_version: apps/v1 + kind: Deployment + namespace: "{{ item.namespace }}" + label_selectors: + - "app.kubernetes.io/instance={{ item.app_name }}" + loop: "{{ backup_restore_all_apps }}" + loop_control: + label: "{{ item.namespace }}/{{ item.app_name }}" + register: backup_restore_all_deployment_results + + - name: Read app CronJobs + kubernetes.core.k8s_info: + kubeconfig: "{{ kubeconfig_path }}" + context: "{{ kube_context }}" + api_version: batch/v1 + kind: CronJob + namespace: "{{ item.namespace }}" + label_selectors: + - "app.kubernetes.io/instance={{ item.app_name }}" + loop: "{{ backup_restore_all_apps }}" + 
loop_control: + label: "{{ item.namespace }}/{{ item.app_name }}" + register: backup_restore_all_cronjob_results + + - name: Record bulk restore workload state + ansible.builtin.set_fact: + backup_restore_all_original_deployments: "{{ backup_restore_all_deployment_results.results | map(attribute='resources') | flatten }}" + backup_restore_all_original_cronjobs: "{{ backup_restore_all_cronjob_results.results | map(attribute='resources') | flatten }}" + + - name: Scale appdata Deployments down + kubernetes.core.k8s: + kubeconfig: "{{ kubeconfig_path }}" + context: "{{ kube_context }}" + state: patched + definition: + apiVersion: apps/v1 + kind: Deployment + metadata: + name: "{{ item.metadata.name }}" + namespace: "{{ item.metadata.namespace }}" + spec: + replicas: 0 + loop: "{{ backup_restore_all_original_deployments }}" + loop_control: + label: "{{ item.metadata.namespace }}/{{ item.metadata.name }}" + + - name: Suspend appdata CronJobs + kubernetes.core.k8s: + kubeconfig: "{{ kubeconfig_path }}" + context: "{{ kube_context }}" + state: patched + definition: + apiVersion: batch/v1 + kind: CronJob + metadata: + name: "{{ item.metadata.name }}" + namespace: "{{ item.metadata.namespace }}" + spec: + suspend: true + loop: "{{ backup_restore_all_original_cronjobs }}" + loop_control: + label: "{{ item.metadata.namespace }}/{{ item.metadata.name }}" + + - name: Run bulk restore job + ansible.builtin.include_tasks: run_job.yml + vars: + backup_job_definition: "{{ backup_restore_all_job }}" + always: + - name: Restore appdata Deployment replicas + when: backup_restore_all_original_deployments is defined + kubernetes.core.k8s: + kubeconfig: "{{ kubeconfig_path }}" + context: "{{ kube_context }}" + state: patched + definition: + apiVersion: apps/v1 + kind: Deployment + metadata: + name: "{{ item.metadata.name }}" + namespace: "{{ item.metadata.namespace }}" + spec: + replicas: "{{ item.spec.replicas | default(1) }}" + loop: "{{ backup_restore_all_original_deployments | 
default([]) }}" + loop_control: + label: "{{ item.metadata.namespace }}/{{ item.metadata.name }}" + + - name: Restore appdata CronJob suspend state + when: backup_restore_all_original_cronjobs is defined + kubernetes.core.k8s: + kubeconfig: "{{ kubeconfig_path }}" + context: "{{ kube_context }}" + state: patched + definition: + apiVersion: batch/v1 + kind: CronJob + metadata: + name: "{{ item.metadata.name }}" + namespace: "{{ item.metadata.namespace }}" + spec: + suspend: "{{ item.spec.suspend | default(false) }}" + loop: "{{ backup_restore_all_original_cronjobs | default([]) }}" + loop_control: + label: "{{ item.metadata.namespace }}/{{ item.metadata.name }}" + + - name: Resume Argo CD reconciliation for appdata apps + kubernetes.core.k8s: + kubeconfig: "{{ kubeconfig_path }}" + context: "{{ kube_context }}" + state: patched + definition: "{{ item }}" + loop: "{{ backup_restore_all_argocd_resume_patches | default([]) }}" + loop_control: + label: "{{ item.metadata.name }}" + + - name: Refresh restored Argo CD applications + kubernetes.core.k8s: + kubeconfig: "{{ kubeconfig_path }}" + context: "{{ kube_context }}" + state: patched + definition: + apiVersion: argoproj.io/v1alpha1 + kind: Application + metadata: + name: "{{ item.application_name }}" + namespace: "{{ backup_argocd_namespace }}" + annotations: + argocd.argoproj.io/refresh: hard + loop: "{{ backup_restore_all_apps | default([]) }}" + loop_control: + label: "{{ item.application_name }}" diff --git a/ansible/roles/backup/tasks/restore_app.yml b/ansible/roles/backup/tasks/restore_app.yml new file mode 100644 index 000000000..1da90ddf1 --- /dev/null +++ b/ansible/roles/backup/tasks/restore_app.yml @@ -0,0 +1,247 @@ +--- +- name: Require destructive restore confirmation + ansible.builtin.assert: + that: + - backup_confirm == "RESTORE" + fail_msg: "Set confirm=RESTORE to run restore:app." 
+
+- name: Load app restore context  # presumably defines the backup_app_* facts asserted next; verify in load_app.yml
+  ansible.builtin.include_tasks: load_app.yml
+
+- name: Assert app uses restic appdata restore  # gate: single-PVC restic-appdata apps without name overrides only
+  ansible.builtin.assert:
+    that:
+      - backup_app_restore_mode == "restic-appdata"
+      - backup_app_local_pvc_names | length == 1
+      - backup_app_restore_paths | length > 0
+      - not (backup_app_has_name_override | bool)
+      - not (backup_app_has_fullname_override | bool)
+    fail_msg: "App {{ backup_app }} must use dr.restore.mode restic-appdata, define exactly one local PVC, and avoid nameOverride/fullnameOverride to use restore:app."
+
+- name: Set restore timestamp  # frozen once via set_fact; the job script reuses it in the .pre-restore archive name
+  ansible.builtin.set_fact:
+    backup_restore_timestamp: "{{ lookup('ansible.builtin.pipe', 'date -u +%Y%m%dT%H%M%SZ') }}"
+
+- name: Build Argo CD pause and resume patches  # the two patches differ only in the skip-reconcile annotation value
+  ansible.builtin.set_fact:
+    backup_argocd_pause_patch:
+      apiVersion: argoproj.io/v1alpha1
+      kind: Application
+      metadata:
+        name: "{{ backup_application_name }}"
+        namespace: "{{ backup_argocd_namespace }}"
+        annotations:
+          argocd.argoproj.io/skip-reconcile: "true"
+    backup_argocd_resume_patch:
+      apiVersion: argoproj.io/v1alpha1
+      kind: Application
+      metadata:
+        name: "{{ backup_application_name }}"
+        namespace: "{{ backup_argocd_namespace }}"
+        annotations:
+          argocd.argoproj.io/skip-reconcile: null  # explicit null (was a bare key); intended to clear the annotation on patch
+
+- name: Build restore job  # build_job.yml is expected to publish backup_job_definition, recorded by the next task
+  ansible.builtin.include_tasks: build_job.yml
+  vars:
+    backup_job_name: "restic-restore-{{ backup_app_name }}-{{ lookup('ansible.builtin.pipe', 'date -u +%Y%m%d%H%M%S') }}"
+    backup_job_component: restore
+    backup_job_volumes:
+      - name: appdata
+        persistentVolumeClaim:
+          claimName: "{{ backup_appdata_claim }}"
+    backup_job_volume_mounts:
+      - name: appdata
+        mountPath: "{{ backup_restore_root }}"
+    backup_job_script: |
+      set -eu
+      RESTORE_TIMESTAMP="{{ backup_restore_timestamp }}"
+      restore_in_progress=0  # 1 only while the original is archived and the restored copy is not yet in place
+      restore_target=
+      restore_archive=
+
+      restore_rollback() {  # EXIT trap: put the archived original back when the swap did not finish
+        restore_status=$?
+        if [ "$restore_status" -ne 0 ] && [ "$restore_in_progress" = "1" ] && [ -n "$restore_target" ] && [ -e "$restore_archive" ]; then
+          rm -rf "$restore_target" && mv "$restore_archive" "$restore_target"  # a failed move may leave a partial target; drop it first
+        fi
+        exit "$restore_status"
+      }
+      trap restore_rollback EXIT
+
+      {% for restore_path in backup_app_restore_paths %}
+      # restore target: {{ backup_restore_root }}/{{ restore_path }}
+      {% endfor %}
+      for RESTORE_INCLUDE in {% for restore_path in backup_app_restore_paths %}"/data/appdata/{{ restore_path }}" {% endfor %}; do
+        RESTORE_PATH="${RESTORE_INCLUDE#/data/appdata/}"
+        case "$RESTORE_PATH" in
+          ''|/*|*..*) echo "Refusing unsafe restore path: $RESTORE_PATH"; exit 1 ;;
+        esac
+
+        rm -rf {{ backup_restore_staging }}
+        mkdir -p {{ backup_restore_staging }}
+        restic restore "$RESTIC_SNAPSHOT" \
+          --host {{ backup_restic_host }} \
+          --tag {{ backup_restic_tag }} \
+          --include "$RESTORE_INCLUDE" \
+          --target {{ backup_restore_staging }}
+
+        STAGED="{{ backup_restore_staging }}/data/appdata/${RESTORE_PATH}"
+        TARGET="{{ backup_restore_root }}/${RESTORE_PATH}"
+        SAFE_PATH="$(printf '%s' "$RESTORE_PATH" | tr '/' '-')"
+        ARCHIVE="{{ backup_restore_root }}/.pre-restore/{{ backup_app_name }}-${RESTORE_TIMESTAMP}-${SAFE_PATH}"
+
+        test -d "$STAGED"
+        find "$STAGED" -mindepth 1 -print -quit | grep -q .
+        restore_target="$TARGET"
+        restore_archive="$ARCHIVE"
+        restore_in_progress=0
+        if [ -e "$TARGET" ]; then
+          mkdir -p "$(dirname "$ARCHIVE")"
+          mv "$TARGET" "$ARCHIVE"
+          restore_in_progress=1
+        fi
+        mkdir -p "$(dirname "$TARGET")"
+        mv "$STAGED" "$TARGET"
+        restore_in_progress=0
+      done
+    backup_job_env_extra:
+      - name: RESTIC_SNAPSHOT
+        value: "{{ backup_snapshot }}"
+
+- name: Record restore job  # keep a copy under a restore-specific name; the contract tests read backup_restore_job
+  ansible.builtin.set_fact:
+    backup_restore_job: "{{ backup_job_definition }}"
+
+- name: Run app restore  # everything that touches the cluster; skipped entirely in test mode
+  when: not (backup_test_mode | bool)
+  block:
+    - name: Pause Argo CD reconciliation for app
+      kubernetes.core.k8s:
+        kubeconfig: "{{ kubeconfig_path }}"
+        context: "{{ kube_context }}"
+        state: patched
+        definition: "{{ backup_argocd_pause_patch }}"
+
+    - name: Read app Deployments  # selected by the app.kubernetes.io/instance label
+      kubernetes.core.k8s_info:
+        kubeconfig: "{{ kubeconfig_path }}"
+        context: "{{ kube_context }}"
+        api_version: apps/v1
+        kind: Deployment
+        namespace: "{{ backup_app_namespace }}"
+        label_selectors:
+          - "app.kubernetes.io/instance={{ backup_app_name }}"
+      register: backup_restore_deployments
+
+    - name: Read app CronJobs
+      kubernetes.core.k8s_info:
+        kubeconfig: "{{ kubeconfig_path }}"
+        context: "{{ kube_context }}"
+        api_version: batch/v1
+        kind: CronJob
+        namespace: "{{ backup_app_namespace }}"
+        label_selectors:
+          - "app.kubernetes.io/instance={{ backup_app_name }}"
+      register: backup_restore_cronjobs
+
+    - name: Record app workload state  # snapshot of replicas/suspend taken before anything is changed
+      ansible.builtin.set_fact:
+        backup_restore_original_deployments: "{{ backup_restore_deployments.resources }}"
+        backup_restore_original_cronjobs: "{{ backup_restore_cronjobs.resources }}"
+
+    - name: Scale app Deployments down  # NOTE(review): no wait is requested here; confirm pods are gone before the job moves data
+      kubernetes.core.k8s:
+        kubeconfig: "{{ kubeconfig_path }}"
+        context: "{{ kube_context }}"
+        state: patched
+        definition:
+          apiVersion: apps/v1
+          kind: Deployment
+          metadata:
+            name: "{{ item.metadata.name }}"
+            namespace: "{{ item.metadata.namespace }}"
+          spec:
+            replicas: 0
+      loop: "{{ backup_restore_original_deployments }}"
+      loop_control:
+        label: "{{ item.metadata.namespace }}/{{ item.metadata.name }}"
+
+    - name: Suspend app CronJobs
+      kubernetes.core.k8s:
+        kubeconfig: "{{ kubeconfig_path }}"
+        context: "{{ kube_context }}"
+        state: patched
+        definition:
+          apiVersion: batch/v1
+          kind: CronJob
+          metadata:
+            name: "{{ item.metadata.name }}"
+            namespace: "{{ item.metadata.namespace }}"
+          spec:
+            suspend: true
+      loop: "{{ backup_restore_original_cronjobs }}"
+      loop_control:
+        label: "{{ item.metadata.namespace }}/{{ item.metadata.name }}"
+
+    - name: Run restore job
+      ansible.builtin.include_tasks: run_job.yml
+      vars:
+        backup_job_definition: "{{ backup_restore_job }}"
+  always:  # runs whether or not the block failed, so the app is not left paused or scaled down
+    - name: Restore app Deployment replicas
+      when: backup_restore_original_deployments is defined  # undefined if the block failed before state was recorded
+      kubernetes.core.k8s:
+        kubeconfig: "{{ kubeconfig_path }}"
+        context: "{{ kube_context }}"
+        state: patched
+        definition:
+          apiVersion: apps/v1
+          kind: Deployment
+          metadata:
+            name: "{{ item.metadata.name }}"
+            namespace: "{{ item.metadata.namespace }}"
+          spec:
+            replicas: "{{ item.spec.replicas | default(1) }}"
+      loop: "{{ backup_restore_original_deployments | default([]) }}"
+      loop_control:
+        label: "{{ item.metadata.namespace }}/{{ item.metadata.name }}"
+
+    - name: Restore app CronJob suspend state
+      when: backup_restore_original_cronjobs is defined
+      kubernetes.core.k8s:
+        kubeconfig: "{{ kubeconfig_path }}"
+        context: "{{ kube_context }}"
+        state: patched
+        definition:
+          apiVersion: batch/v1
+          kind: CronJob
+          metadata:
+            name: "{{ item.metadata.name }}"
+            namespace: "{{ item.metadata.namespace }}"
+          spec:
+            suspend: "{{ item.spec.suspend | default(false) }}"
+      loop: "{{ backup_restore_original_cronjobs | default([]) }}"
+      loop_control:
+        label: "{{ item.metadata.namespace }}/{{ item.metadata.name }}"
+
+    - name: Resume Argo CD reconciliation for app
+      kubernetes.core.k8s:
+        kubeconfig: "{{ kubeconfig_path }}"
+        context: "{{ kube_context }}"
+        state: patched
+        definition: "{{ backup_argocd_resume_patch }}"
+
+    - name: Refresh restored Argo CD application  # sets the refresh annotation to "hard" on the Application
+      kubernetes.core.k8s:
+        kubeconfig: "{{ kubeconfig_path }}"
+        context: "{{ kube_context }}"
+        state: patched
+        definition:
+          apiVersion: argoproj.io/v1alpha1
+          kind: Application
+          metadata:
+            name: "{{ backup_application_name }}"
+            namespace: "{{ backup_argocd_namespace }}"
+            annotations:
+              argocd.argoproj.io/refresh: hard
diff --git a/ansible/roles/backup/tasks/run_job.yml b/ansible/roles/backup/tasks/run_job.yml
new file mode 100644
index 000000000..c5f3fc9d1
--- /dev/null
+++ b/ansible/roles/backup/tasks/run_job.yml
@@ -0,0 +1,85 @@
+---
+- name: Run restic job and collect logs  # input: backup_job_definition (a batch/v1 Job manifest)
+  block:
+    - name: Apply restic job
+      kubernetes.core.k8s:
+        kubeconfig: "{{ kubeconfig_path }}"
+        context: "{{ kube_context }}"
+        state: present
+        definition: "{{ backup_job_definition }}"
+
+    - name: Wait for restic job to finish  # polls until the Job reports a Complete or Failed condition with status True
+      kubernetes.core.k8s_info:
+        kubeconfig: "{{ kubeconfig_path }}"
+        context: "{{ kube_context }}"
+        api_version: batch/v1
+        kind: Job
+        namespace: "{{ backup_job_definition.metadata.namespace }}"
+        name: "{{ backup_job_definition.metadata.name }}"
+      register: backup_job_status
+      changed_when: false
+      failed_when: false  # a timeout must not abort here; the final assert reports it with a clear message
+      until: >-
+        backup_job_status.resources | length == 1
+        and (
+          (
+            backup_job_status.resources[0].status.conditions | default([])
+            | selectattr('type', 'equalto', 'Complete')
+            | selectattr('status', 'equalto', 'True')
+            | list
+            | length
+          ) > 0
+          or (
+            backup_job_status.resources[0].status.conditions | default([])
+            | selectattr('type', 'equalto', 'Failed')
+            | selectattr('status', 'equalto', 'True')
+            | list
+            | length
+          ) > 0
+        )
+      retries: "{{ (backup_job_wait_timeout / backup_job_wait_sleep) | int }}"
+      delay: "{{ backup_job_wait_sleep }}"
+
+    - name: Record restic job result  # same two condition filters as the wait loop, stored as separate facts
+      ansible.builtin.set_fact:
+        backup_job_succeeded: >-
+          {{
+            (
+              backup_job_status.resources[0].status.conditions | default([])
+              | selectattr('type', 'equalto', 'Complete')
+              | selectattr('status', 'equalto', 'True')
+              | list
+              | length
+            ) > 0
+          }}
+        backup_job_failed: >-
+          {{
+            (
+              backup_job_status.resources[0].status.conditions | default([])
+              | selectattr('type', 'equalto', 'Failed')
+              | selectattr('status', 'equalto', 'True')
+              | list
+              | length
+            ) > 0
+          }}
+  always:  # logs are fetched and printed whether or not the block above failed
+    - name: Read restic job logs
+      kubernetes.core.k8s_log:
+        kubeconfig: "{{ kubeconfig_path }}"
+        context: "{{ kube_context }}"
+        namespace: "{{ backup_job_definition.metadata.namespace }}"
+        label_selectors:
+          - "job-name={{ backup_job_definition.metadata.name }}"
+      register: backup_job_logs
+      failed_when: false  # best effort: missing logs must not mask the real job result
+
+    - name: Print restic job logs
+      ansible.builtin.debug:
+        msg: "{{ backup_job_logs.log_lines | default([]) }}"
+
+- name: Assert restic job succeeded  # neither fact set (timeout) or Failed set both end here
+  ansible.builtin.assert:
+    that:
+      - backup_job_succeeded | default(false) | bool
+      - not (backup_job_failed | default(false) | bool)
+    fail_msg: "Restic job {{ backup_job_definition.metadata.namespace }}/{{ backup_job_definition.metadata.name }} failed or timed out."
diff --git a/ansible/roles/backup/tasks/snapshots.yml b/ansible/roles/backup/tasks/snapshots.yml
new file mode 100644
index 000000000..c29fb6066
--- /dev/null
+++ b/ansible/roles/backup/tasks/snapshots.yml
@@ -0,0 +1,17 @@
+---
+- name: Build snapshots job  # build_job.yml is expected to publish backup_job_definition for the tasks below
+  ansible.builtin.include_tasks: build_job.yml
+  vars:
+    backup_job_name: "restic-snapshots-{{ lookup('ansible.builtin.pipe', 'date -u +%Y%m%d%H%M%S') }}"
+    backup_job_component: snapshots
+    backup_job_script: |
+      set -eu
+      restic snapshots --host {{ backup_restic_host }} --tag {{ backup_restic_tag }}
+
+- name: Record snapshots job  # keep the built manifest under a snapshots-specific fact name
+  ansible.builtin.set_fact:
+    backup_snapshots_job: "{{ backup_job_definition }}"
+
+- name: Run snapshots job  # skipped in test mode, so the job is built but never applied
+  when: not (backup_test_mode | bool)
+  ansible.builtin.include_tasks: run_job.yml
diff --git a/ansible/tests/backup.yml b/ansible/tests/backup.yml
new file mode 100644
index 000000000..1d368b127
--- /dev/null
+++ b/ansible/tests/backup.yml
@@ -0,0 +1,235 @@
+---
+- name: Validate backup and restore role contracts  # localhost-only checks; every role include passes backup_test_mode
+  hosts: localhost
+  gather_facts: false
+  vars:
+    backup_test_common:  # values forwarded to each role include below
+      backup_test_mode: true
+      backup_repo_root: "{{ playbook_dir }}/../.."
+    backup_operator_surface_files:  # operator-facing files scanned for exposed or retired commands
+      - Taskfile.yaml
+      - README.md
+      - docs/dr/restore.md
+      - docs/dr/drill.md
+    backup_role_contract_files:  # role task files whose text must contain the failure-handling markers
+      - ansible/roles/backup/tasks/run_job.yml
+
+  tasks:
+    - name: Read backup operator surface files  # slurp returns base64 content, decoded in the asserts
+      ansible.builtin.slurp:
+        src: "{{ backup_test_common.backup_repo_root }}/{{ item }}"
+      loop: "{{ backup_operator_surface_files }}"
+      register: backup_operator_surface_file_contents
+
+    - name: Assert consistent backup command is not exposed  # all three spellings must be absent from every file
+      ansible.builtin.assert:
+        that:
+          - "'backup:create-consistent' not in (item.content | b64decode)"
+          - "'backup-create-consistent' not in (item.content | b64decode)"
+          - "'create_consistent' not in (item.content | b64decode)"
+        fail_msg: "Consistent backup command is still exposed in {{ item.item }}"
+      loop: "{{ backup_operator_surface_file_contents.results }}"
+      loop_control:
+        label: "{{ item.item }}"
+
+    - name: Assert drill evidence log is not required  # the last check lowercases the file text before matching
+      ansible.builtin.assert:
+        that:
+          - "'Evidence Log' not in (item.content | b64decode)"
+          - "'Record the result' not in (item.content | b64decode)"
+          - "'record date' not in ((item.content | b64decode) | lower)"
+        fail_msg: "DR drill docs still require recording drill evidence in {{ item.item }}"
+      loop: "{{ backup_operator_surface_file_contents.results }}"
+      loop_control:
+        label: "{{ item.item }}"
+
+    - name: Assert restore all appdata command is exposed  # picks individual slurp results by their item path
+      ansible.builtin.assert:
+        that:
+          - "'restore:all-appdata' in (backup_operator_surface_file_contents.results | selectattr('item', 'equalto', 'Taskfile.yaml') | first).content | b64decode"
+          - "'restore-all-appdata.yml' in (backup_operator_surface_file_contents.results | selectattr('item', 'equalto', 'Taskfile.yaml') | first).content | b64decode"
+          - "'RESTORE_ALL' in (backup_operator_surface_file_contents.results | selectattr('item', 'equalto', 'Taskfile.yaml') | first).content | b64decode"
+          - "'restore:all-appdata' in (backup_operator_surface_file_contents.results | selectattr('item', 'equalto', 'README.md') | first).content | b64decode"
+          - "'restore:all-appdata' in (backup_operator_surface_file_contents.results | selectattr('item', 'equalto', 'docs/dr/restore.md') | first).content | b64decode"
+        fail_msg: "Bulk appdata restore command is not exposed in the operator docs."
+
+    - name: Read backup role contract files
+      ansible.builtin.slurp:
+        src: "{{ backup_test_common.backup_repo_root }}/{{ item }}"
+      loop: "{{ backup_role_contract_files }}"
+      register: backup_role_contract_file_contents
+
+    - name: Assert restic jobs fail fast and expose logs  # plain substring checks on the task file text, not on behavior
+      ansible.builtin.assert:
+        that:
+          - "'Failed' in (item.content | b64decode)"
+          - "'backup_job_succeeded' in (item.content | b64decode)"
+          - "'backup_job_failed' in (item.content | b64decode)"
+          - "'failed_when' in (item.content | b64decode)"
+        fail_msg: "Restic job runner does not explicitly handle failed Jobs in {{ item.item }}"
+      loop: "{{ backup_role_contract_file_contents.results }}"
+      loop_control:
+        label: "{{ item.item }}"
+
+    - name: Build generic restic job with extra env  # exercises build_job directly with one extra env entry
+      ansible.builtin.include_role:
+        name: backup
+        tasks_from: build_job
+      vars:
+        backup_job_name: restic-contract-test
+        backup_job_component: contract-test
+        backup_job_script: "restic version"
+        backup_job_env_extra:
+          - name: TEST_EXTRA
+            value: present
+
+    - name: Assert generic restic job keeps extra env
+      ansible.builtin.assert:
+        that:
+          - backup_job_definition.spec.template.spec.containers[0].env | selectattr('name', 'equalto', 'TEST_EXTRA') | list | length == 1
+
+    - name: Reject unknown restore app  # negative test: the include must fail so that rescue can flag it
+      block:
+        - name: Try unknown app restore
+          ansible.builtin.include_role:
+            name: backup
+            tasks_from: restore_app
+          vars:
+            backup_app: not-a-real-app
+            backup_confirm: RESTORE
+            backup_snapshot: latest
+            backup_test_mode: "{{ backup_test_common.backup_test_mode }}"
+            backup_repo_root: "{{ backup_test_common.backup_repo_root }}"
+      rescue:
+        - name: Record unknown app rejection
+          ansible.builtin.set_fact:
+            backup_unknown_app_rejected: true
+
+    - name: Assert unknown app was rejected  # fact stays undefined (default false) if the include succeeded
+      ansible.builtin.assert:
+        that:
+          - backup_unknown_app_rejected | default(false)
+
+    - name: Reject unsupported restore mode  # negative test using the echo app
+      block:
+        - name: Try disposable app restore
+          ansible.builtin.include_role:
+            name: backup
+            tasks_from: restore_app
+          vars:
+            backup_app: echo
+            backup_confirm: RESTORE
+            backup_snapshot: latest
+            backup_test_mode: "{{ backup_test_common.backup_test_mode }}"
+            backup_repo_root: "{{ backup_test_common.backup_repo_root }}"
+      rescue:
+        - name: Record unsupported restore rejection
+          ansible.builtin.set_fact:
+            backup_unsupported_app_rejected: true
+
+    - name: Assert unsupported restore mode was rejected
+      ansible.builtin.assert:
+        that:
+          - backup_unsupported_app_rejected | default(false)
+
+    - name: Build restore job for valid app  # positive path: atuin in test mode, job built but not run
+      ansible.builtin.include_role:
+        name: backup
+        tasks_from: restore_app
+      vars:
+        backup_app: atuin
+        backup_confirm: RESTORE
+        backup_snapshot: latest
+        backup_test_mode: "{{ backup_test_common.backup_test_mode }}"
+        backup_repo_root: "{{ backup_test_common.backup_repo_root }}"
+
+    - name: Assert restore job contract  # command[2] is assumed to hold the rendered job script; confirm in build_job.yml
+      ansible.builtin.assert:
+        that:
+          - backup_app_metadata.dr.restore.paths is not defined
+          - backup_app_restore_paths == ["selfhosted/atuin"]
+          - backup_restore_job.metadata.namespace == "selfhosted"
+          - backup_restore_job.spec.template.spec.volumes[0].persistentVolumeClaim.claimName == "restic-appdata"
+          - backup_restore_job.spec.template.spec.containers[0].volumeMounts[0].mountPath == "/restore-root/data/appdata"
+          - "'/data/appdata/selfhosted/atuin' in backup_restore_job.spec.template.spec.containers[0].command[2]"
+          - "'--target /restore-staging' in backup_restore_job.spec.template.spec.containers[0].command[2]"
+          - "'/restore-root/data/appdata/selfhosted/atuin' in backup_restore_job.spec.template.spec.containers[0].command[2]"
+          - '''find "$STAGED" -mindepth 1 -print -quit | grep -q .'' in backup_restore_job.spec.template.spec.containers[0].command[2]'
+          - "'restore_rollback' in backup_restore_job.spec.template.spec.containers[0].command[2]"
+          - "'trap restore_rollback EXIT' in backup_restore_job.spec.template.spec.containers[0].command[2]"
+          - backup_restore_job.spec.template.spec.containers[0].env | selectattr('name', 'equalto', 'RESTIC_SNAPSHOT') | list | length == 1
+          - backup_argocd_pause_patch.metadata.annotations["argocd.argoproj.io/skip-reconcile"] == "true"
+          - backup_argocd_resume_patch.metadata.annotations["argocd.argoproj.io/skip-reconcile"] is none
+
+    - name: Reject restore all without bulk confirmation  # negative test: passes RESTORE where the bulk run uses RESTORE_ALL
+      block:
+        - name: Try restore all without confirmation
+          ansible.builtin.include_role:
+            name: backup
+            tasks_from: restore_all_appdata
+          vars:
+            backup_confirm: RESTORE
+            backup_snapshot: latest
+            backup_test_mode: "{{ backup_test_common.backup_test_mode }}"
+            backup_repo_root: "{{ backup_test_common.backup_repo_root }}"
+      rescue:
+        - name: Record restore all confirmation rejection
+          ansible.builtin.set_fact:
+            backup_restore_all_confirmation_rejected: true
+
+    - name: Assert restore all confirmation was rejected
+      ansible.builtin.assert:
+        that:
+          - backup_restore_all_confirmation_rejected | default(false)
+
+    - name: Build restore all appdata job  # positive path for the bulk restore
+      ansible.builtin.include_role:
+        name: backup
+        tasks_from: restore_all_appdata
+      vars:
+        backup_confirm: RESTORE_ALL
+        backup_snapshot: latest
+        backup_test_mode: "{{ backup_test_common.backup_test_mode }}"
+        backup_repo_root: "{{ backup_test_common.backup_repo_root }}"
+
+    - name: Assert restore all appdata job contract  # atuin and paperless must be included, echo must not
+      ansible.builtin.assert:
+        that:
+          - backup_restore_all_apps | selectattr('app_name', 'equalto', 'atuin') | list | length == 1
+          - backup_restore_all_apps | selectattr('app_name', 'equalto', 'paperless') | list | length == 1
+          - backup_restore_all_apps | selectattr('app_name', 'equalto', 'echo') | list | length == 0
+          - "'selfhosted/atuin' in backup_restore_all_paths"
+          - "'selfhosted/paperless' in backup_restore_all_paths"
+          - "'selfhosted/echo' not in backup_restore_all_paths"
+          - backup_restore_all_job.metadata.namespace == "selfhosted"
+          - backup_restore_all_job.spec.template.spec.volumes[0].persistentVolumeClaim.claimName == "restic-appdata"
+          - backup_restore_all_job.spec.template.spec.containers[0].volumeMounts[0].mountPath == "/restore-root/data/appdata"
+          - "'/data/appdata/selfhosted/atuin' in backup_restore_all_job.spec.template.spec.containers[0].command[2]"
+          - "'/data/appdata/selfhosted/paperless' in backup_restore_all_job.spec.template.spec.containers[0].command[2]"
+          - "'/data/appdata/selfhosted/echo' not in backup_restore_all_job.spec.template.spec.containers[0].command[2]"
+          - "'--target /restore-staging' in backup_restore_all_job.spec.template.spec.containers[0].command[2]"
+          - '''find "$STAGED" -mindepth 1 -print -quit | grep -q .'' in backup_restore_all_job.spec.template.spec.containers[0].command[2]'
+          - "'restore_rollback' in backup_restore_all_job.spec.template.spec.containers[0].command[2]"
+          - "'ARCHIVE_INDEX' in backup_restore_all_job.spec.template.spec.containers[0].command[2]"
+          - backup_restore_all_job.spec.template.spec.containers[0].env | selectattr('name', 'equalto', 'RESTIC_SNAPSHOT') | list | length == 1
+          - backup_restore_all_argocd_pause_patches | selectattr('metadata.name', 'equalto', 'atuin') | list | length == 1
+          - backup_restore_all_argocd_pause_patches | selectattr('metadata.name', 'equalto', 'paperless') | list | length == 1
+          - backup_restore_all_argocd_pause_patches | selectattr('metadata.name', 'equalto', 'echo') | list | length == 0
+
+    - name: Build drill job for valid app  # no backup_confirm is passed for the drill
+      ansible.builtin.include_role:
+        name: backup
+        tasks_from: drill
+      vars:
+        backup_app: atuin
+        backup_snapshot: latest
+        backup_test_mode: "{{ backup_test_common.backup_test_mode }}"
+        backup_repo_root: "{{ backup_test_common.backup_repo_root }}"
+
+    - name: Assert drill job derives appdata path
+      ansible.builtin.assert:
+        that:
+          - backup_drill_job.metadata.namespace == "selfhosted"
+          - backup_app_restore_paths == ["selfhosted/atuin"]
+          - "'/data/appdata/selfhosted/atuin' in backup_drill_job.spec.template.spec.containers[0].command[2]"
+          - backup_drill_job.spec.template.spec.containers[0].env | selectattr('name', 'equalto', 'RESTIC_SNAPSHOT') | list | length == 1
diff --git a/ansible/tests/layout.yml b/ansible/tests/layout.yml index cd0aefdb6..e7c6a0101 100644 --- a/ansible/tests/layout.yml +++ b/ansible/tests/layout.yml @@ -11,6 +11,7 @@ - ansible/inventory/hosts.yml - ansible/inventory/group_vars/all.yml - ansible/roles/argocd/defaults/main.yml + - ansible/roles/backup/defaults/main.yml - ansible/roles/platform/defaults/main.yml - ansible/roles/talos/defaults/main.yml - ansible/roles/talos/vars/main.yml @@ -25,10 +26,21 @@ - ansible/roles/platform/tasks/bootstrap.yml - ansible/roles/platform/tasks/destroy.yml - ansible/roles/argocd/tasks/refresh.yml + - ansible/roles/backup/tasks/snapshots.yml + - ansible/roles/backup/tasks/check.yml + - ansible/roles/backup/tasks/restore_app.yml + - ansible/roles/backup/tasks/restore_all_appdata.yml + - ansible/roles/backup/tasks/collect_restore_all_app.yml + - ansible/roles/backup/tasks/drill.yml - ansible/roles/tofu/tasks/plan.yml - ansible/roles/tofu/tasks/apply.yml - ansible/roles/tofu/tasks/clean.yml - .ansible-lint + - ansible/playbooks/backup-snapshots.yml + - ansible/playbooks/backup-check.yml + - ansible/playbooks/restore-app.yml + - ansible/playbooks/restore-all-appdata.yml + - ansible/playbooks/dr-drill.yml layout_removed_paths: - ansible/ansible.cfg - ansible/group_vars/all.yml @@ -38,6 +50,9 @@ - scripts/talos-cluster.sh - scripts/test-ansible-argo-refresh.sh - scripts/test-ansible-playbooks.sh + - ansible/playbooks/backup-create-consistent.yml + - ansible/roles/backup/tasks/collect_restore_app.yml + - ansible/roles/backup/tasks/create_consistent.yml tasks: - name: Check required files diff --git 
a/ansible/tests/native-conventions.yml b/ansible/tests/native-conventions.yml index c53fc426d..f71f744f5 100644 --- a/ansible/tests/native-conventions.yml +++ b/ansible/tests/native-conventions.yml @@ -6,6 +6,7 @@ conventions_repo_root: "{{ playbook_dir }}/../.." conventions_role_files: - ansible/roles/argocd/defaults/main.yml + - ansible/roles/backup/defaults/main.yml - ansible/roles/platform/defaults/main.yml - ansible/roles/talos/defaults/main.yml - ansible/roles/talos/vars/main.yml @@ -103,6 +104,7 @@ ansible.builtin.find: paths: - "{{ conventions_repo_root }}/ansible/roles/argocd/tasks" + - "{{ conventions_repo_root }}/ansible/roles/backup/tasks" - "{{ conventions_repo_root }}/ansible/roles/platform/tasks" - "{{ conventions_repo_root }}/ansible/roles/talos/tasks" - "{{ conventions_repo_root }}/ansible/roles/tofu/tasks" diff --git a/apps/argocd/argocd/app.yaml b/apps/argocd/argocd/app.yaml index f27eb577c..ea2b0a5bb 100644 --- a/apps/argocd/argocd/app.yaml +++ b/apps/argocd/argocd/app.yaml @@ -15,3 +15,7 @@ ignoreDifferences: - /data/admin.passwordMtime - /metadata/annotations/argocd.argoproj.io~1tracking-id - /metadata/annotations/reconcile.external-secrets.io~1data-hash +dr: + tier: "platform" + restore: + mode: "gitops" diff --git a/apps/home-automation/homeassistant/app.yaml b/apps/home-automation/homeassistant/app.yaml index f456da8fc..9db81410f 100644 --- a/apps/home-automation/homeassistant/app.yaml +++ b/apps/home-automation/homeassistant/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "critical" + restore: + mode: "restic-appdata" diff --git a/apps/home-automation/scrypted/app.yaml b/apps/home-automation/scrypted/app.yaml index f456da8fc..9db81410f 100644 --- a/apps/home-automation/scrypted/app.yaml +++ b/apps/home-automation/scrypted/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "critical" + restore: + mode: "restic-appdata" diff --git a/apps/kube-system/coredns/app.yaml 
b/apps/kube-system/coredns/app.yaml index d3bda9fdf..00fbe6638 100644 --- a/apps/kube-system/coredns/app.yaml +++ b/apps/kube-system/coredns/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "-4" +dr: + tier: "platform" + restore: + mode: "gitops" diff --git a/apps/kube-system/k8s-gateway/app.yaml b/apps/kube-system/k8s-gateway/app.yaml index 4be65df33..9630830c6 100644 --- a/apps/kube-system/k8s-gateway/app.yaml +++ b/apps/kube-system/k8s-gateway/app.yaml @@ -5,3 +5,7 @@ chart: version: 3.7.1 sync: wave: "-4" +dr: + tier: "platform" + restore: + mode: "gitops" diff --git a/apps/kube-system/k8tz/app.yaml b/apps/kube-system/k8tz/app.yaml index 73680086e..5392350ed 100644 --- a/apps/kube-system/k8tz/app.yaml +++ b/apps/kube-system/k8tz/app.yaml @@ -5,3 +5,7 @@ chart: version: 0.19.0 sync: wave: "-1" +dr: + tier: "platform" + restore: + mode: "gitops" diff --git a/apps/kube-system/metrics-server/app.yaml b/apps/kube-system/metrics-server/app.yaml index 16f914230..9e1dbb1c0 100644 --- a/apps/kube-system/metrics-server/app.yaml +++ b/apps/kube-system/metrics-server/app.yaml @@ -5,3 +5,7 @@ chart: version: 3.13.1 sync: wave: "-3" +dr: + tier: "platform" + restore: + mode: "gitops" diff --git a/apps/kube-system/multus/app.yaml b/apps/kube-system/multus/app.yaml index 030c33ebd..c3322a0f8 100644 --- a/apps/kube-system/multus/app.yaml +++ b/apps/kube-system/multus/app.yaml @@ -5,3 +5,7 @@ chart: version: 1.3.2 sync: wave: "-4" +dr: + tier: "platform" + restore: + mode: "gitops" diff --git a/apps/kube-system/nfs-provisioner/app.yaml b/apps/kube-system/nfs-provisioner/app.yaml index 208cb6ef6..1176148c3 100644 --- a/apps/kube-system/nfs-provisioner/app.yaml +++ b/apps/kube-system/nfs-provisioner/app.yaml @@ -5,3 +5,7 @@ chart: version: 4.13.2 sync: wave: "-3" +dr: + tier: "platform" + restore: + mode: "gitops" diff --git a/apps/media/bazarr/app.yaml b/apps/media/bazarr/app.yaml index f456da8fc..bfa92f64e 100644 --- a/apps/media/bazarr/app.yaml +++ 
b/apps/media/bazarr/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "standard" + restore: + mode: "restic-appdata" diff --git a/apps/media/flaresolverr/app.yaml b/apps/media/flaresolverr/app.yaml index f456da8fc..c9f474b37 100644 --- a/apps/media/flaresolverr/app.yaml +++ b/apps/media/flaresolverr/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "disposable" + restore: + mode: "none" diff --git a/apps/media/plex/app.yaml b/apps/media/plex/app.yaml index f456da8fc..bfa92f64e 100644 --- a/apps/media/plex/app.yaml +++ b/apps/media/plex/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "standard" + restore: + mode: "restic-appdata" diff --git a/apps/media/plextraktsync/app.yaml b/apps/media/plextraktsync/app.yaml index f456da8fc..bfa92f64e 100644 --- a/apps/media/plextraktsync/app.yaml +++ b/apps/media/plextraktsync/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "standard" + restore: + mode: "restic-appdata" diff --git a/apps/media/prowlarr/app.yaml b/apps/media/prowlarr/app.yaml index f456da8fc..bfa92f64e 100644 --- a/apps/media/prowlarr/app.yaml +++ b/apps/media/prowlarr/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "standard" + restore: + mode: "restic-appdata" diff --git a/apps/media/qbittorrent/app.yaml b/apps/media/qbittorrent/app.yaml index f456da8fc..bfa92f64e 100644 --- a/apps/media/qbittorrent/app.yaml +++ b/apps/media/qbittorrent/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "standard" + restore: + mode: "restic-appdata" diff --git a/apps/media/radarr/app.yaml b/apps/media/radarr/app.yaml index f456da8fc..bfa92f64e 100644 --- a/apps/media/radarr/app.yaml +++ b/apps/media/radarr/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "standard" + restore: + mode: "restic-appdata" diff --git a/apps/media/recyclarr/app.yaml b/apps/media/recyclarr/app.yaml index 
f456da8fc..bfa92f64e 100644 --- a/apps/media/recyclarr/app.yaml +++ b/apps/media/recyclarr/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "standard" + restore: + mode: "restic-appdata" diff --git a/apps/media/sonarr/app.yaml b/apps/media/sonarr/app.yaml index f456da8fc..bfa92f64e 100644 --- a/apps/media/sonarr/app.yaml +++ b/apps/media/sonarr/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "standard" + restore: + mode: "restic-appdata" diff --git a/apps/media/unpackerr/app.yaml b/apps/media/unpackerr/app.yaml index f456da8fc..b44d7d979 100644 --- a/apps/media/unpackerr/app.yaml +++ b/apps/media/unpackerr/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "media" + restore: + mode: "external" diff --git a/apps/platform-system/cert-manager/app.yaml b/apps/platform-system/cert-manager/app.yaml index bbd69161e..392da1956 100644 --- a/apps/platform-system/cert-manager/app.yaml +++ b/apps/platform-system/cert-manager/app.yaml @@ -5,3 +5,7 @@ chart: version: v1.20.2 sync: wave: "-4" +dr: + tier: "platform" + restore: + mode: "gitops" diff --git a/apps/platform-system/external-dns/app.yaml b/apps/platform-system/external-dns/app.yaml index 5ef91eab5..cbc96f759 100644 --- a/apps/platform-system/external-dns/app.yaml +++ b/apps/platform-system/external-dns/app.yaml @@ -5,3 +5,7 @@ chart: version: 1.21.1 sync: wave: "-3" +dr: + tier: "platform" + restore: + mode: "gitops" diff --git a/apps/platform-system/external-secrets/app.yaml b/apps/platform-system/external-secrets/app.yaml index a72bc3097..cc12f09b3 100644 --- a/apps/platform-system/external-secrets/app.yaml +++ b/apps/platform-system/external-secrets/app.yaml @@ -5,3 +5,7 @@ chart: version: 2.6.0 sync: wave: "-4" +dr: + tier: "platform" + restore: + mode: "gitops" diff --git a/apps/platform-system/gateway-api/app.yaml b/apps/platform-system/gateway-api/app.yaml index d3bda9fdf..00fbe6638 100644 --- 
a/apps/platform-system/gateway-api/app.yaml +++ b/apps/platform-system/gateway-api/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "-4" +dr: + tier: "platform" + restore: + mode: "gitops" diff --git a/apps/platform-system/istio-base/app.yaml b/apps/platform-system/istio-base/app.yaml index d5249ced0..d280b6bbe 100644 --- a/apps/platform-system/istio-base/app.yaml +++ b/apps/platform-system/istio-base/app.yaml @@ -5,3 +5,7 @@ chart: version: 1.30.1 sync: wave: "-4" +dr: + tier: "platform" + restore: + mode: "gitops" diff --git a/apps/platform-system/istio/app.yaml b/apps/platform-system/istio/app.yaml index 61d37c554..aef6da53a 100644 --- a/apps/platform-system/istio/app.yaml +++ b/apps/platform-system/istio/app.yaml @@ -5,3 +5,7 @@ chart: version: 1.30.1 sync: wave: "-2" +dr: + tier: "platform" + restore: + mode: "gitops" diff --git a/apps/platform-system/reloader/app.yaml b/apps/platform-system/reloader/app.yaml index 1d28fb9f1..ed52e7fd3 100644 --- a/apps/platform-system/reloader/app.yaml +++ b/apps/platform-system/reloader/app.yaml @@ -5,3 +5,7 @@ chart: version: 2.2.12 sync: wave: "-4" +dr: + tier: "platform" + restore: + mode: "gitops" diff --git a/apps/platform-system/tailscale-router/app.yaml b/apps/platform-system/tailscale-router/app.yaml index 1762c1aa8..afac14ff8 100644 --- a/apps/platform-system/tailscale-router/app.yaml +++ b/apps/platform-system/tailscale-router/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "-3" +dr: + tier: "platform" + restore: + mode: "gitops" diff --git a/apps/platform-system/tuppr/app.yaml b/apps/platform-system/tuppr/app.yaml index 5643bcb50..ee6ffe72b 100644 --- a/apps/platform-system/tuppr/app.yaml +++ b/apps/platform-system/tuppr/app.yaml @@ -5,3 +5,7 @@ chart: version: 0.2.5 sync: wave: "-3" +dr: + tier: "platform" + restore: + mode: "gitops" diff --git a/apps/selfhosted/atuin/app.yaml b/apps/selfhosted/atuin/app.yaml index f456da8fc..9db81410f 100644 --- a/apps/selfhosted/atuin/app.yaml +++ 
b/apps/selfhosted/atuin/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "critical" + restore: + mode: "restic-appdata" diff --git a/apps/selfhosted/bambuddy/app.yaml b/apps/selfhosted/bambuddy/app.yaml index f456da8fc..bfa92f64e 100644 --- a/apps/selfhosted/bambuddy/app.yaml +++ b/apps/selfhosted/bambuddy/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "standard" + restore: + mode: "restic-appdata" diff --git a/apps/selfhosted/changedetection/app.yaml b/apps/selfhosted/changedetection/app.yaml index f456da8fc..bfa92f64e 100644 --- a/apps/selfhosted/changedetection/app.yaml +++ b/apps/selfhosted/changedetection/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "standard" + restore: + mode: "restic-appdata" diff --git a/apps/selfhosted/echo/app.yaml b/apps/selfhosted/echo/app.yaml index f456da8fc..c9f474b37 100644 --- a/apps/selfhosted/echo/app.yaml +++ b/apps/selfhosted/echo/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "disposable" + restore: + mode: "none" diff --git a/apps/selfhosted/gatus/app.yaml b/apps/selfhosted/gatus/app.yaml index f456da8fc..bfa92f64e 100644 --- a/apps/selfhosted/gatus/app.yaml +++ b/apps/selfhosted/gatus/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "standard" + restore: + mode: "restic-appdata" diff --git a/apps/selfhosted/homepage/app.yaml b/apps/selfhosted/homepage/app.yaml index f456da8fc..c9f474b37 100644 --- a/apps/selfhosted/homepage/app.yaml +++ b/apps/selfhosted/homepage/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "disposable" + restore: + mode: "none" diff --git a/apps/selfhosted/karakeep/app.yaml b/apps/selfhosted/karakeep/app.yaml index f456da8fc..9db81410f 100644 --- a/apps/selfhosted/karakeep/app.yaml +++ b/apps/selfhosted/karakeep/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "critical" + restore: + mode: "restic-appdata" 
diff --git a/apps/selfhosted/paperless/app.yaml b/apps/selfhosted/paperless/app.yaml index f456da8fc..9db81410f 100644 --- a/apps/selfhosted/paperless/app.yaml +++ b/apps/selfhosted/paperless/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "critical" + restore: + mode: "restic-appdata" diff --git a/apps/selfhosted/renovate-operator/app.yaml b/apps/selfhosted/renovate-operator/app.yaml index ecbb3eea4..fb41b4d06 100644 --- a/apps/selfhosted/renovate-operator/app.yaml +++ b/apps/selfhosted/renovate-operator/app.yaml @@ -5,3 +5,7 @@ chart: version: 4.11.0 sync: wave: "-3" +dr: + tier: "platform" + restore: + mode: "gitops" diff --git a/apps/selfhosted/restic/app.yaml b/apps/selfhosted/restic/app.yaml index 1762c1aa8..d76ab2059 100644 --- a/apps/selfhosted/restic/app.yaml +++ b/apps/selfhosted/restic/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "-3" +dr: + tier: "platform" + restore: + mode: "external" diff --git a/apps/selfhosted/restic/values.yaml b/apps/selfhosted/restic/values.yaml index a887a27e0..bc7875112 100644 --- a/apps/selfhosted/restic/values.yaml +++ b/apps/selfhosted/restic/values.yaml @@ -104,6 +104,8 @@ controllers: - homelab - --tag - appdata + - --retry-lock + - 30m - --verbose env: RESTIC_REPOSITORY: rest:http://restic:$(RESTIC_PASSWORD)@restic.selfhosted.svc.cluster.local:8000/ @@ -122,7 +124,7 @@ controllers: maintenance: type: cronjob cronjob: - schedule: "0 4 * * 1" + schedule: "0 5 * * 1" pod: restartPolicy: OnFailure containers: @@ -143,9 +145,10 @@ controllers: --keep-monthly 12 \ --keep-yearly 3 \ --prune \ + --retry-lock 30m \ --verbose echo "Running check with full data verification..." 
- restic check --read-data --verbose + restic check --read-data --retry-lock 30m --verbose env: RESTIC_REPOSITORY: rest:http://restic:$(RESTIC_PASSWORD)@restic.selfhosted.svc.cluster.local:8000/ RESTIC_PASSWORD: diff --git a/apps/selfhosted/twitch-drops-miner/app.yaml b/apps/selfhosted/twitch-drops-miner/app.yaml index f456da8fc..bfa92f64e 100644 --- a/apps/selfhosted/twitch-drops-miner/app.yaml +++ b/apps/selfhosted/twitch-drops-miner/app.yaml @@ -5,3 +5,7 @@ chart: version: 5.0.1 sync: wave: "0" +dr: + tier: "standard" + restore: + mode: "restic-appdata" diff --git a/docs/dr/drill.md b/docs/dr/drill.md new file mode 100644 index 000000000..b39dff6b8 --- /dev/null +++ b/docs/dr/drill.md @@ -0,0 +1,22 @@ +--- +# DR Drill Procedure + +Run a restore drill monthly and after major storage, Talos, Kubernetes, or backup changes. + +## Drill Steps + +1. Confirm the local repository is readable: + ```bash + task backup:snapshots + ``` +2. Check repository integrity: + ```bash + task backup:check + ``` +3. Restore a small critical app into the drill path: + ```bash + task dr:drill app=atuin snapshot=latest + ``` +4. Confirm the command completed successfully. + +The drill task derives the appdata path from the app namespace and rendered PVC name, restores into `.drill/-` under appdata, asserts restored content exists, and removes the drill path when the Job exits. diff --git a/docs/dr/restore.md b/docs/dr/restore.md new file mode 100644 index 000000000..e53cb197a --- /dev/null +++ b/docs/dr/restore.md @@ -0,0 +1,81 @@ +--- +# Disaster Recovery Restore Runbook + +This runbook restores the homelab from GitOps state plus the local restic repository. The daily offsite backup of `/mnt/dpool/restic` is managed outside this repository; if TrueNAS local storage is lost, restore that directory from the offsite system before using the in-cluster restore commands here. + +## Recovery Inputs + +- GitHub access to this repository. 
+- Operator environment or `.envrc` with `TALOS_NODE`, `TALOS_CLUSTER_NAME`, `TALOS_INSTALL_DISK`, `BWS_ACCESS_TOKEN`, and Terraform backend credentials. +- `ANSIBLE_VAULT_PASSWORD` for `ansible/roles/talos/files/secrets.vault.yml`. +- Bitwarden Secrets Manager access for External Secrets and Terraform provider credentials. +- TrueNAS exports restored and reachable: + - `/mnt/spool/appdata` + - `/mnt/dpool/media` + - `/mnt/dpool/restic` + +## Cold Cluster Restore + +1. Clone the repository and check out the desired commit. +2. Load local operator credentials. +3. Install dependencies: + ```bash + task deps + ``` +4. Recreate Talos, Kubernetes, and bootstrap platform services: + ```bash + task cluster:create + ``` +5. Reapply external infrastructure if needed: + ```bash + task tf:apply + ``` +6. Let Argo CD discover apps, then refresh desired state: + ```bash + task argo:sync + ``` +7. Confirm the local restic repository is reachable: + ```bash + task backup:snapshots + task backup:check + ``` + +## App Data Restore + +Use `restore:app` for a surgical restore. The restore task pauses Argo CD reconciliation, scales Deployments to zero, suspends CronJobs for the app, restores into staging, moves the live path to `.pre-restore/`, moves staged data into place, then restores workload state. + +For `restic-appdata` apps, the appdata path is derived as `<namespace>/<pvc-name>` from the app directory and rendered PVC name. Do not maintain restore paths by hand in `app.yaml`. + +```bash +task restore:app app=paperless snapshot=latest confirm=RESTORE +``` + +For cold DR when the appdata root needs to be rebuilt, restore all `restic-appdata` apps in one operation: + +```bash +task restore:all-appdata snapshot=latest confirm=RESTORE_ALL +``` + +Use an explicit snapshot ID for important restores: + +```bash +task restore:app app=paperless snapshot=<snapshot-id> confirm=RESTORE +task restore:all-appdata snapshot=<snapshot-id> confirm=RESTORE_ALL +``` + +After each app restore: + +1. Run `task argo:sync app=<app>`. +2.
Wait for the app to become healthy. +3. Verify login and expected data manually. +4. Keep `.pre-restore/--` until the restored app is accepted. + +## Consistency Notes + +The scheduled restic CronJob backs up live appdata and is crash-consistent. This repository does not provide a quiesced appdata backup command because the intended operating model is to rely on the scheduled backups, verify the repository with `task backup:check`, and prove recoverability with `task dr:drill`. + +For SQLite-heavy or critical apps, prefer app-native exports or a storage-level snapshot workflow if stronger consistency is needed later. Keep those mechanisms documented next to the app if they are added. + +## Offsite Dependency + +This repository does not operate or verify the daily offsite backup. If `/mnt/dpool/restic` is unavailable, complete the external offsite restore of that directory first, then run `task backup:snapshots` before restoring apps. diff --git a/policy/metadata/app_metadata.rego b/policy/metadata/app_metadata.rego index aacf90b74..d676aad76 100644 --- a/policy/metadata/app_metadata.rego +++ b/policy/metadata/app_metadata.rego @@ -3,6 +3,8 @@ package main import rego.v1 allowed_sync_waves := {"-4", "-3", "-2", "-1", "0"} +allowed_dr_tiers := {"critical", "standard", "platform", "media", "disposable"} +allowed_dr_restore_modes := {"restic-appdata", "gitops", "external", "none"} deny contains msg if { some app in input.apps @@ -61,6 +63,84 @@ deny contains msg if { msg := sprintf("sync.wave must be one of -4, -3, -2, -1, 0 in %s", [app.app_file]) } +deny contains msg if { + some app in input.apps + app.has_app_file + app.dr_tier == "" + msg := sprintf("Missing dr.tier in %s", [app.app_file]) +} + +deny contains msg if { + some app in input.apps + app.has_app_file + app.dr_tier != "" + not allowed_dr_tiers[app.dr_tier] + msg := sprintf("dr.tier must be one of critical, standard, platform, media, disposable in %s", [app.app_file]) +} + +deny contains msg if { + some 
app in input.apps + app.has_app_file + app.dr_restore_mode == "" + msg := sprintf("Missing dr.restore.mode in %s", [app.app_file]) +} + +deny contains msg if { + some app in input.apps + app.has_app_file + app.dr_restore_mode != "" + not allowed_dr_restore_modes[app.dr_restore_mode] + msg := sprintf("dr.restore.mode must be one of restic-appdata, gitops, external, none in %s", [app.app_file]) +} + +deny contains msg if { + some app in input.apps + app.has_app_file + app.has_dr_restore_paths + msg := sprintf("dr.restore.paths must not be set in %s; restic-appdata paths are derived from namespace and PVC name", [app.app_file]) +} + +deny contains msg if { + some app in input.apps + app.has_app_file + app.dr_restore_mode == "restic-appdata" + app.local_persistent_volume_claim_count != 1 + msg := sprintf("restic-appdata apps must define exactly one local non-existingClaim PVC in %s", [app.values_file]) +} + +deny contains msg if { + some app in input.apps + app.has_app_file + app.dr_restore_mode == "restic-appdata" + app.has_name_override + msg := sprintf("restic-appdata apps must not set nameOverride or fullnameOverride in %s", [app.values_file]) +} + +deny contains msg if { + some app in input.apps + app.has_app_file + app.dr_restore_mode == "restic-appdata" + app.has_fullname_override + msg := sprintf("restic-appdata apps must not set nameOverride or fullnameOverride in %s", [app.values_file]) +} + +deny contains msg if { + some app in input.apps + app.has_app_file + app.dr_tier == "critical" + app.dr_restore_mode == "none" + msg := sprintf("critical apps must not use dr.restore.mode none in %s", [app.app_file]) +} + +deny contains msg if { + some app in input.apps + app.has_app_file + app.local_persistent_volume_claim_count > 0 + app.dr_tier != "disposable" + app.dr_restore_mode != "restic-appdata" + msg := sprintf("apps with local PVCs must use dr.restore.mode restic-appdata unless dr.tier is disposable in %s", [app.app_file]) +} + deny contains msg if { some app 
in input.apps app.has_app_file @@ -97,3 +177,12 @@ deny contains msg if { first.generated_name == second.generated_name msg := sprintf("Duplicate generated application name '%s': %s and %s", [first.generated_name, first.path, second.path]) } + +deny contains msg if { + some i, j + i < j + first := input.apps[i] + second := input.apps[j] + first.app_name == second.app_name + msg := sprintf("Duplicate Argo CD application name '%s': %s and %s", [first.app_name, first.path, second.path]) +} diff --git a/policy/metadata/app_metadata_test.rego b/policy/metadata/app_metadata_test.rego index 493bb317d..07a8008d2 100644 --- a/policy/metadata/app_metadata_test.rego +++ b/policy/metadata/app_metadata_test.rego @@ -16,6 +16,13 @@ valid_app := { "chart_version": "1.2.3", "chart_name": "", "sync_wave": "0", + "dr_tier": "standard", + "dr_restore_mode": "restic-appdata", + "has_dr_restore_paths": false, + "has_name_override": false, + "has_fullname_override": false, + "local_persistent_volume_claim_count": 1, + "has_existing_claim": false, "has_ignore_differences": false, "ignore_differences_type": "", "ignore_differences": [], @@ -69,6 +76,80 @@ test_missing_sync_wave_denied if { "Missing sync.wave in /repo/apps/selfhosted/demo/app.yaml" in deny with input as {"apps": [app]} } +test_missing_dr_tier_denied if { + app := object.union(valid_app, {"dr_tier": ""}) + "Missing dr.tier in /repo/apps/selfhosted/demo/app.yaml" in deny with input as {"apps": [app]} +} + +test_invalid_dr_tier_denied if { + app := object.union(valid_app, {"dr_tier": "gold"}) + "dr.tier must be one of critical, standard, platform, media, disposable in /repo/apps/selfhosted/demo/app.yaml" in deny with input as {"apps": [app]} +} + +test_missing_dr_restore_mode_denied if { + app := object.union(valid_app, {"dr_restore_mode": ""}) + "Missing dr.restore.mode in /repo/apps/selfhosted/demo/app.yaml" in deny with input as {"apps": [app]} +} + +test_invalid_dr_restore_mode_denied if { + app := object.union(valid_app, 
{"dr_restore_mode": "snapshot"}) + "dr.restore.mode must be one of restic-appdata, gitops, external, none in /repo/apps/selfhosted/demo/app.yaml" in deny with input as {"apps": [app]} +} + +test_dr_restore_paths_are_rejected if { + app := object.union(valid_app, {"has_dr_restore_paths": true}) + "dr.restore.paths must not be set in /repo/apps/selfhosted/demo/app.yaml; restic-appdata paths are derived from namespace and PVC name" in deny with input as {"apps": [app]} +} + +test_restic_appdata_requires_one_local_pvc if { + app := object.union(valid_app, {"local_persistent_volume_claim_count": 0}) + "restic-appdata apps must define exactly one local non-existingClaim PVC in /repo/apps/selfhosted/demo/values.yaml" in deny with input as {"apps": [app]} +} + +test_restic_appdata_rejects_multiple_local_pvcs if { + app := object.union(valid_app, {"local_persistent_volume_claim_count": 2}) + "restic-appdata apps must define exactly one local non-existingClaim PVC in /repo/apps/selfhosted/demo/values.yaml" in deny with input as {"apps": [app]} +} + +test_restic_appdata_rejects_name_override if { + app := object.union(valid_app, {"has_name_override": true}) + "restic-appdata apps must not set nameOverride or fullnameOverride in /repo/apps/selfhosted/demo/values.yaml" in deny with input as {"apps": [app]} +} + +test_restic_appdata_rejects_fullname_override if { + app := object.union(valid_app, {"has_fullname_override": true}) + "restic-appdata apps must not set nameOverride or fullnameOverride in /repo/apps/selfhosted/demo/values.yaml" in deny with input as {"apps": [app]} +} + +test_critical_app_cannot_use_no_restore if { + app := object.union(valid_app, { + "dr_tier": "critical", + "dr_restore_mode": "none", + }) + "critical apps must not use dr.restore.mode none in /repo/apps/selfhosted/demo/app.yaml" in deny with input as {"apps": [app]} +} + +test_local_pvc_app_requires_restic_or_disposable_classification if { + app := object.union(valid_app, { + "dr_tier": "standard", + 
"dr_restore_mode": "gitops", + "local_persistent_volume_claim_count": 1, + "has_existing_claim": false, + }) + "apps with local PVCs must use dr.restore.mode restic-appdata unless dr.tier is disposable in /repo/apps/selfhosted/demo/app.yaml" in deny with input as {"apps": [app]} +} + +test_disposable_local_pvc_app_can_skip_restic_restore if { + app := object.union(valid_app, { + "dr_tier": "disposable", + "dr_restore_mode": "none", + "local_persistent_volume_claim_count": 1, + "has_existing_claim": false, + }) + results := deny with input as {"apps": [app]} + count(results) == 0 +} + test_ignore_differences_must_be_list if { app := object.union(valid_app, { "has_ignore_differences": true, @@ -106,3 +187,15 @@ test_duplicate_generated_name_denied if { "Duplicate generated application name 'selfhosted-demo': /repo/apps/selfhosted/demo and /repo/apps/media/demo" in deny with input as {"apps": [valid_app, other_app]} } + +test_duplicate_app_basename_denied if { + other_app := object.union(valid_app, { + "path": "/repo/apps/media/demo", + "category": "media", + "generated_name": "media-demo", + "app_file": "/repo/apps/media/demo/app.yaml", + "values_file": "/repo/apps/media/demo/values.yaml", + }) + + "Duplicate Argo CD application name 'demo': /repo/apps/selfhosted/demo and /repo/apps/media/demo" in deny with input as {"apps": [valid_app, other_app]} +} diff --git a/scripts/validate-kubernetes.sh b/scripts/validate-kubernetes.sh index 7420a34d1..5f737b207 100755 --- a/scripts/validate-kubernetes.sh +++ b/scripts/validate-kubernetes.sh @@ -271,7 +271,10 @@ write_metadata_inventory() { local values_file="$app/values.yaml" local category="${app#"${apps_root}/"}" local app_name chart_repo chart_version chart_name sync_wave - local has_app_file has_values_file has_nonempty_values_file has_ignore_differences ignore_differences_type + local dr_tier dr_restore_mode has_dr_restore_paths + local local_persistent_volume_claim_count has_existing_claim + local has_app_file 
has_values_file has_nonempty_values_file has_name_override has_fullname_override + local has_ignore_differences ignore_differences_type category="${category%%/*}" app_name="${app##*/}" @@ -282,6 +285,13 @@ write_metadata_inventory() { chart_version="" chart_name="" sync_wave="" + dr_tier="" + dr_restore_mode="" + has_dr_restore_paths=false + local_persistent_volume_claim_count=0 + has_existing_claim=false + has_name_override=false + has_fullname_override=false has_ignore_differences=false ignore_differences_type="" @@ -291,6 +301,9 @@ write_metadata_inventory() { chart_version="$(yq eval '.chart.version // ""' "$app_file")" chart_name="$(yq eval '.chart.name // ""' "$app_file")" sync_wave="$(yq eval '.sync.wave // ""' "$app_file")" + dr_tier="$(yq eval '.dr.tier // ""' "$app_file")" + dr_restore_mode="$(yq eval '.dr.restore.mode // ""' "$app_file")" + has_dr_restore_paths="$(yq eval '.dr.restore | has("paths")' "$app_file" 2>/dev/null || echo false)" has_ignore_differences="$(yq eval 'has("ignoreDifferences")' "$app_file")" if [ "$has_ignore_differences" = "true" ]; then ignore_differences_type="$(yq eval '.ignoreDifferences | type' "$app_file")" @@ -302,6 +315,14 @@ write_metadata_inventory() { if [ -s "$values_file" ]; then has_nonempty_values_file=true fi + local_persistent_volume_claim_count="$( + yq eval '.persistence // {} | to_entries | map(select((.value.type // "persistentVolumeClaim") == "persistentVolumeClaim" and ((.value.existingClaim // "") == ""))) | length' "$values_file" + )" + if [ "$(yq eval '.persistence // {} | to_entries | map(select((.value.type // "persistentVolumeClaim") == "persistentVolumeClaim" and ((.value.existingClaim // "") != ""))) | length' "$values_file")" -gt 0 ]; then + has_existing_claim=true + fi + has_name_override="$(yq eval 'has("nameOverride")' "$values_file")" + has_fullname_override="$(yq eval 'has("fullnameOverride")' "$values_file")" fi printf ' - path: %s\n' "$(yaml_quote "$app")" @@ -317,6 +338,13 @@ 
write_metadata_inventory() { printf ' chart_version: %s\n' "$(yaml_quote "$chart_version")" printf ' chart_name: %s\n' "$(yaml_quote "$chart_name")" printf ' sync_wave: %s\n' "$(yaml_quote "$sync_wave")" + printf ' dr_tier: %s\n' "$(yaml_quote "$dr_tier")" + printf ' dr_restore_mode: %s\n' "$(yaml_quote "$dr_restore_mode")" + printf ' has_dr_restore_paths: %s\n' "$has_dr_restore_paths" + printf ' local_persistent_volume_claim_count: %s\n' "$local_persistent_volume_claim_count" + printf ' has_existing_claim: %s\n' "$has_existing_claim" + printf ' has_name_override: %s\n' "$has_name_override" + printf ' has_fullname_override: %s\n' "$has_fullname_override" printf ' has_ignore_differences: %s\n' "$has_ignore_differences" printf ' ignore_differences_type: %s\n' "$(yaml_quote "$ignore_differences_type")" printf ' ignore_differences:\n' From 8724f5b3836bebbcd391522593cab8c569100b38 Mon Sep 17 00:00:00 2001 From: Edgard Castro Date: Mon, 15 Jun 2026 16:40:57 +0200 Subject: [PATCH 2/2] fix(dr): restore appdata without xattrs Restic can fail restores on the NFS-backed appdata PVC when it attempts to replay server-managed xattrs such as system.nfs4_acl. Skip xattrs for app restores and drills so file content recovery remains reliable on the target storage. Keep the maintenance CronJob at the existing 04:00 schedule while retaining retry-lock behavior for repository lock contention. 
--- ansible/roles/backup/tasks/drill.yml | 1 + ansible/roles/backup/tasks/restore_all_appdata.yml | 1 + ansible/roles/backup/tasks/restore_app.yml | 1 + ansible/tests/backup.yml | 3 +++ apps/selfhosted/restic/values.yaml | 2 +- 5 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ansible/roles/backup/tasks/drill.yml b/ansible/roles/backup/tasks/drill.yml index 881a8c009..661a65db9 100644 --- a/ansible/roles/backup/tasks/drill.yml +++ b/ansible/roles/backup/tasks/drill.yml @@ -41,6 +41,7 @@ restic restore "$RESTIC_SNAPSHOT" \ --host {{ backup_restic_host }} \ --tag {{ backup_restic_tag }} \ + --exclude-xattr '*' \ --include "$RESTORE_INCLUDE" \ --target "$DRILL_PATH" RESTORED="$DRILL_PATH/data/appdata/${RESTORE_PATH}" diff --git a/ansible/roles/backup/tasks/restore_all_appdata.yml b/ansible/roles/backup/tasks/restore_all_appdata.yml index 909e0de5e..ca6ea93f4 100644 --- a/ansible/roles/backup/tasks/restore_all_appdata.yml +++ b/ansible/roles/backup/tasks/restore_all_appdata.yml @@ -113,6 +113,7 @@ restic restore "$RESTIC_SNAPSHOT" \ --host {{ backup_restic_host }} \ --tag {{ backup_restic_tag }} \ + --exclude-xattr '*' \ --include "$RESTORE_INCLUDE" \ --target {{ backup_restore_staging }} diff --git a/ansible/roles/backup/tasks/restore_app.yml b/ansible/roles/backup/tasks/restore_app.yml index 1da90ddf1..4aab40e22 100644 --- a/ansible/roles/backup/tasks/restore_app.yml +++ b/ansible/roles/backup/tasks/restore_app.yml @@ -83,6 +83,7 @@ restic restore "$RESTIC_SNAPSHOT" \ --host {{ backup_restic_host }} \ --tag {{ backup_restic_tag }} \ + --exclude-xattr '*' \ --include "$RESTORE_INCLUDE" \ --target {{ backup_restore_staging }} diff --git a/ansible/tests/backup.yml b/ansible/tests/backup.yml index 1d368b127..7337a2496 100644 --- a/ansible/tests/backup.yml +++ b/ansible/tests/backup.yml @@ -152,6 +152,7 @@ - backup_restore_job.spec.template.spec.volumes[0].persistentVolumeClaim.claimName == "restic-appdata" - 
backup_restore_job.spec.template.spec.containers[0].volumeMounts[0].mountPath == "/restore-root/data/appdata" - "'/data/appdata/selfhosted/atuin' in backup_restore_job.spec.template.spec.containers[0].command[2]" + - '"--exclude-xattr ''*''" in backup_restore_job.spec.template.spec.containers[0].command[2]' - "'--target /restore-staging' in backup_restore_job.spec.template.spec.containers[0].command[2]" - "'/restore-root/data/appdata/selfhosted/atuin' in backup_restore_job.spec.template.spec.containers[0].command[2]" - '''find "$STAGED" -mindepth 1 -print -quit | grep -q .'' in backup_restore_job.spec.template.spec.containers[0].command[2]' @@ -207,6 +208,7 @@ - "'/data/appdata/selfhosted/atuin' in backup_restore_all_job.spec.template.spec.containers[0].command[2]" - "'/data/appdata/selfhosted/paperless' in backup_restore_all_job.spec.template.spec.containers[0].command[2]" - "'/data/appdata/selfhosted/echo' not in backup_restore_all_job.spec.template.spec.containers[0].command[2]" + - '"--exclude-xattr ''*''" in backup_restore_all_job.spec.template.spec.containers[0].command[2]' - "'--target /restore-staging' in backup_restore_all_job.spec.template.spec.containers[0].command[2]" - '''find "$STAGED" -mindepth 1 -print -quit | grep -q .'' in backup_restore_all_job.spec.template.spec.containers[0].command[2]' - "'restore_rollback' in backup_restore_all_job.spec.template.spec.containers[0].command[2]" @@ -232,4 +234,5 @@ - backup_drill_job.metadata.namespace == "selfhosted" - backup_app_restore_paths == ["selfhosted/atuin"] - "'/data/appdata/selfhosted/atuin' in backup_drill_job.spec.template.spec.containers[0].command[2]" + - '"--exclude-xattr ''*''" in backup_drill_job.spec.template.spec.containers[0].command[2]' - backup_drill_job.spec.template.spec.containers[0].env | selectattr('name', 'equalto', 'RESTIC_SNAPSHOT') | list | length == 1 diff --git a/apps/selfhosted/restic/values.yaml b/apps/selfhosted/restic/values.yaml index bc7875112..0d061a06e 100644 --- 
a/apps/selfhosted/restic/values.yaml +++ b/apps/selfhosted/restic/values.yaml @@ -124,7 +124,7 @@ controllers: maintenance: type: cronjob cronjob: - schedule: "0 5 * * 1" + schedule: "0 4 * * 1" pod: restartPolicy: OnFailure containers: