diff --git a/.github/workflows/genesis.yml b/.github/workflows/genesis.yml new file mode 100644 index 0000000..33e0650 --- /dev/null +++ b/.github/workflows/genesis.yml @@ -0,0 +1,31 @@ +name: genesis + +on: + push: + branches: [master] + paths: + - 'bootstrap/genesis/**' + pull_request: + paths: + - 'bootstrap/genesis/**' + +jobs: + lint-and-format: + runs-on: ubuntu-latest + defaults: + run: + working-directory: bootstrap/genesis + + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + + - name: Check formatting + run: uvx ruff format --check . + + - name: Lint + run: uvx ruff check . diff --git a/.mergify.yml b/.mergify.yml index dd71300..ad12482 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -1,6 +1,3 @@ -# TODO: Once vyos-build workflow is implemented, add check-success condition -# to verify the VyOS build before auto-merging image update PRs. - pull_request_rules: - name: Auto-merge automated image updates conditions: diff --git a/README.md b/README.md index d21745c..4541576 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ # Homelab This repository contains all source code and configuration related to my homelab. -It is currently heavily a work in progress. \ No newline at end of file +It is currently heavily a work in progress. diff --git a/bootstrap/genesis/README.md b/bootstrap/genesis/README.md new file mode 100644 index 0000000..e8acd7b --- /dev/null +++ b/bootstrap/genesis/README.md @@ -0,0 +1,196 @@ +# Genesis Bootstrap + +Scripts and tools for bootstrapping the lab from scratch. + +## Overview + +The genesis bootstrap process provisions a USB drive with the boot media needed +to initialize the lab infrastructure. It installs Ventoy via VMware Fusion Pro, +downloads VyOS and Talos ISOs from e2 storage, and copies the VyOS gateway +configuration. + +## Scripts + +### provision-usb.py + +Automates USB drive preparation for lab bootstrap. 
Since Ventoy only runs on +Linux/Windows and development machines are macOS, the script uses VMware Fusion +Pro and an Ubuntu cloud image with cloud-init to install Ventoy automatically. + +**Usage:** + +```bash +# Interactive mode (prompts for device) +./scripts/provision-usb.py + +# Specify device directly +./scripts/provision-usb.py -d disk4 + +# Skip Ventoy installation (USB already has Ventoy) +./scripts/provision-usb.py -d disk4 --skip-ventoy + +# Non-interactive mode +./scripts/provision-usb.py -d disk4 -y + +# Show help +./scripts/provision-usb.py --help +``` + +**Options:** + +| Option | Description | +|:-------|:------------| +| `-d, --device DEVICE` | USB device to provision (e.g., disk4) | +| `-s, --skip-download` | Skip ISO download, use cached files | +| `-v, --skip-ventoy` | Skip Ventoy installation | +| `-y, --yes` | Skip confirmation prompts | +| `-h, --help` | Show help message | + +## Justfile + +A `justfile` is provided for formatting and linting the genesis tooling. + +From the repo root: + +```bash +just -f bootstrap/genesis/justfile check +just -f bootstrap/genesis/justfile fmt +just -f bootstrap/genesis/justfile lint +just -f bootstrap/genesis/justfile clean +``` + +From `bootstrap/genesis/`: + +```bash +just check +``` + +## Prerequisites + +### uv + +The script uses [uv](https://docs.astral.sh/uv/) for dependency management. Install it with: + +```bash +curl -LsSf https://astral.sh/uv/install.sh | sh +``` + +The script automatically installs its dependencies (click, rich, boto3, httpx, +pyyaml) on first run. + +### VMware Fusion Pro + +Required for Ventoy installation on macOS. VMware Fusion Pro is now free for +personal use. + +1. Download from: https://www.vmware.com/products/fusion.html +2. Install and run once to complete setup +3. 
The script uses the `vmrun` CLI for VM management + +### sops + +Required to decrypt the e2 storage credentials in `images/e2.sops.yaml`: + +```bash +brew install sops +``` + +### qemu (ARM64 Macs only) + +Required on Apple Silicon Macs to convert Ubuntu cloud images: + +```bash +brew install qemu +``` + +This provides `qemu-img` which converts QCOW2 images to VMDK format for VMware. + +### e2 credentials and image manifest + +The script reads ISO paths from `images/images.yaml` and decrypts +`images/e2.sops.yaml` via `sops -d` to load e2 credentials. Ensure both files +exist and are up to date. + +### USB Drive + +- Minimum 8GB recommended +- Will be completely erased during provisioning + +## How It Works + +1. **Device selection**: lists external USB devices and prompts for selection +2. **ISO config**: resolves VyOS/Talos paths from `images/images.yaml` +3. **Credential load**: decrypts `images/e2.sops.yaml` to access e2 storage +4. **ISO download**: fetches ISOs from e2 storage with progress bars +5. **VM prep**: downloads Ubuntu cloud image and generates a VMX with USB passthrough +6. **Ventoy install**: cloud-init downloads Ventoy in the VM and installs it automatically +7. **File copy**: mounts the Ventoy partition and copies ISOs + `gateway.conf` +8. **Cleanup**: ejects the USB and removes transient VM artifacts + +### VMware Fusion Approach + +Since Ventoy does not run natively on macOS, the script: + +1. Creates a VMX configuration with USB auto-connect based on VID:PID +2. Boots an Ubuntu cloud image with cloud-init to run the install script +3. Waits for the VM to shut down on successful Ventoy installation +4. 
Removes transient VM files while keeping cached disks for faster reruns + +This approach was chosen because: +- VMware Fusion Pro is now free +- Reliable USB passthrough via VID:PID matching +- `vmrun` provides CLI control of VMs +- No manual steps required inside the VM + +## Cache Directory + +Downloaded files are cached at `~/.cache/lab-bootstrap/`: + +``` +~/.cache/lab-bootstrap/ +├── ubuntu-noble-cloudimg-x86_64.ova +├── ubuntu-noble-cloudimg-arm64.img +├── isos/ +│ ├── vyos-<version>.iso +│ └── talos-<version>-um760.iso +└── vms/ + └── ventoy-installer/ + ├── ubuntu-noble-cloudimg-<arch>.vmdk + └── ubuntu-noble-cloudimg-<arch>.vmdk.meta.json +``` + +## Troubleshooting + +### sops decryption fails + +1. Ensure `sops` is installed (`brew install sops`) +2. Verify `images/e2.sops.yaml` exists and you have access to decrypt it + +### USB device not detected by VM + +1. Ensure USB is unmounted before VM starts +2. Check VID:PID detection - the script will prompt for manual entry if needed +3. Run `ioreg -r -c IOUSBHostDevice -l` to inspect USB devices + +### Ventoy installation times out + +1. Check that the VM is running in VMware Fusion +2. Ensure the USB device appears as `/dev/sdX` in the VM +3. Re-run with `--skip-ventoy` only if Ventoy is already installed + +### ISO download fails + +1. Verify `images/images.yaml` references valid objects +2. Confirm credentials in `images/e2.sops.yaml` are current +3. Use `--skip-download` to force cached ISOs + +### Script fails to start + +1. Ensure uv is installed: `which uv` +2. 
Try running directly: `uv run --script ./scripts/provision-usb.py --help` + +## Related Documentation + +- [Bootstrap Procedure](../../docs/architecture/appendices/B_bootstrap_procedure.md) - Full bootstrap runbook +- [VyOS GitOps](../../docs/architecture/09_design_decisions/003_vyos_gitops.md) - VyOS configuration management +- [images.yaml](../../images/images.yaml) - Image manifest for labctl sync diff --git a/bootstrap/genesis/justfile b/bootstrap/genesis/justfile new file mode 100644 index 0000000..6274626 --- /dev/null +++ b/bootstrap/genesis/justfile @@ -0,0 +1,20 @@ +# Default recipe +default: check + +# Run formatting checks and auto-fix +fmt: + uvx ruff format . + +# Run linters +lint: + uvx ruff check . --fix + +# Run checks (CI mode - no auto-fix) +check: + uvx ruff format --check . + uvx ruff check . + +# Clean up cache files +clean: + rm -rf .ruff_cache + rm -rf .cache diff --git a/bootstrap/genesis/scripts/.gitignore b/bootstrap/genesis/scripts/.gitignore new file mode 100644 index 0000000..ed8ebf5 --- /dev/null +++ b/bootstrap/genesis/scripts/.gitignore @@ -0,0 +1 @@ +__pycache__ \ No newline at end of file diff --git a/bootstrap/genesis/scripts/build-vyos-image.sh b/bootstrap/genesis/scripts/build-vyos-image.sh deleted file mode 100755 index 8a9624d..0000000 --- a/bootstrap/genesis/scripts/build-vyos-image.sh +++ /dev/null @@ -1,208 +0,0 @@ -#!/usr/bin/env bash -# Build VyOS Gateway Image -# Creates a raw disk image for the VP6630 gateway router using vyos-build -# -# Prerequisites: -# - Docker -# - SSH public key -# -# Usage: -# ./build-vyos-image.sh [options] -# -# Options: -# -o, --output DIR Output directory (default: ./output-vyos) -# -k, --ssh-key PATH SSH public key file (default: ~/.ssh/id_rsa.pub) -# -v, --version VER VyOS version string (default: timestamp) -# -h, --help Show this help message -# -# Network configuration is embedded in the build flavor at: -# infrastructure/network/vyos/vyos-build/build-flavors/gateway.toml - -set -euo 
pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" -VYOS_BUILD_DIR="${REPO_ROOT}/infrastructure/network/vyos/vyos-build" - -# Defaults -OUTPUT_DIR="${SCRIPT_DIR}/output-vyos" -SSH_KEY_FILE="${HOME}/.ssh/id_rsa.pub" -VERSION="$(date +%Y%m%d%H%M%S)" -BUILD_BY="genesis@lab.gilman.io" - -usage() { - head -20 "$0" | grep -E '^#' | sed 's/^# \?//' - exit 0 -} - -log() { - echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" -} - -error() { - echo "[ERROR] $*" >&2 - exit 1 -} - -check_prerequisites() { - log "Checking prerequisites..." - - if ! command -v docker &>/dev/null; then - error "Docker not found. Install Docker to continue." - fi - - if ! docker info &>/dev/null; then - error "Docker daemon not running or not accessible." - fi - - # Check SSH key exists - if [[ ! -f "${SSH_KEY_FILE}" ]]; then - error "SSH public key not found: ${SSH_KEY_FILE}\nUse --ssh-key to specify a different key file." - fi - - log "Prerequisites satisfied" -} - -generate_flavor() { - log "Generating build flavor with SSH credentials..." - - # Extract SSH key components - SSH_KEY_TYPE=$(awk '{print $1}' "${SSH_KEY_FILE}") - SSH_KEY_BODY=$(awk '{print $2}' "${SSH_KEY_FILE}") - - if [[ -z "${SSH_KEY_TYPE}" ]] || [[ -z "${SSH_KEY_BODY}" ]]; then - error "Invalid SSH public key format in ${SSH_KEY_FILE}" - fi - - log " SSH Key Type: ${SSH_KEY_TYPE}" - - # Create temp directory for build files - BUILD_TEMP=$(mktemp -d) - trap "rm -rf ${BUILD_TEMP}" EXIT - - # Generate flavor from template - TEMPLATE_FILE="${VYOS_BUILD_DIR}/build-flavors/gateway.toml" - GENERATED_FLAVOR="${BUILD_TEMP}/gateway.toml" - - if [[ ! 
-f "${TEMPLATE_FILE}" ]]; then - error "Flavor template not found: ${TEMPLATE_FILE}" - fi - - sed -e "s|%%SSH_KEY_TYPE%%|${SSH_KEY_TYPE}|g" \ - -e "s|%%SSH_PUBLIC_KEY%%|${SSH_KEY_BODY}|g" \ - "${TEMPLATE_FILE}" > "${GENERATED_FLAVOR}" - - log "Generated flavor: ${GENERATED_FLAVOR}" -} - -run_vyos_build() { - log "Starting vyos-build..." - log " Version: ${VERSION}" - log " Build By: ${BUILD_BY}" - log " Output: ${OUTPUT_DIR}" - - mkdir -p "${OUTPUT_DIR}" - - # Pull the vyos-build container - log "Pulling vyos-build container..." - docker pull vyos/vyos-build:current - - # Run the build inside the container - # The container needs: - # - Privileged mode for raw disk image creation - # - /dev access for disk operations - # - Generated flavor file copied to build-flavors directory - log "Running VyOS image build..." - - docker run --rm --privileged \ - -v "${BUILD_TEMP}/gateway.toml:/vyos/data/build-flavors/gateway.toml:ro" \ - -v "${OUTPUT_DIR}:/output" \ - -v /dev:/dev \ - -e VYOS_BUILD_BY="${BUILD_BY}" \ - -e VYOS_VERSION="${VERSION}" \ - vyos/vyos-build:current \ - bash -c " - set -e - echo 'Building VyOS gateway image...' - cd /vyos - sudo ./build-vyos-image \ - --architecture amd64 \ - --build-by '${BUILD_BY}' \ - --build-type release \ - --version '${VERSION}' \ - gateway - - echo 'Copying output files...' - if [ -d /vyos/build ]; then - cp -v /vyos/build/*.raw /output/ 2>/dev/null || true - cp -v /vyos/build/*.qcow2 /output/ 2>/dev/null || true - fi - - echo 'Build complete!' - " - - log "vyos-build completed successfully!" -} - -show_results() { - echo "" - echo "==============================================" - echo "VyOS Gateway Image Build Complete" - echo "==============================================" - echo "" - echo "Output directory: ${OUTPUT_DIR}" - if [[ -d "${OUTPUT_DIR}" ]]; then - echo "" - echo "Files:" - ls -lah "${OUTPUT_DIR}/" - fi - echo "" - echo "Next steps:" - echo " 1. 
Upload image to e2 storage for Synology Cloud Sync:" - echo " labctl images upload ${OUTPUT_DIR}/vyos-*.raw" - echo "" - echo " 2. Or copy directly to NAS:" - echo " scp ${OUTPUT_DIR}/vyos-*.raw nas:/volume1/images/vyos/" - echo "" - echo " 3. Or write directly to USB/SSD for manual install:" - echo " sudo dd if=${OUTPUT_DIR}/vyos-*.raw of=/dev/sdX bs=4M status=progress" - echo "" - echo "Network configuration is embedded in the build flavor at:" - echo " infrastructure/network/vyos/vyos-build/build-flavors/gateway.toml" - echo "" -} - -main() { - while [[ $# -gt 0 ]]; do - case $1 in - -o|--output) - OUTPUT_DIR="$2" - shift 2 - ;; - -k|--ssh-key) - SSH_KEY_FILE="$2" - shift 2 - ;; - -v|--version) - VERSION="$2" - shift 2 - ;; - -h|--help) - usage - ;; - *) - error "Unknown option: $1" - ;; - esac - done - - log "VyOS Gateway Image Builder (vyos-build)" - log "Repository root: ${REPO_ROOT}" - - check_prerequisites - generate_flavor - run_vyos_build - show_results -} - -main "$@" diff --git a/bootstrap/genesis/scripts/provision-usb.py b/bootstrap/genesis/scripts/provision-usb.py new file mode 100755 index 0000000..b079a7e --- /dev/null +++ b/bootstrap/genesis/scripts/provision-usb.py @@ -0,0 +1,1252 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "click>=8.1", +# "rich>=13.0", +# "boto3>=1.34", +# "httpx>=0.27", +# "pyyaml>=6.0", +# ] +# /// +""" +Provision a USB drive with Ventoy and lab bootstrap media. + +This script automates USB preparation for lab bootstrap by: +1. Installing Ventoy on the USB drive via VMware Fusion Pro (Ubuntu cloud VM) +2. Downloading VyOS and Talos ISOs from iDrive e2 storage +3. Copying the VyOS gateway configuration file + +The script uses Ubuntu cloud images with cloud-init for fully automated +Ventoy installation - no manual intervention required. 
+""" + +from __future__ import annotations + +import json +import re +import shutil +import subprocess +import tarfile +import time +from dataclasses import dataclass +from pathlib import Path + +import yaml +import boto3 +import click +import httpx +from rich.console import Console as RichConsole +from rich.panel import Panel +from rich.progress import ( + BarColumn, + DownloadColumn, + Progress, + TextColumn, + TimeRemainingColumn, + TransferSpeedColumn, +) +from rich.prompt import Confirm, Prompt +from rich.table import Table + +# ============================================================================= +# Configuration +# ============================================================================= + +SCRIPT_DIR = Path(__file__).parent.resolve() +REPO_ROOT = SCRIPT_DIR.parent.parent.parent + +# VMware Fusion paths +VMWARE_APP = Path("/Applications/VMware Fusion.app") +VMRUN = VMWARE_APP / "Contents/Library/vmrun" +OVFTOOL = VMWARE_APP / "Contents/Library/VMware OVF Tool/ovftool" + +# Ventoy configuration +VENTOY_VERSION = "1.0.99" + +# Ubuntu cloud image configuration +# Note: OVA is only available for amd64, ARM64 requires .img conversion +UBUNTU_VERSION = "noble" # 24.04 LTS +UBUNTU_CLOUD_BASE = f"https://cloud-images.ubuntu.com/{UBUNTU_VERSION}/current" +UBUNTU_CLOUD_IMAGES = { + "x86_64": { + "url": f"{UBUNTU_CLOUD_BASE}/{UBUNTU_VERSION}-server-cloudimg-amd64.ova", + "format": "ova", + }, + "arm64": { + "url": f"{UBUNTU_CLOUD_BASE}/{UBUNTU_VERSION}-server-cloudimg-arm64.img", + "format": "qcow2", # Ubuntu .img files are QCOW2 format + }, +} + +# ISO paths in e2 storage (bucket/endpoint loaded from SOPS credentials) +# Note: labctl stores images with an "images/" prefix +# These are now loaded dynamically from images.yaml +IMAGES_MANIFEST = REPO_ROOT / "images/images.yaml" + +# Local cache directory +CACHE_DIR = Path.home() / ".cache/lab-bootstrap" +ISO_CACHE_DIR = CACHE_DIR / "isos" + +# VyOS configuration file +VYOS_CONFIG = REPO_ROOT / 
"infrastructure/network/vyos/configs/gateway.conf" + +# e2 credentials file (SOPS encrypted) +E2_CREDENTIALS_FILE = REPO_ROOT / "images/e2.sops.yaml" + + +@dataclass +class E2Credentials: + """e2 storage credentials.""" + + access_key: str + secret_key: str + endpoint: str + bucket: str + + +@dataclass +class USBDevice: + """USB device information.""" + + identifier: str + name: str + size: str + + +def get_host_arch() -> str: + """Get the host machine architecture.""" + import platform + + machine = platform.machine() + if machine in ("arm64", "aarch64"): + return "arm64" + return "x86_64" + + +def load_iso_config() -> tuple[str, str]: + """Load ISO paths from images.yaml.""" + if not IMAGES_MANIFEST.exists(): + raise FileNotFoundError(f"Images manifest not found: {IMAGES_MANIFEST}") + + with IMAGES_MANIFEST.open() as f: + data = yaml.safe_load(f) + + vyos_path = "" + talos_path = "" + + for image in data.get("spec", {}).get("images", []): + if image["name"] == "vyos-stream": + vyos_path = f"images/{image['destination']}" + elif image["name"] == "talos-um760": + talos_path = f"images/{image['destination']}" + + if not vyos_path: + raise ValueError("Could not find 'vyos-stream' image in manifest") + if not talos_path: + raise ValueError("Could not find 'talos-um760' image in manifest") + + return vyos_path, talos_path + + +def load_e2_credentials() -> E2Credentials: + """Load e2 credentials from SOPS-encrypted file.""" + if not E2_CREDENTIALS_FILE.exists(): + raise FileNotFoundError(f"e2 credentials file not found: {E2_CREDENTIALS_FILE}") + + # Check if sops is available + if not shutil.which("sops"): + raise RuntimeError("sops is not installed. 
Install with: brew install sops") + + # Decrypt the file using sops + result = subprocess.run( + ["sops", "-d", str(E2_CREDENTIALS_FILE)], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + raise RuntimeError(f"Failed to decrypt e2 credentials: {result.stderr}") + + # Parse the YAML + data = yaml.safe_load(result.stdout) + + try: + return E2Credentials( + access_key=data["access_key"], + secret_key=data["secret_key"], + endpoint=data["endpoint"], + bucket=data["bucket"], + ) + except KeyError as e: + raise ValueError(f"Missing required key in e2 credentials: {e}") from e + + +# ============================================================================= +# Console Output +# ============================================================================= + + +class Console: + """Pretty console output using rich.""" + + def __init__(self) -> None: + self.console = RichConsole() + + def info(self, message: str) -> None: + self.console.print(f"[blue][INFO][/blue] {message}") + + def success(self, message: str) -> None: + self.console.print(f"[green][OK][/green] {message}") + + def warn(self, message: str) -> None: + self.console.print(f"[yellow][WARN][/yellow] {message}") + + def error(self, message: str) -> None: + self.console.print(f"[red][ERROR][/red] {message}") + + def banner(self, title: str) -> None: + self.console.print(Panel(title, style="bold blue")) + + def confirm(self, message: str, default: bool = False) -> bool: + return Confirm.ask(message, default=default) + + def prompt(self, message: str) -> str: + return Prompt.ask(message) + + def table(self, title: str, columns: list[str], rows: list[list[str]]) -> None: + table = Table(title=title) + for col in columns: + table.add_column(col) + for row in rows: + table.add_row(*row) + self.console.print(table) + + def progress(self) -> Progress: + return Progress( + TextColumn("[bold blue]{task.description}"), + BarColumn(), + DownloadColumn(), + TransferSpeedColumn(), + 
TimeRemainingColumn(), + console=self.console, + ) + + +console = Console() + + +# ============================================================================= +# USB Device Management +# ============================================================================= + + +class USBDeviceManager: + """Manage USB device detection and operations.""" + + def list_external_devices(self) -> list[USBDevice]: + """List external USB devices.""" + result = subprocess.run( + ["diskutil", "list", "external", "physical"], + capture_output=True, + text=True, + ) + + if result.returncode != 0 or not result.stdout.strip(): + return [] + + devices = [] + for line in result.stdout.splitlines(): + # Match lines like "/dev/disk4 (external, physical):" + match = re.match(r"(/dev/disk\d+)\s+\(external", line) + if match: + device_path = match.group(1) + identifier = device_path.replace("/dev/", "") + info = self.get_device_info(identifier) + if info: + devices.append(info) + + return devices + + def get_device_info(self, identifier: str) -> USBDevice | None: + """Get detailed information about a device.""" + result = subprocess.run( + ["diskutil", "info", identifier], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + return None + + name = "Unknown" + size = "Unknown" + + for line in result.stdout.splitlines(): + if "Device / Media Name:" in line: + name = line.split(":", 1)[1].strip() + elif "Disk Size:" in line: + # Extract human-readable size + size = line.split(":", 1)[1].strip() + if "(" in size: + size = size.split("(")[0].strip() + + return USBDevice(identifier=identifier, name=name, size=size) + + def extract_vid_pid(self, identifier: str) -> str | None: + """Extract USB VID:PID from ioreg. + + Strategy: Get the device name from diskutil, then find matching + USB device in ioreg to get VID:PID. 
+ """ + # Get the device name from diskutil + device_info = self.get_device_info(identifier) + if not device_info: + return None + + device_name = device_info.name + + # Query USB devices from ioreg + result = subprocess.run( + ["ioreg", "-r", "-c", "IOUSBHostDevice", "-l"], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + return None + + # Parse ioreg output to find the USB device with matching name + # We look for blocks containing the device name and extract VID/PID + output = result.stdout + + # Split into device blocks (each starts with +-o) + blocks = re.split(r"\+-o ", output) + + for block in blocks: + # Check if this block contains our device name + if ( + f'"USB Product Name" = "{device_name}"' in block + or f'"kUSBProductString" = "{device_name}"' in block + ): + # Extract VID and PID from this block + vid_match = re.search(r'"idVendor"\s*=\s*(\d+)', block) + pid_match = re.search(r'"idProduct"\s*=\s*(\d+)', block) + + if vid_match and pid_match: + vid = int(vid_match.group(1)) + pid = int(pid_match.group(1)) + return f"{vid:04x}:{pid:04x}" + + return None + + def unmount(self, identifier: str) -> bool: + """Unmount a disk.""" + result = subprocess.run( + ["diskutil", "unmountDisk", f"/dev/{identifier}"], + capture_output=True, + text=True, + ) + return result.returncode == 0 + + def eject(self, identifier: str) -> bool: + """Eject a disk.""" + result = subprocess.run( + ["diskutil", "eject", f"/dev/{identifier}"], + capture_output=True, + text=True, + ) + return result.returncode == 0 + + def mount_partition(self, identifier: str, partition: int = 1) -> Path | None: + """Mount a partition and return the mount point.""" + partition_id = f"{identifier}s{partition}" + result = subprocess.run( + ["diskutil", "mount", partition_id], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + return None + + # Extract mount point from output or query it + info_result = subprocess.run( + ["diskutil", "info", partition_id], 
+ capture_output=True, + text=True, + ) + + for line in info_result.stdout.splitlines(): + if "Mount Point:" in line: + mount_point = line.split(":", 1)[1].strip() + if mount_point and Path(mount_point).exists(): + return Path(mount_point) + + return None + + def wait_for_partition( + self, identifier: str, partition: int = 1, timeout: int = 30 + ) -> bool: + """Wait for a partition to appear.""" + partition_id = f"{identifier}s{partition}" + for _ in range(timeout): + result = subprocess.run( + ["diskutil", "info", partition_id], + capture_output=True, + text=True, + ) + if result.returncode == 0: + return True + time.sleep(1) + return False + + +# ============================================================================= +# Download Manager +# ============================================================================= + + +class DownloadManager: + """Manage file downloads with progress.""" + + def __init__(self, credentials: E2Credentials | None = None) -> None: + CACHE_DIR.mkdir(parents=True, exist_ok=True) + ISO_CACHE_DIR.mkdir(parents=True, exist_ok=True) + self.credentials = credentials + + # Load ISO paths dynamically + try: + self.vyos_path, self.talos_path = load_iso_config() + except Exception as e: + # Only fail if we actually need to download them and they aren't cached + # But we can't check cache meaningfully without the path, so warn + console.warn(f"Failed to load ISO config: {e}") + self.vyos_path = "" + self.talos_path = "" + + def download_http(self, url: str, dest: Path, description: str) -> bool: + """Download a file via HTTP with progress.""" + if dest.exists(): + console.info(f"Using cached: {dest.name}") + return True + + console.info(f"Downloading {description}...") + + try: + with httpx.stream("GET", url, follow_redirects=True) as response: + response.raise_for_status() + total = int(response.headers.get("content-length", 0)) + + with ( + console.progress() as progress, + dest.open("wb") as f, + ): + task = 
progress.add_task(description, total=total) + for chunk in response.iter_bytes(chunk_size=8192): + f.write(chunk) + progress.update(task, advance=len(chunk)) + + console.success(f"Downloaded: {dest.name}") + return True + except httpx.HTTPError as e: + console.error(f"Download failed: {e}") + if dest.exists(): + dest.unlink() + return False + + def download_s3(self, key: str, dest: Path, description: str) -> bool: + """Download a file from S3 with progress.""" + if dest.exists(): + console.info(f"Using cached: {dest.name}") + return True + + if not self.credentials: + console.error("e2 credentials not configured") + return False + + console.info(f"Downloading {description} from e2 storage...") + + try: + s3 = boto3.client( + "s3", + endpoint_url=self.credentials.endpoint, + aws_access_key_id=self.credentials.access_key, + aws_secret_access_key=self.credentials.secret_key, + ) + + # Get file size + head = s3.head_object(Bucket=self.credentials.bucket, Key=key) + total = head["ContentLength"] + + with console.progress() as progress: + task = progress.add_task(description, total=total) + + def callback(bytes_transferred: int) -> None: + progress.update(task, advance=bytes_transferred) + + s3.download_file( + self.credentials.bucket, key, str(dest), Callback=callback + ) + + console.success(f"Downloaded: {dest.name}") + return True + except Exception as e: + console.error(f"S3 download failed: {e}") + if dest.exists(): + dest.unlink() + return False + + def download_ubuntu_image(self) -> tuple[Path, str] | None: + """Download Ubuntu cloud image for the host architecture. + + Returns: + Tuple of (path, format) where format is 'ova' or 'qcow2', + or None if download failed. 
+ """ + arch = get_host_arch() + image_info = UBUNTU_CLOUD_IMAGES[arch] + url = image_info["url"] + fmt = image_info["format"] + + ext = "ova" if fmt == "ova" else "img" + dest = CACHE_DIR / f"ubuntu-{UBUNTU_VERSION}-cloudimg-{arch}.{ext}" + + if self.download_http(url, dest, f"Ubuntu Cloud Image ({arch})"): + return dest, fmt + return None + + def download_vyos_iso(self) -> Path | None: + """Download VyOS ISO from e2.""" + if not self.vyos_path: + return None + dest = ISO_CACHE_DIR / Path(self.vyos_path).name + if self.download_s3(self.vyos_path, dest, "VyOS ISO"): + return dest + return None + + def download_talos_iso(self) -> Path | None: + """Download Talos ISO from e2.""" + if not self.talos_path: + return None + dest = ISO_CACHE_DIR / Path(self.talos_path).name + if self.download_s3(self.talos_path, dest, "Talos ISO"): + return dest + return None + + +# ============================================================================= +# VMware Manager +# ============================================================================= + + +class VMwareManager: + """Manage VMware Fusion VMs for Ventoy installation.""" + + def __init__(self) -> None: + self.vm_name = "ventoy-installer" + self.vm_dir = CACHE_DIR / "vms" / self.vm_name + self.vmx_path = self.vm_dir / f"{self.vm_name}.vmx" + + def check_vmware_fusion(self) -> bool: + """Check if VMware Fusion Pro is installed.""" + if not VMWARE_APP.exists(): + console.error( + "VMware Fusion Pro is not installed. 
" + "Download from: https://www.vmware.com/products/fusion.html" + ) + return False + + if not VMRUN.exists(): + console.error(f"vmrun not found at {VMRUN}") + return False + + console.success("VMware Fusion Pro found") + return True + + def check_qemu_img(self) -> bool: + """Check if qemu-img is available (needed for ARM64).""" + return shutil.which("qemu-img") is not None + + def _vmdk_metadata_path(self, vmdk_path: Path) -> Path: + return vmdk_path.with_name(f"{vmdk_path.name}.meta.json") + + def _vmdk_is_fresh( + self, vmdk_path: Path, image_path: Path, image_format: str + ) -> bool: + meta_path = self._vmdk_metadata_path(vmdk_path) + if not vmdk_path.exists() or not meta_path.exists(): + return False + try: + data = json.loads(meta_path.read_text()) + except (OSError, json.JSONDecodeError): + return False + + stat = image_path.stat() + return ( + data.get("source_name") == image_path.name + and data.get("source_size") == stat.st_size + and data.get("source_mtime_ns") == stat.st_mtime_ns + and data.get("source_format") == image_format + ) + + def _write_vmdk_metadata( + self, vmdk_path: Path, image_path: Path, image_format: str + ) -> None: + stat = image_path.stat() + data = { + "source_name": image_path.name, + "source_size": stat.st_size, + "source_mtime_ns": stat.st_mtime_ns, + "source_format": image_format, + } + meta_path = self._vmdk_metadata_path(vmdk_path) + meta_path.write_text(json.dumps(data, indent=2)) + + def convert_qcow2_to_vmdk(self, qcow2_path: Path, vmdk_path: Path) -> bool: + """Convert QCOW2 image to VMDK format using qemu-img.""" + console.info("Converting QCOW2 to VMDK (this may take a moment)...") + + result = subprocess.run( + [ + "qemu-img", + "convert", + "-f", + "qcow2", + "-O", + "vmdk", + "-o", + "adapter_type=lsilogic", # Compatible with VMware + str(qcow2_path), + str(vmdk_path), + ], + capture_output=True, + text=True, + ) + + if result.returncode == 0: + console.success("Disk converted successfully") + return True + + 
console.error(f"Conversion failed: {result.stderr}") + return False + + def prepare_disk(self, image_path: Path, image_format: str) -> Path: + """Prepare VMDK from downloaded image. + + Handles both OVA (x86_64) and QCOW2 (ARM64) formats. + """ + self.vm_dir.mkdir(parents=True, exist_ok=True) + + vmdk_path = self.vm_dir / f"{image_path.stem}.vmdk" + + if self._vmdk_is_fresh(vmdk_path, image_path, image_format): + console.info("Using cached VM disk") + return vmdk_path + if vmdk_path.exists(): + console.info("Cached VM disk is stale; rebuilding...") + vmdk_path.unlink() + self._vmdk_metadata_path(vmdk_path).unlink(missing_ok=True) + + if image_format == "qcow2": + # ARM64: Convert QCOW2 to VMDK + if not self.check_qemu_img(): + console.error( + "qemu-img is required for ARM64 Macs.\n" + "Install with: brew install qemu" + ) + raise RuntimeError("qemu-img not found") + + if not self.convert_qcow2_to_vmdk(image_path, vmdk_path): + raise RuntimeError("Failed to convert QCOW2 to VMDK") + + self._write_vmdk_metadata(vmdk_path, image_path, image_format) + return vmdk_path + + # OVA format (x86_64) + console.info("Importing Ubuntu cloud OVA...") + + # Use ovftool to extract VMDK from OVA + if OVFTOOL.exists(): + result = subprocess.run( + [ + str(OVFTOOL), + "--lax", # Be lenient with OVA format + "--diskMode=monolithicSparse", + str(image_path), + str(self.vm_dir / "imported.vmx"), + ], + capture_output=True, + text=True, + ) + if result.returncode == 0: + # Find the extracted VMDK + vmdk_files = sorted( + self.vm_dir.glob("*.vmdk"), + key=lambda path: path.stat().st_mtime, + reverse=True, + ) + if vmdk_files: + vmdk = vmdk_files[0] + if vmdk.name != vmdk_path.name: + vmdk.rename(vmdk_path) + # Clean up the imported VMX + for f in self.vm_dir.glob("imported.*"): + if f.suffix != ".vmdk": + f.unlink() + if vmdk_path.exists(): + self._write_vmdk_metadata(vmdk_path, image_path, image_format) + console.success("OVA imported successfully") + return vmdk_path + + # Fallback: 
Extract OVA manually (it's a tar file) + console.info("Extracting OVA manually...") + with tarfile.open(image_path, "r") as tar: + for member in tar.getmembers(): + if member.name.endswith(".vmdk"): + member.name = vmdk_path.name + tar.extract(member, self.vm_dir, filter="data") + self._write_vmdk_metadata(vmdk_path, image_path, image_format) + console.success("VMDK extracted from OVA") + return vmdk_path + + raise RuntimeError("Could not extract VMDK from OVA") + + def create_cloud_init_iso(self) -> Path: + """Create cloud-init ISO with Ventoy installation script.""" + iso_path = self.vm_dir / "cloud-init.iso" + staging_dir = self.vm_dir / "cloud-init-staging" + + # Clean up any existing staging + if staging_dir.exists(): + shutil.rmtree(staging_dir) + staging_dir.mkdir(parents=True) + + # Create meta-data + meta_data = staging_dir / "meta-data" + meta_data.write_text("instance-id: ventoy-installer\nlocal-hostname: ventoy\n") + + # Create user-data with Ventoy installation script + user_data = staging_dir / "user-data" + user_data.write_text(f"""#cloud-config +users: + - name: ubuntu + sudo: ALL=(ALL) NOPASSWD:ALL + shell: /bin/bash + lock_passwd: false + +# Set password for vmrun authentication +chpasswd: + expire: false + users: + - name: ubuntu + password: ubuntu + type: text + +write_files: + - path: /opt/install-ventoy.sh + permissions: '0755' + content: | + #!/bin/bash + set -ex + + echo "=== Ventoy USB Installation Script ===" + + # Download Ventoy + echo "Downloading Ventoy {VENTOY_VERSION}..." + cd /opt + wget -q https://github.com/ventoy/Ventoy/releases/download/v{VENTOY_VERSION}/ventoy-{VENTOY_VERSION}-linux.tar.gz + tar xzf ventoy-{VENTOY_VERSION}-linux.tar.gz + rm ventoy-{VENTOY_VERSION}-linux.tar.gz + + # Wait for USB device to appear + echo "Waiting for USB device..." 
+ for i in {{1..30}}; do + for dev in /dev/sd?; do + if [ -b "$dev" ]; then + devname=$(basename "$dev") + if [ -f "/sys/block/${{devname}}/removable" ]; then + removable=$(cat "/sys/block/${{devname}}/removable") + if [ "$removable" = "1" ]; then + size=$(cat "/sys/block/${{devname}}/size") + if [ "$size" -gt 1000000 ]; then + USB_DEV="$dev" + echo "Found USB device: $USB_DEV" + break 2 + fi + fi + fi + fi + done + sleep 1 + done + + if [ -z "$USB_DEV" ]; then + echo "ERROR: No USB device found!" + echo "VENTOY_INSTALL_FAILED" > /tmp/ventoy-status + exit 1 + fi + + # Install Ventoy + echo "Installing Ventoy on $USB_DEV..." + cd /opt/ventoy-{VENTOY_VERSION} + + # Run Ventoy installation (non-interactive, force install) + echo "y" | ./Ventoy2Disk.sh -I "$USB_DEV" + + if [ $? -eq 0 ]; then + echo "VENTOY_INSTALL_SUCCESS" > /tmp/ventoy-status + echo "=== Ventoy installation complete ===" + # Signal success by shutting down + sleep 2 + poweroff + else + echo "VENTOY_INSTALL_FAILED" > /tmp/ventoy-status + echo "=== Ventoy installation FAILED ===" + exit 1 + fi + +runcmd: + - /opt/install-ventoy.sh + +final_message: "Cloud-init completed. 
Ventoy installation status in /tmp/ventoy-status" +""") + + # Create ISO using hdiutil (macOS) + console.info("Creating cloud-init ISO...") + result = subprocess.run( + [ + "hdiutil", + "makehybrid", + "-iso", + "-joliet", + "-iso-volume-name", + "cidata", + "-joliet-volume-name", + "cidata", + "-o", + str(iso_path), + str(staging_dir), + ], + capture_output=True, + text=True, + ) + if result.returncode != 0: + console.error(f"Failed to create ISO: {result.stderr}") + raise RuntimeError("Failed to create cloud-init ISO") + + # Cleanup staging + shutil.rmtree(staging_dir) + + console.success("Cloud-init ISO created") + return iso_path + + def generate_vmx(self, vmdk_path: Path, cloud_init_iso: Path, vid_pid: str) -> Path: + """Generate VMX configuration file for Ubuntu cloud image.""" + self.vm_dir.mkdir(parents=True, exist_ok=True) + + vid, pid = vid_pid.split(":") + + arch = get_host_arch() + guest_os = "arm-ubuntu-64" if arch == "arm64" else "ubuntu-64" + + template_name = ( + "vmx-arm64.template" if arch == "arm64" else "vmx-x86_64.template" + ) + template_path = SCRIPT_DIR / "templates" / template_name + + if not template_path.exists(): + raise FileNotFoundError(f"VMX template not found: {template_path}") + + template = template_path.read_text() + + vmx_content = template.format( + vm_name=self.vm_name, + guest_os=guest_os, + vmdk_path=vmdk_path, + cloud_init_iso=cloud_init_iso, + vid=vid, + pid=pid, + ) + + self.vmx_path.write_text(vmx_content) + console.success("VMX configuration generated") + return self.vmx_path + + def start_vm(self) -> bool: + """Start the VM.""" + console.info("Starting VM...") + result = subprocess.run( + [str(VMRUN), "-T", "fusion", "start", str(self.vmx_path), "gui"], + capture_output=True, + text=True, + ) + if result.returncode == 0: + console.success("VM started") + return True + console.error(f"Failed to start VM: {result.stderr}") + return False + + def is_vm_running(self) -> bool: + """Check if the VM is currently running.""" + 
result = subprocess.run( + [str(VMRUN), "list"], + capture_output=True, + text=True, + ) + return str(self.vmx_path) in result.stdout + + def wait_for_ventoy_install(self, timeout: int = 300) -> bool: + """Wait for Ventoy installation to complete (VM will shut down on success).""" + console.info( + "Waiting for Ventoy installation (VM will shut down when complete)..." + ) + start_time = time.time() + + while time.time() - start_time < timeout: + # VM shuts down on successful installation + if not self.is_vm_running(): + console.success("VM shut down - Ventoy installation complete!") + return True + + elapsed = int(time.time() - start_time) + if elapsed % 15 == 0 and elapsed > 0: + console.info(f"Still installing Ventoy... ({elapsed}s)") + + time.sleep(5) + + console.error("Timeout waiting for Ventoy installation") + return False + + def stop_vm(self) -> None: + """Stop the VM if running.""" + result = subprocess.run( + [str(VMRUN), "list"], + capture_output=True, + text=True, + ) + if str(self.vmx_path) in result.stdout: + console.info("Stopping VM...") + subprocess.run( + [str(VMRUN), "stop", str(self.vmx_path), "soft"], + capture_output=True, + ) + time.sleep(2) + + def cleanup(self) -> None: + """Clean up VM files while preserving cached disks.""" + self.stop_vm() + if not self.vm_dir.exists(): + return + + for path in self.vm_dir.iterdir(): + if path.is_dir(): + shutil.rmtree(path) + continue + if path.suffix == ".vmdk" or path.name.endswith(".vmdk.meta.json"): + continue + path.unlink() + + console.success("VM artifacts cleaned up") + + +# ============================================================================= +# CLI +# ============================================================================= + + +def check_prerequisites(skip_ventoy: bool) -> bool: + """Check all prerequisites are met.""" + console.info("Checking prerequisites...") + + # Check for macOS + import platform + + if platform.system() != "Darwin": + console.error("This script only runs on 
macOS") + return False + + # Check for VMware Fusion (only if installing Ventoy) + if not skip_ventoy: + vmware = VMwareManager() + if not vmware.check_vmware_fusion(): + return False + + # ARM64 Macs need qemu-img to convert Ubuntu cloud images + if get_host_arch() == "arm64" and not vmware.check_qemu_img(): + console.error( + "qemu-img is required for ARM64 Macs (to convert Ubuntu cloud images).\n" + "Install with: brew install qemu" + ) + return False + if get_host_arch() == "arm64": + console.success("qemu-img found") + + # Check for VyOS config file + if not VYOS_CONFIG.exists(): + console.error(f"VyOS configuration file not found: {VYOS_CONFIG}") + return False + console.success("VyOS configuration file found") + + return True + + +def select_device(usb_mgr: USBDeviceManager, device: str | None) -> USBDevice | None: + """Select a USB device interactively or by name.""" + if device: + info = usb_mgr.get_device_info(device) + if not info: + console.error(f"Device not found: {device}") + return None + return info + + # List devices + console.info("Detecting USB devices...") + devices = usb_mgr.list_external_devices() + + if not devices: + console.error("No external USB devices detected") + return None + + # Show table + console.table( + "Available USB Devices", + ["Identifier", "Name", "Size"], + [[d.identifier, d.name, d.size] for d in devices], + ) + + # Prompt for selection + identifier = console.prompt("Enter the disk identifier to use (e.g., disk4)") + return usb_mgr.get_device_info(identifier) + + +def confirm_device(device: USBDevice, skip_ventoy: bool, yes: bool) -> bool: + """Confirm device selection.""" + console.console.print() + if not skip_ventoy: + console.warn("WARNING: This will ERASE ALL DATA on the following device:") + else: + console.info("Target USB device:") + + console.console.print(f"\n Device: /dev/{device.identifier}") + console.console.print(f" Name: {device.name}") + console.console.print(f" Size: {device.size}\n") + + if yes: + 
console.info("Skipping confirmation (--yes flag)") + return True + + if not skip_ventoy: + response = console.prompt("Type 'yes' to continue, or anything else to abort") + return response.lower() == "yes" + + return console.confirm("Continue?", default=True) + + +def copy_files_to_usb( + mount_point: Path, vyos_iso: Path | None, talos_iso: Path | None +) -> None: + """Copy ISOs and config to USB.""" + console.info("Copying files to USB...") + + if vyos_iso and vyos_iso.exists(): + console.info( + f"Copying VyOS ISO ({vyos_iso.stat().st_size // 1024 // 1024}MB)..." + ) + shutil.copy2(vyos_iso, mount_point / vyos_iso.name) + console.success("VyOS ISO copied") + + if talos_iso and talos_iso.exists(): + console.info( + f"Copying Talos ISO ({talos_iso.stat().st_size // 1024 // 1024}MB)..." + ) + shutil.copy2(talos_iso, mount_point / talos_iso.name) + console.success("Talos ISO copied") + + if VYOS_CONFIG.exists(): + console.info("Copying VyOS configuration...") + shutil.copy2(VYOS_CONFIG, mount_point / VYOS_CONFIG.name) + console.success("VyOS configuration copied") + + console.success("All files copied to USB") + + +@click.command() +@click.option("-d", "--device", help="USB device to provision (e.g., disk4)") +@click.option( + "-s", "--skip-download", is_flag=True, help="Skip ISO download, use cached files" +) +@click.option("-v", "--skip-ventoy", is_flag=True, help="Skip Ventoy installation") +@click.option("-y", "--yes", is_flag=True, help="Skip confirmation prompts") +def main(device: str | None, skip_download: bool, skip_ventoy: bool, yes: bool) -> None: + """Provision a USB drive with Ventoy and lab bootstrap media.""" + console.banner("Lab Bootstrap USB Provisioning") + + # Check prerequisites + if not check_prerequisites(skip_ventoy): + raise SystemExit(1) + + # Load e2 credentials for S3 downloads + e2_credentials = None + if not skip_download: + try: + console.info("Loading e2 storage credentials...") + e2_credentials = load_e2_credentials() + 
console.success("e2 credentials loaded") + except FileNotFoundError as e: + console.error(str(e)) + raise SystemExit(1) + except RuntimeError as e: + console.error(str(e)) + raise SystemExit(1) + + # Initialize managers + usb_mgr = USBDeviceManager() + download_mgr = DownloadManager(credentials=e2_credentials) + + # Select and confirm device + usb_device = select_device(usb_mgr, device) + if not usb_device: + raise SystemExit(1) + + if not confirm_device(usb_device, skip_ventoy, yes): + console.error("Aborted by user") + raise SystemExit(1) + + # Download ISOs + vyos_iso = None + talos_iso = None + if not skip_download: + vyos_iso = download_mgr.download_vyos_iso() + if not vyos_iso: + console.error("Failed to download VyOS ISO - cannot continue") + raise SystemExit(1) + + talos_iso = download_mgr.download_talos_iso() + if not talos_iso: + console.error("Failed to download Talos ISO - cannot continue") + raise SystemExit(1) + else: + # Try to resolve paths for cache checking even if we skipped download + vyos_path = download_mgr.vyos_path + talos_path = download_mgr.talos_path + + # If config loading failed earlier, try to load again or fail + if not vyos_path or not talos_path: + try: + vyos_path, talos_path = load_iso_config() + except Exception as e: + console.error(f"Cannot resolve ISO paths from manifest: {e}") + raise SystemExit(1) + + console.info("Skipping ISO download (--skip-download flag)") + vyos_iso = ISO_CACHE_DIR / Path(vyos_path).name + talos_iso = ISO_CACHE_DIR / Path(talos_path).name + if not vyos_iso.exists(): + console.error(f"VyOS ISO not found in cache: {vyos_iso}") + raise SystemExit(1) + if not talos_iso.exists(): + console.error(f"Talos ISO not found in cache: {talos_iso}") + raise SystemExit(1) + + # Install Ventoy + if not skip_ventoy: + vmware = VMwareManager() + + # Get VID:PID + vid_pid = usb_mgr.extract_vid_pid(usb_device.identifier) + if not vid_pid: + console.warn("Could not detect USB VID:PID automatically") + vid_pid = 
console.prompt("Enter USB VID:PID manually (format: xxxx:xxxx)") + if not re.match(r"^[0-9a-fA-F]{4}:[0-9a-fA-F]{4}$", vid_pid): + console.error("Invalid VID:PID format") + raise SystemExit(1) + + console.info(f"USB device VID:PID: {vid_pid}") + + # Unmount USB before VM operations + console.info("Unmounting USB device...") + usb_mgr.unmount(usb_device.identifier) + + try: + # Download VM requirements + ubuntu_result = download_mgr.download_ubuntu_image() + + if not ubuntu_result: + console.error("Failed to download Ubuntu cloud image") + raise SystemExit(1) + + ubuntu_image, image_format = ubuntu_result + + # Prepare disk (handles OVA for x86_64, QCOW2 conversion for ARM64) + vmdk_path = vmware.prepare_disk(ubuntu_image, image_format) + cloud_init_iso = vmware.create_cloud_init_iso() + vmware.generate_vmx(vmdk_path, cloud_init_iso, vid_pid) + + # Start VM + if not vmware.start_vm(): + raise SystemExit(1) + + # Wait for Ventoy installation (VM shuts down on success) + if not vmware.wait_for_ventoy_install(timeout=300): + console.error("Ventoy installation did not complete successfully") + raise SystemExit(1) + + finally: + vmware.cleanup() + + console.success("Ventoy installation complete") + else: + console.info("Skipping Ventoy installation (--skip-ventoy flag)") + + # Wait for Ventoy partition + console.info("Waiting for Ventoy partition...") + time.sleep(3) # Give macOS time to detect the new partition + if not usb_mgr.wait_for_partition(usb_device.identifier): + console.error("Ventoy partition not found") + raise SystemExit(1) + console.success("Ventoy partition detected") + + # Mount and copy files + mount_point = usb_mgr.mount_partition(usb_device.identifier) + if not mount_point: + console.error("Failed to mount Ventoy partition") + raise SystemExit(1) + console.success(f"Ventoy partition mounted at: {mount_point}") + + copy_files_to_usb(mount_point, vyos_iso, talos_iso) + + # Eject USB + console.info("Ejecting USB device...") + 
usb_mgr.eject(usb_device.identifier) + console.success("USB device ejected") + + # Done! + console.console.print() + console.banner("USB Provisioning Complete!") + console.console.print() + console.console.print("The USB drive is now ready for lab bootstrap.") + console.console.print() + console.console.print("[bold]Contents:[/bold]") + console.console.print(" - Ventoy bootloader installed") + console.console.print(" - VyOS Stream ISO (for router installation)") + console.console.print(" - Talos ISO with embedded config (for UM760 bootstrap)") + console.console.print(" - gateway.conf (VyOS configuration)") + console.console.print() + console.console.print("[bold]Next steps:[/bold]") + console.console.print( + " 1. Boot VP6630 from USB -> Select VyOS ISO -> Run 'install image'" + ) + console.console.print(" 2. After VyOS install, load gateway.conf configuration") + console.console.print( + " 3. Boot UM760 from USB -> Select Talos ISO -> Bootstrap completes" + ) + console.console.print() + console.console.print( + "See docs/architecture/appendices/B_bootstrap_procedure.md for details." 
+ ) + console.console.print() + + +if __name__ == "__main__": + main() diff --git a/bootstrap/genesis/scripts/templates/vmx-arm64.template b/bootstrap/genesis/scripts/templates/vmx-arm64.template new file mode 100644 index 0000000..fbfb762 --- /dev/null +++ b/bootstrap/genesis/scripts/templates/vmx-arm64.template @@ -0,0 +1,64 @@ +.encoding = "UTF-8" +config.version = "8" +virtualHW.version = "22" +displayName = "{vm_name}" +guestOS = "{guest_os}" +firmware = "efi" + +# PCIe bridges (required for ARM) +pciBridge0.present = "TRUE" +pciBridge4.present = "TRUE" +pciBridge4.virtualDev = "pcieRootPort" +pciBridge4.functions = "8" +pciBridge5.present = "TRUE" +pciBridge5.virtualDev = "pcieRootPort" +pciBridge5.functions = "8" +pciBridge6.present = "TRUE" +pciBridge6.virtualDev = "pcieRootPort" +pciBridge6.functions = "8" +pciBridge7.present = "TRUE" +pciBridge7.virtualDev = "pcieRootPort" +pciBridge7.functions = "8" + +# Hardware +numvcpus = "2" +memsize = "2048" +virtualHW.productCompatibility = "hosted" +hpet0.present = "TRUE" +vmci0.present = "TRUE" + +# NVMe disk (Ubuntu cloud image) +nvme0.present = "TRUE" +nvme0.pciSlotNumber = "224" +nvme0:0.fileName = "{vmdk_path}" +nvme0:0.present = "TRUE" + +# SATA controller for CD-ROM (cloud-init) +sata0.present = "TRUE" +sata0.pciSlotNumber = "35" +sata0:0.present = "TRUE" +sata0:0.deviceType = "cdrom-image" +sata0:0.fileName = "{cloud_init_iso}" + +# USB controllers +usb.present = "TRUE" +usb.pciSlotNumber = "32" +ehci.present = "TRUE" +ehci.pciSlotNumber = "34" +usb_xhci.present = "TRUE" +usb_xhci.pciSlotNumber = "192" +usb.generic.autoconnect = "TRUE" + +# USB device auto-connect +usb.autoConnect.device0 = "vid:{vid} pid:{pid}" + +# Network (NAT for internet access) +ethernet0.present = "TRUE" +ethernet0.connectionType = "nat" +ethernet0.virtualDev = "e1000e" +ethernet0.addressType = "generated" +ethernet0.pciSlotNumber = "160" + +# Misc +floppy0.present = "FALSE" +tools.upgrade.policy = "manual" diff --git 
a/bootstrap/genesis/scripts/templates/vmx-x86_64.template b/bootstrap/genesis/scripts/templates/vmx-x86_64.template new file mode 100644 index 0000000..8238976 --- /dev/null +++ b/bootstrap/genesis/scripts/templates/vmx-x86_64.template @@ -0,0 +1,40 @@ +.encoding = "UTF-8" +config.version = "8" +virtualHW.version = "22" +displayName = "{vm_name}" +guestOS = "{guest_os}" + +# Hardware +numvcpus = "2" +memsize = "2048" +virtualHW.productCompatibility = "hosted" + +# SCSI disk (Ubuntu cloud image) +scsi0.present = "TRUE" +scsi0.virtualDev = "lsilogic" +scsi0:0.fileName = "{vmdk_path}" +scsi0:0.present = "TRUE" + +# IDE CD-ROM (cloud-init) +ide0:0.present = "TRUE" +ide0:0.deviceType = "cdrom-image" +ide0:0.fileName = "{cloud_init_iso}" +ide0:0.startConnected = "TRUE" + +# USB controller +usb.present = "TRUE" +usb.generic.autoconnect = "TRUE" +usb_xhci.present = "TRUE" + +# USB device auto-connect +usb.autoConnect.device0 = "vid:{vid} pid:{pid}" + +# Network (NAT for internet access) +ethernet0.present = "TRUE" +ethernet0.connectionType = "nat" +ethernet0.virtualDev = "e1000" +ethernet0.addressType = "generated" + +# Misc +floppy0.present = "FALSE" +tools.upgrade.policy = "manual" diff --git a/docs/architecture/05_building_blocks/02_tinkerbell_provisioning.md b/docs/architecture/05_building_blocks/02_tinkerbell_provisioning.md index 9e955a5..a6108ce 100644 --- a/docs/architecture/05_building_blocks/02_tinkerbell_provisioning.md +++ b/docs/architecture/05_building_blocks/02_tinkerbell_provisioning.md @@ -9,10 +9,10 @@ Tinkerbell handles **Day Zero** operations — the initial bootstrap of physical | Target | What Tinkerbell Installs | Result | |:---|:---|:---| -| **VP6630** | VyOS (vyos-build image) | Lab router with VLANs and DHCP relay | -| **UM760** | Talos Linux | Node joins the Platform Cluster | | **MS-02 (x3)** | Harvester OS | Nodes join the Harvester HCI cluster | +> **Note:** The VP6630 (VyOS router) and UM760 (first platform node) are bootstrapped manually 
before Tinkerbell is available. VyOS is installed from the Stream ISO with configuration loaded from USB. The UM760 boots from a Talos ISO with embedded machine configuration. See [Appendix B: Bootstrap Procedure](../appendices/B_bootstrap_procedure.md) for details. + After initial provisioning, Tinkerbell's role is complete. Ongoing lifecycle management is handled by VyOS CI/CD (for router config), Harvester (for HCI), and CAPI (for Kubernetes clusters). ## Components diff --git a/docs/architecture/06_runtime_view.md b/docs/architecture/06_runtime_view.md index d090591..1acbe75 100644 --- a/docs/architecture/06_runtime_view.md +++ b/docs/architecture/06_runtime_view.md @@ -6,42 +6,40 @@ This section describes key runtime scenarios — how the system's building block ## 1. Genesis Bootstrap -The "Genesis" sequence bootstraps the entire infrastructure from bare metal to a fully operational Platform Cluster. +The "Genesis" sequence bootstraps the entire infrastructure from bare metal to a fully operational Platform Cluster using an embedded configuration ISO approach that eliminates the need for a temporary seed cluster. 
### Prerequisites - Physical hardware cabled and powered -- VyOS image built with vyos-build (baked-in configuration) -- Synology NAS available with Talos VM capability +- VyOS Stream ISO downloaded via `labctl images sync` +- Talos ISO with embedded machine configuration (built via `labctl images sync`) +- USB drives for VyOS and Talos installation ### Sequence ```mermaid sequenceDiagram - participant NAS as Synology NAS - participant Seed as Seed Cluster - participant Tink as Tinkerbell participant VyOS as VP6630 (VyOS) participant UM as UM760 + participant Argo as Argo CD + participant Tink as Tinkerbell participant MS as MS-02 (x3) participant Harv as Harvester - participant Argo as Argo CD participant Plat as Platform Cluster - Note over NAS: Phase 1: Seed - NAS->>Seed: Bootstrap single-node Talos VM - Seed->>Argo: Deploy Argo CD (Helm) - Seed->>Tink: Deploy Tinkerbell stack - VyOS->>Tink: PXE boot request - Tink->>VyOS: Provision VyOS (lab networking) - Note over VyOS: VLANs + DHCP relay active + Note over VyOS: Phase 1: Direct Boot + VyOS->>VyOS: Boot from Stream ISO (USB) + VyOS->>VyOS: Load gateway.conf from USB + Note over VyOS: VLANs + NAT + DHCP relay active + + UM->>UM: Boot from embedded Talos ISO (USB) + Note over UM: Config embedded in ISO + UM->>UM: Install to disk, bootstrap cluster + UM->>Argo: Deploy Argo CD (Helm) Note over UM: Phase 2: Single-Node Platform - UM->>Tink: PXE boot request (via VyOS DHCP relay) - Tink->>UM: Provision Talos - UM->>Seed: Join cluster - Seed->>UM: Migrate workloads (Tinkerbell, Argo) - NAS->>NAS: Shutdown Seed VM + Argo->>UM: Sync clusters/platform/ UM->>UM: Deploy Crossplane + XRDs + UM->>Tink: Deploy Tinkerbell via XRs Note over MS: Phase 3: Harvester Online MS->>Tink: PXE boot (x3) @@ -52,7 +50,9 @@ sequenceDiagram Note over Plat: Phase 4: Full Platform Harv->>Harv: Create CP-2, CP-3 VMs - Harv-->>UM: VMs PXE boot and join + Harv-->>Tink: VMs PXE boot + Tink->>Harv: Provision Talos on VMs + Harv-->>UM: VMs join 
platform cluster UM->>Plat: 3-node Platform Cluster formed Plat->>Plat: Deploy remaining services ``` @@ -61,10 +61,10 @@ sequenceDiagram | Phase | Action | Result | |:---|:---|:---| -| **1. Seed** | Bootstrap temporary Talos on NAS, provision VyOS | Tinkerbell + Argo CD + VyOS networking operational | -| **2. Single-Node Platform** | Provision UM760, migrate from NAS | Single-node platform with Crossplane | -| **3. Harvester Online** | Provision 3x MS-02, register with Argo CD | HCI cluster managed by Argo CD | -| **4. Full Platform** | Add 2 Harvester VMs to UM760 | 3-node HA Platform Cluster | +| **1. Direct Boot** | Install VyOS from ISO, boot UM760 from embedded ISO, deploy Argo CD | Single-node platform with VyOS networking | +| **2. Single-Node Platform** | Deploy Crossplane + Tinkerbell via GitOps | Platform ready to provision hardware | +| **3. Harvester Online** | Provision 3x MS-02 via Tinkerbell, register with Argo CD | HCI cluster managed by Argo CD | +| **4. Full Platform** | Add 2 Harvester VMs to platform cluster | 3-node HA Platform Cluster | --- diff --git a/docs/architecture/07_deployment_view.md b/docs/architecture/07_deployment_view.md index aceb089..7f8af42 100644 --- a/docs/architecture/07_deployment_view.md +++ b/docs/architecture/07_deployment_view.md @@ -79,7 +79,7 @@ This section describes the physical and virtual infrastructure topology — how | Node | Operating System | Deployment Method | |:---|:---|:---| -| **VP6630** | VyOS | Tinkerbell PXE (vyos-build image) | +| **VP6630** | VyOS | Manual install from Stream ISO | | **MS-02 (x3)** | Harvester (Elemental OS) | Tinkerbell PXE | | **UM760** | Talos Linux | Tinkerbell PXE | | **Platform VMs (x2)** | Talos Linux | CAPI + Harvester | diff --git a/docs/architecture/09_design_decisions/003_vyos_gitops.md b/docs/architecture/09_design_decisions/003_vyos_gitops.md index 914a63f..f2d0da8 100644 --- a/docs/architecture/09_design_decisions/003_vyos_gitops.md +++ 
b/docs/architecture/09_design_decisions/003_vyos_gitops.md @@ -92,9 +92,9 @@ jobs: run: ansible-playbook infrastructure/vyos/ansible/playbooks/deploy.yml --syntax-check - name: Validate VyOS config syntax run: | - # Use vyos-config-validator or docker container - docker run --rm -v $PWD/infrastructure/vyos/configs:/config vyos/vyos-build \ - /opt/vyatta/sbin/vyatta-config-validator /config/vyos.conf + # Use vyos container for config validation + docker run --rm -v $PWD/infrastructure/network/vyos/configs:/config vyos/vyos:current \ + /opt/vyatta/sbin/vyatta-config-validator /config/gateway.conf ``` ### Workflow: Deploy on Merge @@ -182,10 +182,10 @@ To validate configuration changes before they reach production, we use [Containe #### How It Works -1. **Container Image Build**: The vyos-build pipeline produces a squashfs filesystem which is converted to a container image using `sqfs2tar` and a minimal Dockerfile. +1. **Container Image**: Uses the official VyOS container image from Docker Hub (`vyos/vyos:current`) 2. **Topology Simulation**: Containerlab deploys a test topology with: - - VyOS gateway container (same rootfs as production) + - VyOS gateway container - Simulated network clients for WAN, MGMT, and Platform networks 3. **Test Suite**: pytest with scrapli validates: @@ -199,11 +199,11 @@ To validate configuration changes before they reach production, we use [Containe ``` infrastructure/network/vyos/ -├── Dockerfile.containerlab # Container build from squashfs └── tests/ ├── topology.clab.yml # Containerlab topology ├── conftest.py # pytest fixtures ├── test_gateway.py # Test suite + ├── render-config-boot.sh # Generate config.boot with injected SSH key └── requirements.txt # Python dependencies ``` @@ -221,7 +221,7 @@ The test environment uses simplified interface mapping: Integration tests run automatically on PRs modifying `infrastructure/network/vyos/**`: -1. `build-container` job builds VyOS container from squashfs +1. 
`render-config-boot.sh` generates test config with SSH key injection 2. `integration-test` job deploys topology and runs pytest suite 3. Tests must pass before merge diff --git a/docs/architecture/09_design_decisions/007_image_pipeline_s3_intermediary.md b/docs/architecture/09_design_decisions/007_image_pipeline_s3_intermediary.md index e02397c..d7c0f78 100644 --- a/docs/architecture/09_design_decisions/007_image_pipeline_s3_intermediary.md +++ b/docs/architecture/09_design_decisions/007_image_pipeline_s3_intermediary.md @@ -5,10 +5,7 @@ ## Context -The lab requires machine images (Talos, VyOS, Harvester) to be available on the Synology NAS for PXE provisioning via Tinkerbell. Images come from two sources: - -1. **HTTP downloads** — Pre-built images from vendors (Talos Factory, Rancher) -2. **vyos-build** — Custom VyOS images built via Docker with configuration baked in (VyOS gateway) +The lab requires machine images (Talos, VyOS, Harvester) to be available on the Synology NAS for PXE provisioning via Tinkerbell and for manual bootstrap. Images are primarily obtained via HTTP downloads from vendors (Talos Factory, Rancher, VyOS community builds). We need a GitOps-friendly pipeline to: - Declaratively define required images in Git @@ -60,8 +57,9 @@ Store images as GitHub Release assets. Script on NAS polls for new releases. │ │ 1. Build labctl CLI │ │ │ │ 2. Parse images/images.yaml │ │ │ │ 3. For each image: │ │ -│ │ - HTTP: Download → Verify → Decompress │ │ -│ │ - vyos-build: Build in container → Collect artifact │ │ +│ │ - Download from vendor (HTTP) │ │ +│ │ - Verify checksum │ │ +│ │ - Decompress if needed │ │ │ │ 4. 
Upload to iDrive e2 │ │ │ └──────────────────────────────────────────────────────────────────┘ │ └─────────────────────────────────┬──────────────────────────────────────┘ diff --git a/docs/architecture/appendices/A_repository_structure.md b/docs/architecture/appendices/A_repository_structure.md index 4b9b606..ac6999f 100644 --- a/docs/architecture/appendices/A_repository_structure.md +++ b/docs/architecture/appendices/A_repository_structure.md @@ -28,9 +28,8 @@ lab/ ├── .github/ │ └── workflows/ │ ├── crossplane-build.yml # Build Crossplane packages on tag -│ ├── vyos-build.yml # Build VyOS image (vyos-build) -│ ├── vyos-validate.yml # PR validation for VyOS -│ └── vyos-deploy.yml # Deploy VyOS on merge +│ ├── vyos-validate.yml # PR validation for VyOS config +│ └── vyos-deploy.yml # Deploy VyOS config on merge │ ├── docs/ │ └── architecture/ # arc42 documentation @@ -41,17 +40,17 @@ lab/ │ ├── network/ │ │ └── vyos/ │ │ ├── configs/ -│ │ │ └── gateway.conf -│ │ ├── vyos-build/ -│ │ │ ├── build-flavors/ -│ │ │ │ └── gateway.toml # Build flavor with baked-in config -│ │ │ └── scripts/ -│ │ │ └── generate-flavor.sh # Injects SSH credentials -│ │ └── ansible/ -│ │ ├── playbooks/ -│ │ │ └── deploy.yml -│ │ └── inventory/ -│ │ └── hosts.yml +│ │ │ └── gateway.conf # Production VyOS config (loaded manually at install) +│ │ ├── ansible/ +│ │ │ ├── playbooks/ +│ │ │ │ └── deploy.yml # Ansible playbook for config updates +│ │ │ └── inventory/ +│ │ │ └── hosts.yml +│ │ ├── scripts/ +│ │ │ └── iso-to-container.sh # Convert ISO to container for testing +│ │ └── tests/ +│ │ ├── topology.clab.yml # Containerlab test topology +│ │ └── test_*.py # Integration tests │ │ │ ├── compute/ │ │ └── talos/ @@ -206,20 +205,7 @@ lab/ │ ├── genesis/ # Runbooks and scripts │ ├── README.md # Overview and prerequisites - │ ├── 01-build-vyos-image.md # Build VyOS image with vyos-build - │ ├── 02-seed-cluster.md # Create Talos VM on NAS - │ ├── 03-deploy-argocd.md # Manual Argo CD install - │ 
├── 04-apply-bootstrap.md # Apply bootstrap Application - │ ├── 05-vyos-provisioning.md # Wait for VyOS to PXE boot - │ ├── 06-um760-provisioning.md # Wait for UM760 to PXE boot - │ ├── 07-migrate-to-um760.md # Drain NAS, migrate to UM760 - │ ├── 08-deploy-platform.md # Delete bootstrap, apply full platform - │ ├── 09-provision-harvester.md # Tinkerbell provisions MS-02s - │ ├── 10-expand-platform.md # Add CP-2, CP-3 VMs │ └── scripts/ - │ ├── build-vyos-image.sh # Runs vyos-build to create VyOS image - │ ├── generate-talos-config.sh # Runs talhelper - │ ├── create-seed-vm.sh # Creates Talos VM on NAS │ └── install-argocd.sh # Helm install Argo CD │ └── recovery/ @@ -563,25 +549,22 @@ infrastructure/ VyOS provides the lab's core networking: routing, firewall, DHCP, and VPN. -**Bootstrap Image (vyos-build):** -- VyOS is provisioned via Tinkerbell during genesis bootstrap -- The `vyos-build` toolchain builds a raw disk image with configuration baked in -- Image includes: VLANs, DHCP relay, BGP peering config, firewall rules, and SSH credentials -- Built once during initial bootstrap; stored on NAS for Tinkerbell to serve -- Future configuration changes use the Ansible CI/CD pipeline (not image rebuild) +**Bootstrap (Manual Install):** +- VyOS is installed manually from the official Stream ISO during genesis bootstrap +- Configuration is loaded from `infrastructure/network/vyos/configs/gateway.conf` via USB +- This is a one-time manual step; VyOS must be operational before the platform cluster can bootstrap +- See [Appendix B: Bootstrap Procedure](B_bootstrap_procedure.md) for step-by-step instructions -**VyOS Build (`infrastructure/network/vyos/vyos-build/`):** -- `build-flavors/gateway.toml` - Build flavor defining config.boot content -- `scripts/generate-flavor.sh` - Injects SSH credentials from SOPS secrets +**Configuration (`infrastructure/network/vyos/configs/gateway.conf`):** +- Declarative VyOS configuration file (curly-brace format) +- Includes: VLANs, DHCP 
relay, BGP peering config, firewall rules, NAT **Ongoing Management:** -- Configuration stored as declarative VyOS config file -- Deployed via Ansible playbook -- GitHub Action validates config on PR -- GitHub Action deploys config on merge to main +- Configuration changes are deployed via Ansible playbook +- GitHub Action validates config syntax on PR +- GitHub Action deploys config on merge to master (via Tailscale) **Workflow Files:** -- `.github/workflows/vyos-build.yml` - Builds VyOS image via vyos-build - `.github/workflows/vyos-validate.yml` - Validates VyOS config on PR - `.github/workflows/vyos-deploy.yml` - Deploys VyOS config on merge @@ -699,27 +682,18 @@ See [Appendix B: Bootstrap Procedure](B_bootstrap_procedure.md) for details. ### Genesis (`bootstrap/genesis/`) -Step-by-step runbooks and scripts for bootstrapping the lab from scratch. +Scripts for bootstrapping the lab from scratch. The bootstrap procedure uses an embedded ISO approach - no seed cluster or PXE is required for the initial VyOS and UM760 nodes. -**Runbooks (in order):** +**Bootstrap Flow:** +1. Install VyOS manually from Stream ISO, load `gateway.conf` from USB +2. Boot UM760 from embedded Talos ISO (config baked in via `labctl images sync`) +3. Deploy Argo CD via `install-argocd.sh` +4. GitOps takes over from there -1. `01-build-vyos-image.md` - Build VyOS image with vyos-build (bakes in initial config) -2. `02-seed-cluster.md` - Create Talos VM on NAS -3. `03-deploy-argocd.md` - Install Argo CD manually via Helm -4. `04-apply-bootstrap.md` - Apply bootstrap Application pointing to `bootstrap/seed/` -5. `05-vyos-provisioning.md` - Wait for VyOS to PXE boot (establishes lab networking) -6. `06-um760-provisioning.md` - Wait for UM760 to PXE boot and join cluster -7. `07-migrate-to-um760.md` - Drain NAS, migrate workloads to UM760 -8. `08-deploy-platform.md` - Delete bootstrap App, deploy full platform via XRs -9. 
`09-provision-harvester.md` - Use Tinkerbell to provision MS-02 nodes with Harvester -10. `10-expand-platform.md` - Create CP-2/CP-3 VMs on Harvester, expand platform to 3 nodes +See [Appendix B: Bootstrap Procedure](B_bootstrap_procedure.md) for complete step-by-step instructions. **Scripts:** - -- `build-vyos-image.sh` - Runs vyos-build to create VyOS raw disk image -- `generate-talos-config.sh` - Runs talhelper to generate machine configs -- `create-seed-vm.sh` - Creates Talos VM on NAS -- `install-argocd.sh` - Installs Argo CD via Helm +- `install-argocd.sh` - Installs Argo CD via Helm on the single-node platform cluster ### Recovery (`bootstrap/recovery/`) diff --git a/docs/architecture/appendices/B_bootstrap_procedure.md b/docs/architecture/appendices/B_bootstrap_procedure.md index 3be19c2..13e09a5 100644 --- a/docs/architecture/appendices/B_bootstrap_procedure.md +++ b/docs/architecture/appendices/B_bootstrap_procedure.md @@ -78,26 +78,26 @@ When the UM760 boots from this ISO: ## Bootstrap Phases -The bootstrap is divided into 4 phases, spanning 16 discrete steps. +The bootstrap is divided into 4 phases, spanning 14 discrete steps. 
``` Phase 1: Direct Boot (UM760) - Steps 1-4: Build images, boot UM760 from embedded ISO, deploy Argo CD - Duration: ~30 minutes + Steps 1-4: Build images, install VyOS, boot UM760 from embedded ISO, deploy Argo CD + Duration: ~40 minutes Result: Single-node platform cluster running on UM760 Phase 2: Single-Node Platform (UM760) - Steps 5-9: Deploy full platform with Crossplane/CAPI/Tinkerbell, provision Harvester + Steps 5-7: Deploy full platform with Crossplane/CAPI/Tinkerbell, provision Harvester Duration: ~2 hours Result: Platform cluster with Crossplane, CAPI, Harvester provisioned Phase 3: Harvester Online - Steps 10-13: Register Harvester, create platform VMs + Steps 8-11: Register Harvester, create platform VMs Duration: ~30 minutes Result: CP-2 and CP-3 join platform cluster Phase 4: Full Platform (3-Node HA) - Steps 14-16: Deploy remaining services, steady state + Steps 12-14: Deploy remaining services, steady state Duration: ~30 minutes Result: Full platform operational, ready for tenant clusters ``` @@ -156,20 +156,74 @@ The embedded ISO approach removes the complexity of the previous seed cluster me - `talos/talos-1.9.1-metal-amd64.iso` uploaded to S3 (vanilla, for other uses) - `vyos/vyos-2025.11-generic-amd64.iso` uploaded to S3 -### Step 2: Prepare VyOS Router +### Step 2: Install and Configure VyOS Router + +**Purpose:** Establish lab networking before UM760 bootstrap. VyOS is foundational infrastructure that must be operational before the platform cluster can bootstrap. -**Purpose:** Establish lab networking before UM760 bootstrap. +**Why Manual Installation:** +VyOS provides the network connectivity (NAT, DHCP relay, inter-VLAN routing) required for the UM760 to pull container images during platform bootstrap. This creates a dependency: Tinkerbell cannot provision VyOS because Tinkerbell runs on the platform cluster, which needs VyOS to bootstrap. 
Therefore, VyOS is installed manually from the Stream ISO, similar to how the home router (CCR2004) is configured manually. + +**Prerequisites:** +- VyOS Stream ISO downloaded via `labctl images sync` (stored in S3) +- USB drive (8GB+) for VyOS installation +- USB drive or network share with `gateway.conf` configuration file **Mechanism:** -- VyOS is pre-configured (either from previous install or manual setup) -- Required VLANs: 10 (mgmt), 20 (services), 30 (platform), 40 (cluster), 60 (storage) -- UM760 must have network connectivity on VLAN 30 +1. Download VyOS Stream ISO from S3: `vyos/vyos-2025.11-generic-amd64.iso` +2. Write ISO to USB drive: + ```bash + sudo dd if=vyos-2025.11-generic-amd64.iso of=/dev/sdX bs=4M status=progress + ``` +3. Copy `infrastructure/network/vyos/configs/gateway.conf` to a second USB drive +4. Boot VP6630 from VyOS USB drive +5. Run VyOS installation: + ```bash + install image + ``` + - Follow prompts (accept defaults for disk, partition, etc.) + - Set initial password when prompted +6. Reboot into installed VyOS +7. Load production configuration: + ```bash + configure + # Mount USB with config file + sudo mount /dev/sdb1 /mnt + load /mnt/gateway.conf + commit + save + ``` +8. Add SSH public key for management: + ```bash + set system login user vyos authentication public-keys admin key '<base64-public-key-body>' + set system login user vyos authentication public-keys admin type ssh-ed25519 + commit + save + ``` + +**Configuration Source:** +The production VyOS configuration is maintained in Git at `infrastructure/network/vyos/configs/gateway.conf`. 
This file contains: +- Interface configuration (WAN on eth4, trunk on eth5 with all VLANs) +- Firewall rules (WAN isolation, lab network policies) +- NAT (masquerade for lab → internet) +- BGP configuration (for Cilium LoadBalancer VIPs) +- DHCP relay (forwards DHCP from VLANs 30/40 to Tinkerbell) +- DNS forwarding + +**Ongoing Management:** +After initial installation, VyOS configuration changes are managed via Ansible + GitHub Actions (see [ADR 003](../09_design_decisions/003_vyos_gitops.md)). The manual installation is a one-time bootstrap step. -**Note:** VyOS provisioning via Tinkerbell happens in Phase 2 after the platform cluster is running. For initial bootstrap, VyOS must already be configured. +**Timeline:** +- Write ISO to USB: ~2 minutes +- VyOS installation: ~5 minutes +- Load configuration: ~2 minutes +- **Total: ~10 minutes** **Result:** -- Lab networking operational -- UM760 can reach the network +- Lab networking fully operational +- All VLANs routable (10, 20, 30, 40, 50, 60) +- NAT providing internet access for lab networks +- DHCP relay ready for Tinkerbell +- UM760 can reach the network and pull container images ### Step 3: Boot UM760 from Embedded ISO @@ -313,27 +367,7 @@ clusters/platform/apps/tinkerbell/ - Hardware definitions registered for MS-02 nodes (Harvester cluster) - Workflows ready to provision Harvester -### Step 7: Provision VyOS via Tinkerbell - -**Purpose:** Provision the VyOS router using Tinkerbell now that the platform is running. - -**Mechanism:** -1. Power on VP6630 with PXE boot enabled -2. Tinkerbell detects hardware (MAC addresses match Hardware XRs) -3. Executes VyOS installation workflow: - - Downloads VyOS image - - Writes VyOS to disk - - Applies VyOS config - - Reboots into VyOS -4. VyOS boots with pre-configured lab networking - -**Note:** If VyOS was already manually configured in Phase 1, this step can be skipped or used to reprovision with the official workflow. 
- -**Result:** -- VyOS router provisioned via GitOps -- Lab networking fully managed - -### Step 8: Provision Harvester +### Step 7: Provision Harvester **Purpose:** Install Harvester OS on MS-02 nodes to create HCI cluster. @@ -376,7 +410,7 @@ clusters/platform/apps/tinkerbell/ - Need VMs on Harvester to add CP-2 and CP-3 - Harvester must be registered with Argo CD before it can be managed -### Step 9: Register Harvester with Argo CD +### Step 8: Register Harvester with Argo CD **Purpose:** Allow Argo CD to deploy resources to Harvester cluster. @@ -413,7 +447,7 @@ stringData: - Harvester appears in Argo CD cluster list - ApplicationSet can now route apps to Harvester -### Step 10: Argo CD Syncs `clusters/harvester/` +### Step 9: Argo CD Syncs `clusters/harvester/` **Purpose:** Deploy Harvester network configuration, VM images, and VM definitions. @@ -451,7 +485,7 @@ stringData: - Talos VM image available - CP-2 and CP-3 VMs created (powered off initially) -### Step 11: CP-2, CP-3 VMs Created +### Step 10: CP-2, CP-3 VMs Created **Purpose:** Create VirtualMachine resources on Harvester for platform cluster nodes. @@ -476,7 +510,7 @@ stringData: - VMs exist but not yet running - Ready for PXE boot -### Step 12: CP-2, CP-3 PXE Boot +### Step 11: CP-2, CP-3 PXE Boot **Purpose:** Provision CP-2 and CP-3 with Talos OS and join platform cluster. @@ -512,7 +546,7 @@ stringData: - Safe to deploy production workloads - Infrastructure complete, ready for tenant clusters -### Step 13: Deploy Remaining Platform Services +### Step 12: Deploy Remaining Platform Services **Purpose:** Activate all platform capabilities (observability, policy, etc.). @@ -543,7 +577,7 @@ stringData: - CAPI ready to provision tenant clusters - Platform services fully deployed -### Step 14: Steady State +### Step 13: Steady State **Purpose:** Validate that all platform components are healthy and operational. 
@@ -566,7 +600,7 @@ stringData: - All services healthy - Ready for tenant clusters -### Step 15: Tenant Clusters +### Step 14: Tenant Clusters **Purpose:** Begin provisioning application workload clusters. @@ -631,7 +665,9 @@ spec: │ → Runs transform hook to embed machine config │ │ → Uploads embedded ISO to S3 │ │ ↓ │ -│ Step 2: Prepare VyOS Router (manual or pre-existing) │ +│ Step 2: Install and Configure VyOS Router │ +│ → Boot from VyOS Stream ISO │ +│ → Load gateway.conf from USB │ │ ↓ │ │ Step 3: Boot UM760 from Embedded ISO │ │ → Talos reads embedded config │ @@ -651,9 +687,7 @@ spec: │ ↓ │ │ Step 6: Crossplane + Tinkerbell (XRD-based) │ │ ↓ │ -│ Step 7: Provision VyOS via Tinkerbell (optional) │ -│ ↓ │ -│ Step 8: Provision Harvester (Tinkerbell PXE boots MS-02 nodes) │ +│ Step 7: Provision Harvester (Tinkerbell PXE boots MS-02 nodes) │ │ → 3-node Harvester cluster online (~1.5 hours) │ │ │ │ Result: Platform cluster (1 node), Harvester cluster (3 nodes) │ @@ -662,16 +696,16 @@ spec: ┌─────────────────────────────────────────────────────────────────────────┐ │ PHASE 3: HARVESTER ONLINE │ │ │ -│ Step 9: Register Harvester with Argo CD (cluster Secret) │ +│ Step 8: Register Harvester with Argo CD (cluster Secret) │ │ ↓ │ -│ Step 10: Argo CD Syncs clusters/harvester/ │ +│ Step 9: Argo CD Syncs clusters/harvester/ │ │ → Networks (VLANs 10, 30, 40, 60) │ │ → Images (Talos 1.9) │ │ → VMs (CP-2, CP-3) │ │ ↓ │ -│ Step 11: CP-2, CP-3 VMs Created (powered off) │ +│ Step 10: CP-2, CP-3 VMs Created (powered off) │ │ ↓ │ -│ Step 12: CP-2, CP-3 PXE Boot → Talos installed → Join cluster │ +│ Step 11: CP-2, CP-3 PXE Boot → Talos installed → Join cluster │ │ │ │ Result: Platform cluster (3 nodes HA), Harvester cluster (3 nodes) │ └─────────────────────────────────────────────────────────────────────────┘ @@ -679,13 +713,13 @@ spec: ┌─────────────────────────────────────────────────────────────────────────┐ │ PHASE 4: FULL PLATFORM (3-NODE HA) │ │ │ -│ Step 13: Deploy 
Remaining Platform Services │ +│ Step 12: Deploy Remaining Platform Services │ │ → Observability (Prometheus, Grafana, Loki) │ │ → CAPI providers (Harvester, Talos) │ │ ↓ │ -│ Step 14: Steady State - Validate all components healthy │ +│ Step 13: Steady State - Validate all components healthy │ │ ↓ │ -│ Step 15: Tenant Clusters - Provision via TenantCluster XR │ +│ Step 14: Tenant Clusters - Provision via TenantCluster XR │ │ → media cluster (3 CP + 3 workers) │ │ → dev cluster (1 CP + 2 workers) │ │ → prod cluster (3 CP + 5 workers) │ @@ -701,20 +735,19 @@ spec: | Phase | Step | Name | Duration | Purpose | |:------|:-----|:-----|:---------|:--------| | 1 | 1 | Sync Images with labctl | 15 min | Build embedded ISO via transform hook | -| 1 | 2 | Prepare VyOS Router | 5 min | Ensure lab networking is ready | +| 1 | 2 | Install and Configure VyOS Router | 10 min | Boot from ISO, load gateway.conf | | 1 | 3 | Boot UM760 from Embedded ISO | 10 min | Bootstrap single-node platform cluster | | 1 | 4 | Deploy Argo CD | 5 min | Install GitOps controller | | 2 | 5 | Apply Platform Configuration | 10 min | Deploy CoreServices and PlatformServices XRs | | 2 | 6 | Crossplane + Tinkerbell (XRD) | 10 min | Deploy Tinkerbell via XRs | -| 2 | 7 | Provision VyOS via Tinkerbell | 6 min | (Optional) Reprovision VyOS via GitOps | -| 2 | 8 | Provision Harvester | 90 min | PXE boot MS-02 nodes with Harvester OS | -| 3 | 9 | Register Harvester with Argo CD | 5 min | Create cluster Secret for Harvester | -| 3 | 10 | Argo CD Syncs clusters/harvester/ | 5 min | Deploy networks, images, VM definitions | -| 3 | 11 | CP-2, CP-3 VMs Created | 5 min | Harvester creates VM resources | -| 3 | 12 | CP-2, CP-3 PXE Boot | 20 min | Provision VMs with Talos, join platform cluster | -| 4 | 13 | Deploy Remaining Platform Services | 15 min | Observability, CAPI providers | -| 4 | 14 | Steady State | 10 min | Validate all components healthy | -| 4 | 15 | Tenant Clusters | 30 min/cluster | Provision 
application workload clusters | +| 2 | 7 | Provision Harvester | 90 min | PXE boot MS-02 nodes with Harvester OS | +| 3 | 8 | Register Harvester with Argo CD | 5 min | Create cluster Secret for Harvester | +| 3 | 9 | Argo CD Syncs clusters/harvester/ | 5 min | Deploy networks, images, VM definitions | +| 3 | 10 | CP-2, CP-3 VMs Created | 5 min | Harvester creates VM resources | +| 3 | 11 | CP-2, CP-3 PXE Boot | 20 min | Provision VMs with Talos, join platform cluster | +| 4 | 12 | Deploy Remaining Platform Services | 15 min | Observability, CAPI providers | +| 4 | 13 | Steady State | 10 min | Validate all components healthy | +| 4 | 14 | Tenant Clusters | 30 min/cluster | Provision application workload clusters | **Total Duration (Phases 1-3):** ~3 hours **Phase 4:** Ongoing (as tenant clusters are added) @@ -744,15 +777,13 @@ Before beginning the bootstrap, ensure the following are in place: | 40 | 10.10.40.0/24 | Tenant clusters | DHCP (Tinkerbell) | | 60 | 10.10.60.0/24 | Storage replication | Static IPs | -**Note:** VyOS must be configured before UM760 bootstrap to provide network connectivity. It can be: -- Manually configured initially, then reprovisioned via Tinkerbell in Phase 2 -- Pre-configured from a previous installation +**Note:** VyOS must be installed and configured before UM760 bootstrap to provide network connectivity. See Step 2 for the manual installation procedure. 
### Software | Tool | Version | Purpose | |:-----|:--------|:--------| -| Docker | v24.0.0+ | Run Talos imager and vyos-build containers | +| Docker | v24.0.0+ | Run Talos imager container | | labctl | latest | Sync images with transform hooks | | talhelper | v3.0.0+ | Generate Talos machine configs | | SOPS | v3.9.0+ | Encrypt Talos secrets | diff --git a/docs/design/image-pipeline.md b/docs/design/image-pipeline.md index 8a71e55..7a35bde 100644 --- a/docs/design/image-pipeline.md +++ b/docs/design/image-pipeline.md @@ -10,7 +10,6 @@ * **Language/Stack:** Go 1.23+, GitHub Actions, iDrive e2, Synology Cloud Sync, Mergify * **Relevant Files:** - * `infrastructure/network/vyos/vyos-build/` - VyOS image build using vyos-build toolchain * `docs/architecture/08_concepts/storage.md` - NFS storage architecture * **Style Guide:** * Configuration files use YAML @@ -39,12 +38,12 @@ spec: algorithm: sha256 expected: sha256:def456... # Post-decompression checksum - # VyOS ISO for reference/manual builds - - name: vyos-iso + # VyOS Stream ISO for manual gateway installation + - name: vyos-stream source: - url: https://github.com/vyos/vyos-rolling-nightly-builds/releases/download/1.5-rolling-202412190007/vyos-1.5-rolling-202412190007-amd64.iso + url: https://github.com/vyos/vyos-stream/releases/download/2025.11/vyos-2025.11-stream-generic-amd64.iso checksum: sha256:abc123... - destination: vyos/vyos-1.5-rolling-202412190007.iso + destination: vyos/vyos-2025.11-stream-amd64.iso # Harvester ISO (no transformation) - name: harvester-1.4.0 @@ -136,12 +135,10 @@ tools/ images/ ├── images.yaml # Image manifest ├── e2.sops.yaml # e2 credentials (SOPS encrypted) -├── packer-ssh.sops.yaml # SSH keypair for image builds (SOPS encrypted) └── .sops.yaml # SOPS config (age + PGP keys) .github/workflows/ -├── images-sync.yml # Source image pipeline -└── vyos-build.yml # VyOS image build using vyos-build toolchain +└── images-sync.yml # Source image pipeline ``` ## 4. 
CLI Interface @@ -230,16 +227,6 @@ echo "files_changed=true" >> "$GITHUB_OUTPUT" │ 4. CLOUD SYNC │ │ └─> Synology pulls from e2 to /volume1/images/ │ │ │ -├─────────────────────────────────────────────────────────────────────────────┤ -│ Derived Images │ -├─────────────────────────────────────────────────────────────────────────────┤ -│ │ -│ 5. VYOS BUILD WORKFLOW (vyos-build.yml) │ -│ └─> Triggered by changes to vyos-build/ or configs/ │ -│ ├─> Run vyos-build in Docker container │ -│ ├─> Upload built image to e2 │ -│ └─> Cloud Sync pulls to NAS │ -│ │ └─────────────────────────────────────────────────────────────────────────────┘ ``` @@ -269,15 +256,15 @@ echo "files_changed=true" >> "$GITHUB_OUTPUT" } } -// For upload (local files, e.g., vyos-build output) +// For upload (local files, e.g., custom built images) { - "name": "vyos-gateway", + "name": "custom-image", "checksum": "sha256:def456...", "size": 8589934592, "uploadedAt": "2024-12-20T12:00:00Z", "source": { "type": "local", - "path": "/tmp/vyos-gateway.raw" + "path": "/tmp/custom-image.raw" } } ``` @@ -290,16 +277,14 @@ lab-images/ │ ├── talos/ │ │ └── talos-1.9.1-amd64.raw │ ├── vyos/ -│ │ ├── vyos-1.5-rolling-202412190007.iso # Source ISO -│ │ └── vyos-gateway.raw # Built by vyos-build +│ │ └── vyos-2025.11-stream-amd64.iso # VyOS Stream ISO │ └── harvester/ │ └── harvester-1.4.0-amd64.iso └── metadata/ ├── talos/ │ └── talos-1.9.1-amd64.raw.json ├── vyos/ - │ ├── vyos-1.5-rolling-202412190007.iso.json - │ └── vyos-gateway.raw.json + │ └── vyos-2025.11-stream-amd64.iso.json └── harvester/ └── harvester-1.4.0-amd64.iso.json ``` @@ -399,125 +384,7 @@ jobs: --sops-age-key-file /tmp/age-key.txt ``` -### 8.2 VyOS Build (vyos-build.yml) - -```yaml -name: Build VyOS Image - -on: - push: - branches: [master] - paths: - - 'infrastructure/network/vyos/vyos-build/**' - - 'infrastructure/network/vyos/configs/gateway.conf' - pull_request: - paths: - - 'infrastructure/network/vyos/vyos-build/**' - - 
'infrastructure/network/vyos/configs/gateway.conf' - workflow_dispatch: - inputs: - upload: - description: 'Upload image to e2 storage' - type: boolean - default: true - -concurrency: - group: vyos-build-${{ github.ref }} - cancel-in-progress: false - -jobs: - validate: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Validate flavor template - run: | - TEMPLATE="infrastructure/network/vyos/vyos-build/build-flavors/gateway.toml" - if [[ ! -f "${TEMPLATE}" ]]; then - echo "ERROR: Template file not found" - exit 1 - fi - if ! grep -q '%%SSH_KEY_TYPE%%' "${TEMPLATE}"; then - echo "ERROR: Template missing %%SSH_KEY_TYPE%% placeholder" - exit 1 - fi - if ! grep -q '%%SSH_PUBLIC_KEY%%' "${TEMPLATE}"; then - echo "ERROR: Template missing %%SSH_PUBLIC_KEY%% placeholder" - exit 1 - fi - - build: - if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' - runs-on: ubuntu-latest - needs: validate - steps: - - uses: actions/checkout@v4 - - - uses: actions/setup-go@v5 - with: - go-version: '1.23' - - - name: Build labctl - run: go build -o labctl ./tools/labctl - - - name: Install SOPS - run: | - curl -LO https://github.com/getsops/sops/releases/download/v3.9.2/sops-v3.9.2.linux.amd64 - chmod +x sops-v3.9.2.linux.amd64 - sudo mv sops-v3.9.2.linux.amd64 /usr/local/bin/sops - - - name: Write SOPS age key - run: | - echo "${{ secrets.SOPS_AGE_KEY }}" > /tmp/age-key.txt - chmod 600 /tmp/age-key.txt - - - name: Extract SSH public key - env: - SOPS_AGE_KEY_FILE: /tmp/age-key.txt - run: | - sops --decrypt \ - --extract '["ssh_public_key"]' images/packer-ssh.sops.yaml > /tmp/ssh_key.pub - - - name: Clone vyos-build - run: | - git clone -b current --single-branch --depth 1 \ - https://github.com/vyos/vyos-build.git /tmp/vyos-build - - - name: Generate build flavor - run: | - ./infrastructure/network/vyos/vyos-build/scripts/generate-flavor.sh \ - "$(cat /tmp/ssh_key.pub)" \ - /tmp/vyos-build/data/build-flavors/gateway.toml - - - name: Build 
VyOS image - run: | - VERSION="lab-$(date +%Y%m%d%H%M%S)" - docker run --rm --privileged \ - -v /tmp/vyos-build:/vyos \ - -v /dev:/dev \ - -w /vyos \ - vyos/vyos-build:current \ - bash -c "sudo ./build-vyos-image --architecture amd64 --build-by ci@lab.gilman.io --build-type release --version ${VERSION} gateway" - - RAW_FILE=$(find /tmp/vyos-build -maxdepth 1 -name "*.raw" -type f 2>/dev/null | head -1) - cp "${RAW_FILE}" /tmp/vyos-gateway.raw - - - name: Upload to e2 - if: github.event_name == 'push' || (github.event_name == 'workflow_dispatch' && inputs.upload) - run: | - ./labctl images upload \ - --credentials images/e2.sops.yaml \ - --sops-age-key-file /tmp/age-key.txt \ - --source /tmp/vyos-gateway.raw \ - --destination vyos/vyos-gateway.raw -``` - -**VyOS Build Process:** The workflow uses the official `vyos/vyos-build` Docker container -with build flavors. The `gateway.toml` flavor embeds the VyOS configuration directly into -the image, with SSH credentials injected via placeholder replacement. - -### 8.3 Mergify Configuration (.mergify.yml) +### 8.2 Mergify Configuration (.mergify.yml) ```yaml pull_request_rules: @@ -525,9 +392,8 @@ pull_request_rules: conditions: - author=github-actions[bot] - label=automated - - base=main + - base=master - "#approved-reviews-by>=0" # No approval required for bot PRs - - "check-success=Build VyOS Image / validate" actions: merge: method: squash @@ -537,8 +403,6 @@ pull_request_rules: {{ body }} ``` -**Check Name Format:** `Workflow Name / Job Name` - ## 9. Security ### SOPS-Encrypted Credentials @@ -557,41 +421,6 @@ sops: encrypted_regex: ^(access_key|secret_key)$ ``` -### SOPS-Encrypted SSH Keypair - -Used by VyOS builds for image provisioning. The public key is baked into the image; the private key is stored for future use (e.g., post-build testing). 
- -```bash -# Generate keypair (filename kept as packer-ssh for compatibility) -ssh-keygen -t ed25519 -f packer-ssh -N "" -C "vyos-ci" - -# Create SOPS file -cat > images/packer-ssh.sops.yaml << 'EOF' -ssh_public_key: "ssh-ed25519 AAAA... vyos-ci" -ssh_private_key: | - -----BEGIN OPENSSH PRIVATE KEY----- - ... - -----END OPENSSH PRIVATE KEY----- -EOF - -# Encrypt -sops --encrypt --in-place images/packer-ssh.sops.yaml -``` - -```yaml -# images/packer-ssh.sops.yaml (encrypted) -ssh_public_key: ENC[AES256_GCM,data:...,type:str] -ssh_private_key: ENC[AES256_GCM,data:...,type:str] # Optional: for future use -sops: - age: - - recipient: age1... # CI key - pgp: - - XXXX... # Yubikey - encrypted_regex: ^(ssh_public_key|ssh_private_key)$ -``` - -**Current Usage:** Only `ssh_public_key` is extracted during build. The private key is retained for potential future automation (e.g., post-build smoke tests). - ```yaml # images/.sops.yaml creation_rules: diff --git a/images/packer-ssh.sops.yaml b/images/packer-ssh.sops.yaml deleted file mode 100644 index 3f5eb52..0000000 --- a/images/packer-ssh.sops.yaml +++ /dev/null @@ -1,29 +0,0 @@ -ssh_public_key: ENC[AES256_GCM,data:DvFZz2d7nI+ijOicfHobn842tIk9TVUqorTho53pa6wOorrE5IgSIauVe3MPzlFft6AwsM6H4O+8YXdpanQxqSzWyfTJU/ZpLr0umacDj9Fnpx1+udra2apv,iv:v9xc1O5FqgDiLS918wTWNJ/gpU9nEitv7GdDIyU5l60=,tag:wmRAAxqVY/MUtOpmMpl59g==,type:str] -ssh_private_key: 
ENC[AES256_GCM,data:GjOrHGbeRyl17wz/+UjfKWdHKgz9FMLxsDId+33TOyV1564M0rKMyWu2exsVeNWyW3LeUO1ME73VGxQ2VLcaJrK03n2qeOZyL1lpoX+TIx4O4ESWUYwjCbHbFVapIR4UBNihdWykkipEcH9NFpFXTGgmN8AIO+ci0u8kmK59IaxY2g9Xx37EUTyR6k/2HB+VTYYi9M3EUZcom0QVbEJ2jfrrIWyiFwQntGA1UUeFErQb6IH+U8jFiJCpmr432V4OOQBc2fut0PSs3ODXP71DK0kHQZn+RmikUa3hGmCKdp8gJMieJ8YtJqMao6brwcEdL6mtpUnJ8VkMwkVOrLl92GJzt9RjourLD6PdE4p5+cLMEuLtnYA7HAI7i0XZjfz3mGDq0UQNFb7yGU1QlKaBtglroYcZAKos0C7PLYTCEUKlPOKjy0zQ06RTJhOaZjbErLZqAh+O9khIcKux5wQKfe1it+dr0GeEapndhLjYMiSK4rhEdX3dAWtRUWwa2McEpBJJ9xBlVjezu+SRDlnr,iv:0P4Q8X5A/nXs/fU33f/p688MSGEAhFfXLq8tf3Bwqho=,tag:Dl5rZXBENZHmPbjD25bs9Q==,type:str] -sops: - age: - - recipient: age1d9n4x345xfahp0wmjak4gjzawpjvmcxmyf5knct7yy84mznq2sdqp0dy2d - enc: | - -----BEGIN AGE ENCRYPTED FILE----- - YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBTSmR6WEZDUUI5eUJOS3h1 - bDNKZkZwb3p4eTlMWnZKSnZFb2ZOMnJJdUVrCmVnOHl5MVZSZ0JSbmVuVVlFcnkv - VzlCOWFzbEtJRHM0blZBWmpBU2lqblkKLS0tIHZjMlFSd0Qybk80WlJ0YnZXVjNJ - TUhnUFpFYWJlUFIxSkJtSWE0QTZtbEUKBSI9hBQ7t3SPwZ5LjJHqXPhOfyYBv8dy - g6ezYTNbWvNm271ZIQlcmTLEzOSgU8RVpQBKhpseR93VBeHxv7H/wQ== - -----END AGE ENCRYPTED FILE----- - lastmodified: "2025-12-20T06:02:26Z" - mac: ENC[AES256_GCM,data:xM65GlFnT6swZ272fYpMnUsLPh7AB6VqXKWQwSIPOygcGJTVixjlcKmvY8MvGjuIUxHTXLTKemtYFnMqEpOB+VOhfdwArCkx4pcmrBY+Ph7rsSuznbA2Gy70TfQ/8PU106F42Pm4/kWwcK+GgsDR7jDUc+BaH3lK9gWo3TBO7Gg=,iv:Hp2BFKyWKMLxbW0VzPnehqXny5ws43M3j+4jwSoyQFM=,tag:Ava3XuVXonUcae/fmMTvyA==,type:str] - pgp: - - created_at: "2025-12-20T06:06:06Z" - enc: |- - -----BEGIN PGP MESSAGE----- - - hF4DhYpGbIWgl5wSAQdARshxY9bCf+IVNO6aDa2OGZ0LcfKGmw0jL8JN6FiBozsw - C+U89kyT9e4LuXkU8hF+srF/i8vYYf01SC9XEAMWi9kAwyUoTswtKI97dGQJOhHI - 0l4BVJOf3LbjgMJqddxnk5Cr9TQtL2yqjKl5oQQs/J4DHMc4XTFQT1/EJKJxnMIJ - pHItDgER0wwQu4ZhPOljFMDUbBHMnj7s+WHWvyWmNVs+gcGDGD15ih/2rpEf276N - =8uxG - -----END PGP MESSAGE----- - fp: 3965F16E293466CFE77D47F38C15553EEB22DB2A - encrypted_regex: ^(ssh_public_key|ssh_private_key)$ - version: 3.11.0 diff --git 
a/infrastructure/network/vyos/Dockerfile.containerlab b/infrastructure/network/vyos/Dockerfile.containerlab index 37d9a29..db3843f 100644 --- a/infrastructure/network/vyos/Dockerfile.containerlab +++ b/infrastructure/network/vyos/Dockerfile.containerlab @@ -1,13 +1,16 @@ # VyOS Container Image for Containerlab Testing # -# This Dockerfile builds a container image from the VyOS squashfs filesystem -# produced by vyos-build. The container uses the same rootfs as the production -# raw disk image, ensuring test fidelity. +# This Dockerfile builds a container image from a VyOS squashfs filesystem. +# It can be used with a squashfs extracted from a VyOS ISO or from an +# official VyOS container. # # Usage: -# sqfs2tar build/live/filesystem.squashfs > rootfs.tar +# # From ISO: Extract squashfs from VyOS ISO, then: +# sqfs2tar path/to/filesystem.squashfs > rootfs.tar # docker build -t vyos-gateway:test -f Dockerfile.containerlab . # +# Alternative: Use the official vyos/vyos:current container directly with Containerlab. +# # The resulting image can be used with Containerlab for integration testing. 
FROM scratch diff --git a/infrastructure/network/vyos/configs/gateway.conf b/infrastructure/network/vyos/configs/gateway.conf index d49e7e5..6db1f8d 100644 --- a/infrastructure/network/vyos/configs/gateway.conf +++ b/infrastructure/network/vyos/configs/gateway.conf @@ -318,9 +318,8 @@ service { system { domain-name lab.gilman.io host-name gateway - /* SSH keys managed separately via Ansible or vyos-build + /* SSH keys managed separately via Ansible * Do not commit real keys to this file - * vyos-build: baked into image via build-flavors/gateway.toml * Ansible: deploy.yml -e ssh_public_key_file=~/.ssh/id_rsa.pub */ name-server 1.1.1.1 diff --git a/infrastructure/network/vyos/justfile b/infrastructure/network/vyos/justfile index 04b2c20..f48f346 100644 --- a/infrastructure/network/vyos/justfile +++ b/infrastructure/network/vyos/justfile @@ -8,7 +8,7 @@ KEY := "tests/.vyos-test-key" key: test -f "{{KEY}}" || ssh-keygen -t ed25519 -f "{{KEY}}" -N "" -C "vyos-ci" -# Render test config.boot from gateway.toml template +# Render test config.boot from gateway.conf with injected SSH key config: key tests/render-config-boot.sh "$(cat {{KEY}}.pub)" diff --git a/infrastructure/network/vyos/templates/gateway.toml b/infrastructure/network/vyos/templates/gateway.toml deleted file mode 100644 index ce69276..0000000 --- a/infrastructure/network/vyos/templates/gateway.toml +++ /dev/null @@ -1,338 +0,0 @@ -# VyOS Gateway Build Flavor -# Produces a raw disk image with lab configuration baked in -# Target: VP6630 (Minisforum) - Lab Gateway Router -# -# Usage: -# This is a template file - use generate-flavor.sh to create the final TOML -# with SSH credentials injected from SOPS secrets. 
- -# Output format: raw disk image for Tinkerbell/NAS deployment -image_format = "raw" - -# Image settings -disk_size = 8 # GB - -# Include QEMU guest agent for VM environments -packages = ["qemu-guest-agent"] - -# Boot settings - serial console for headless operation -[boot_settings] -console_type = "serial" -serial_console = "ttyS0,115200n8" - -# Default configuration - will be merged with SSH key at build time -# This serves as the base config.boot content -# -# IMPORTANT: The SSH key placeholder %%SSH_KEY_TYPE%% and %%SSH_PUBLIC_KEY%% -# will be replaced by the build script with actual values from SOPS secrets. -# -# The configuration below is based on infrastructure/network/vyos/configs/gateway.conf -# converted to VyOS config.boot format (nested curly braces) -default_config = ''' -firewall { - group { - network-group HOME_NETWORK { - network 192.168.0.0/24 - } - network-group LAB_NETWORKS { - network 10.10.0.0/16 - } - network-group RFC1918 { - network 10.0.0.0/8 - network 172.16.0.0/12 - network 192.168.0.0/16 - } - } - interface eth4 { - in { - name WAN_TO_LAB - } - local { - name LOCAL - } - out { - name LAB_TO_WAN - } - } - ipv4 { - name LAB_TO_WAN { - default-action accept - rule 10 { - action accept - description "Allow established/related" - state established - state related - } - rule 20 { - action drop - description "Block new connections to home network" - destination { - group { - network-group HOME_NETWORK - } - } - state new - } - } - name LOCAL { - default-action drop - rule 10 { - action accept - state established - state related - } - rule 20 { - action accept - description "Allow ICMP" - protocol icmp - } - rule 30 { - action accept - description "Allow SSH from lab" - destination { - port 22 - } - protocol tcp - source { - group { - network-group LAB_NETWORKS - } - } - } - rule 31 { - action accept - description "Allow SSH from home" - destination { - port 22 - } - protocol tcp - source { - group { - network-group HOME_NETWORK - } - } - } - 
rule 40 { - action accept - description "Allow DNS from lab" - destination { - port 53 - } - protocol udp - source { - group { - network-group LAB_NETWORKS - } - } - } - rule 50 { - action accept - description "Allow DHCP from lab" - destination { - port 67 - } - protocol udp - source { - group { - network-group LAB_NETWORKS - } - } - } - rule 60 { - action accept - description "Allow BGP from lab" - destination { - port 179 - } - protocol tcp - source { - group { - network-group LAB_NETWORKS - } - } - } - } - name WAN_TO_LAB { - default-action drop - rule 10 { - action accept - description "Allow established/related" - state established - state related - } - rule 20 { - action accept - description "Allow from home network" - source { - group { - network-group HOME_NETWORK - } - } - } - } - } -} -interfaces { - ethernet eth4 { - address 192.168.0.2/24 - description "WAN - Transit to Home (CCR2004)" - } - ethernet eth5 { - description "TRUNK - Lab Switch (CRS)" - vif 10 { - address 10.10.10.1/24 - description "LAB_MGMT - Infrastructure Management" - } - vif 20 { - address 10.10.20.1/24 - description "LAB_PROV - Provisioning (PXE)" - } - vif 30 { - address 10.10.30.1/24 - description "LAB_PLATFORM - Platform Cluster" - } - vif 40 { - address 10.10.40.1/24 - description "LAB_CLUSTER - Tenant Clusters" - } - vif 50 { - address 10.10.50.1/24 - description "LAB_SERVICE - Service VIPs (BGP)" - } - vif 60 { - address 10.10.60.1/24 - description "LAB_STORAGE - Storage Replication" - } - } -} -nat { - source { - rule 100 { - outbound-interface { - name eth4 - } - source { - address 10.10.0.0/16 - } - translation { - address masquerade - } - } - } -} -protocols { - bgp { - address-family { - ipv4-unicast { - network 10.10.50.0/24 { - } - } - } - neighbor 10.10.30.10 { - address-family { - ipv4-unicast { - } - } - description "platform-cp-1 (UM760)" - remote-as 64513 - shutdown - } - neighbor 10.10.30.11 { - address-family { - ipv4-unicast { - } - } - description 
"platform-cp-2" - remote-as 64513 - shutdown - } - neighbor 10.10.30.12 { - address-family { - ipv4-unicast { - } - } - description "platform-cp-3" - remote-as 64513 - shutdown - } - parameters { - bestpath { - as-path { - multipath-relax - } - } - router-id 10.10.50.1 - } - system-as 64512 - } - static { - route 0.0.0.0/0 { - next-hop 192.168.0.1 { - } - } - } -} -service { - dhcp-relay { - interface eth5.30 - interface eth5.40 - relay-options { - relay-agents-packets discard - } - server 10.10.20.10 - } - dhcp-server { - shared-network-name LAB_MGMT { - interface eth5.10 - subnet 10.10.10.0/24 { - lease 86400 - option { - default-router 10.10.10.1 - name-server 10.10.10.1 - } - range 0 { - start 10.10.10.200 - stop 10.10.10.250 - } - subnet-id 10 - } - } - } - dns { - forwarding { - allow-from 10.10.0.0/16 - listen-address 10.10.10.1 - listen-address 10.10.20.1 - listen-address 10.10.30.1 - listen-address 10.10.40.1 - listen-address 10.10.50.1 - system - } - } - ssh { - disable-password-authentication - port 22 - } -} -system { - domain-name lab.gilman.io - host-name gateway - login { - user vyos { - authentication { - public-keys admin { - key "%%SSH_PUBLIC_KEY%%" - type %%SSH_KEY_TYPE%% - } - } - } - } - name-server 1.1.1.1 - name-server 8.8.8.8 - ntp { - server time.cloudflare.com { - } - } - time-zone America/Los_Angeles -} -''' diff --git a/infrastructure/network/vyos/tests/render-config-boot.sh b/infrastructure/network/vyos/tests/render-config-boot.sh index 1f06b87..6c05837 100755 --- a/infrastructure/network/vyos/tests/render-config-boot.sh +++ b/infrastructure/network/vyos/tests/render-config-boot.sh @@ -1,10 +1,16 @@ #!/bin/bash +# Render config.boot for containerlab testing +# +# Takes gateway.conf as the base configuration and injects an SSH public key +# for test authentication. +# +# Usage: render-config-boot.sh set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="${SCRIPT_DIR}/.." 
-TEMPLATE_FILE="${REPO_ROOT}/templates/gateway.toml" +CONFIG_FILE="${REPO_ROOT}/configs/gateway.conf" OUTPUT_FILE="${SCRIPT_DIR}/config.boot" usage() { @@ -29,17 +35,34 @@ if [[ -z "${SSH_KEY_TYPE}" ]] || [[ -z "${SSH_KEY_BODY}" ]]; then exit 1 fi -if [[ ! -f "${TEMPLATE_FILE}" ]]; then - echo "ERROR: Template file not found: ${TEMPLATE_FILE}" +if [[ ! -f "${CONFIG_FILE}" ]]; then + echo "ERROR: Config file not found: ${CONFIG_FILE}" exit 1 fi -sed -n "/^default_config = '''$/,/^'''$/p" "${TEMPLATE_FILE}" \ - | sed '1d;$d' \ - | sed -e "s|%%SSH_KEY_TYPE%%|${SSH_KEY_TYPE}|g" \ - -e "s|%%SSH_PUBLIC_KEY%%|${SSH_KEY_BODY}|g" \ - > "${OUTPUT_FILE}" +# Start with the base gateway.conf +cp "${CONFIG_FILE}" "${OUTPUT_FILE}" +# Inject SSH key into the system login section +# Find the closing brace of the system block and insert login config before it +# Use temp file approach for portability (macOS vs GNU sed) +TEMP_FILE=$(mktemp) +sed '/^system {$/,/^}$/{ + /^}$/i\ + login {\ + user vyos {\ + authentication {\ + public-keys test {\ + key "'"${SSH_KEY_BODY}"'"\ + type '"${SSH_KEY_TYPE}"'\ + }\ + }\ + }\ + } +}' "${OUTPUT_FILE}" > "${TEMP_FILE}" +mv "${TEMP_FILE}" "${OUTPUT_FILE}" + +# Fix SELinux context if applicable (for container environments) if command -v getenforce >/dev/null 2>&1 && command -v chcon >/dev/null 2>&1; then if [[ "$(getenforce)" == "Enforcing" ]]; then if [[ "${EUID}" -ne 0 ]] && command -v sudo >/dev/null 2>&1; then