diff --git a/.config/nextest.toml b/.config/nextest.toml index 3ca6324e..7ff301cc 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -14,6 +14,11 @@ failure-output = "immediate" success-output = "never" status-level = "pass" +# Tests that pull OCI images need more time (especially on cold CI runners) +[[profile.default.overrides]] +filter = 'test(/digest_stability|oci_pull/)' +slow-timeout = { period = "300s", terminate-after = 2 } + # Profile for integration tests — run with limited parallelism due to QEMU/KVM resources [profile.integration] test-threads = 2 @@ -30,7 +35,12 @@ path = "junit.xml" store-success-output = true store-failure-output = true -# VM tests boot an ephemeral QEMU instance per test, limit parallelism +# Privileged tests boot an ephemeral QEMU instance per test — limit +# parallelism to avoid OOM kills on 16 GB CI runners. +# Requiring all 2 threads effectively serialises them. +# +# NOTE: use /regex/ syntax, not ~substring — the ~ operator treats ^ $ as +# literal characters, so ~^foo never matches anything. 
[[profile.integration.overrides]] -filter = 'test(~^vm_)' -threads-required = 4 +filter = 'test(/^privileged_/)' +threads-required = 2 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6651b615..5e9bdfe8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -51,6 +51,7 @@ jobs: - uses: actions/checkout@v6 - uses: bootc-dev/actions/bootc-ubuntu-setup@main - uses: dtolnay/rust-toolchain@stable + - uses: taiki-e/install-action@nextest - uses: Swatinem/rust-cache@v2 - run: just test-integration @@ -114,6 +115,7 @@ jobs: libvirt: true - uses: dtolnay/rust-toolchain@stable + - uses: taiki-e/install-action@nextest - uses: Swatinem/rust-cache@v2 diff --git a/Cargo.toml b/Cargo.toml index 06b9a600..d33e6c68 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,21 @@ [workspace] members = ["crates/*"] +# Exclude integration-tests from default `cargo test` — those require a +# built cfsctl binary and (for privileged tests) a VM. Run them via +# `just test-integration` or `just test-integration-vm` instead. 
+default-members = [ + "crates/cfsctl", + "crates/composefs", + "crates/composefs-boot", + "crates/composefs-fuse", + "crates/composefs-http", + "crates/composefs-ioctls", + "crates/composefs-oci", + "crates/composefs-setup-root", + "crates/cstorage", + "crates/erofs-debug", + "crates/splitfdstream", +] resolver = "2" [workspace.package] @@ -23,6 +39,9 @@ composefs-oci = { version = "0.3.0", path = "crates/composefs-oci", default-feat composefs-boot = { version = "0.3.0", path = "crates/composefs-boot", default-features = false } composefs-http = { version = "0.3.0", path = "crates/composefs-http", default-features = false } +# JSON-RPC with FD passing for userns helper +jsonrpc-fdpass = { version = "0.1.0", default-features = false } + [profile.dev.package.sha2] # this is *really* slow otherwise opt-level = 3 diff --git a/Justfile b/Justfile index fade3a7f..57e42de4 100644 --- a/Justfile +++ b/Justfile @@ -58,18 +58,37 @@ cfsctl_features := env("COMPOSEFS_CFSCTL_FEATURES", "pre-6.15") _test_image := if base_image =~ "debian" { "localhost/composefs-rs-test-debian:latest" } else if base_image =~ "stream9" { "localhost/composefs-rs-test-c9s:latest" } else { "localhost/composefs-rs-test:latest" } # Run unprivileged integration tests against the cfsctl binary (no root, no VM) -test-integration: build - CFSCTL_PATH=$(pwd)/target/debug/cfsctl cargo run -p integration-tests -- --skip privileged_ +# Prefers nextest for parallelism control and better UX; falls back to direct harness. 
+test-integration *ARGS: build + #!/usr/bin/env bash + set -euo pipefail + export CFSCTL_PATH=$(pwd)/target/debug/cfsctl + if command -v cargo-nextest &> /dev/null; then + cargo nextest run -p integration-tests -E 'not test(/^privileged_/)' {{ ARGS }} + else + cargo test -p integration-tests --test cfsctl-integration-tests -- --skip privileged_ {{ ARGS }} + fi # Build the test container image for VM-based integration tests _integration-container-build: podman build --build-arg base_image={{base_image}} --build-arg cfsctl_features={{cfsctl_features}} -t {{_test_image}} . # Run all integration tests including privileged VM tests (requires podman + libvirt) -test-integration-vm: build _integration-container-build - COMPOSEFS_TEST_IMAGE={{_test_image}} \ - CFSCTL_PATH=$(pwd)/target/debug/cfsctl \ - cargo run -p integration-tests +# Uses nextest with the integration profile for parallelism control of VM tests. +test-integration-vm *ARGS: build _integration-container-build + #!/usr/bin/env bash + set -euo pipefail + export COMPOSEFS_TEST_IMAGE={{_test_image}} + export CFSCTL_PATH=$(pwd)/target/debug/cfsctl + if command -v cargo-nextest &> /dev/null; then + cargo nextest run -P integration -p integration-tests {{ ARGS }} + else + cargo test -p integration-tests --test cfsctl-integration-tests -- {{ ARGS }} + fi + +# Install cargo-nextest if not already installed +install-nextest: + @which cargo-nextest > /dev/null 2>&1 || cargo install cargo-nextest --locked # Run everything: checks + full integration tests including VM ci: check test-integration-vm diff --git a/bootc/Justfile b/bootc/Justfile index f9ad20cc..8ad4a840 100644 --- a/bootc/Justfile +++ b/bootc/Justfile @@ -45,20 +45,22 @@ clone: # `rev` (done when the bootc branch is updated) busts the podman build # cache. This recipe just adds a [patch] override so Cargo resolves from # the bind-mounted local source instead of fetching from git. +# +# Errors if the composefs-rs tree has uncommitted changes. 
patch: clone #!/bin/bash set -euo pipefail - cd "$COMPOSEFS_BOOTC_PATH" - cfs_path="$_COMPOSEFS_SRC/crates/cfsctl" - - # Check if already patched (idempotent) - if grep -q 'Patched by composefs-rs' Cargo.toml 2>/dev/null; then - echo "bootc already patched for composefs-rs" - exit 0 + # Require a clean composefs-rs working tree so we test a real commit + if ! git -C "$_COMPOSEFS_SRC" diff --quiet HEAD 2>/dev/null; then + echo "error: composefs-rs has uncommitted changes — commit or stash first" >&2 + git -C "$_COMPOSEFS_SRC" status --short >&2 + exit 1 fi - echo "Patching bootc Cargo.toml to use $_COMPOSEFS_SRC" + cfs_path="$_COMPOSEFS_SRC/crates/cfsctl" + + cd "$COMPOSEFS_BOOTC_PATH" # Add or update the [patch] section with a path override patch_value="cfsctl = { path = \"${cfs_path}\" } # Patched by composefs-rs" @@ -85,7 +87,12 @@ patch: clone # We intentionally don't run `cargo update` here because it rewrites # the workspace dependency line in Cargo.toml (replacing git+rev with path). - echo "bootc patched successfully" + # Update the rev comment in the [patch] section so Cargo.toml actually + # changes when composefs-rs moves to a new commit. Since the file is + # part of the podman build context this busts the layer cache. 
+ _rev=$(git -C "$_COMPOSEFS_SRC" rev-parse HEAD) + sed -i "s/^# Patched by composefs-rs.*/# Patched by composefs-rs at ${_rev}/" Cargo.toml + echo "bootc patched for composefs-rs at ${_rev}" # Build sealed bootc image using local composefs-rs # The path dependency is auto-detected and bind-mounted by bootc's Justfile @@ -158,6 +165,6 @@ config: Example Usage: just bootc/build # Clone main, patch, and build - COMPOSEFS_BOOTC_REF=v1.2.0 just bootc/build # Use specific tag + COMPOSEFS_BOOTC_REF=v1.0.0 just bootc/build # Use specific tag COMPOSEFS_BOOTC_REF=refs/pull/1791/head just bootc/build # Use PR EOF diff --git a/contrib/packaging/install-test-deps.sh b/contrib/packaging/install-test-deps.sh index f414dcd0..f1be65e6 100755 --- a/contrib/packaging/install-test-deps.sh +++ b/contrib/packaging/install-test-deps.sh @@ -11,11 +11,12 @@ set -euo pipefail case "${ID}" in centos|fedora|rhel) - pkg_install composefs openssl + pkg_install composefs openssl podman skopeo xfsprogs ;; debian|ubuntu) pkg_install \ - openssl e2fsprogs bubblewrap openssh-server + openssl e2fsprogs bubblewrap openssh-server \ + podman skopeo # OSTree symlink targets — /root, /home, /srv, etc. are symlinks # into /var on OSTree systems, so the target directories must exist. 
diff --git a/crates/cfsctl/Cargo.toml b/crates/cfsctl/Cargo.toml index bfb56522..e9ca29bb 100644 --- a/crates/cfsctl/Cargo.toml +++ b/crates/cfsctl/Cargo.toml @@ -14,9 +14,10 @@ version.workspace = true path = "src/lib.rs" [features] -default = ['pre-6.15', 'oci'] +default = ['pre-6.15', 'oci', 'containers-storage'] http = ['composefs-http'] oci = ['composefs-oci'] +containers-storage = ['composefs-oci/containers-storage', 'cstorage'] rhel9 = ['composefs/rhel9'] 'pre-6.15' = ['composefs/pre-6.15'] @@ -29,8 +30,10 @@ composefs = { workspace = true } composefs-boot = { workspace = true } composefs-oci = { workspace = true, optional = true, features = ["boot"] } composefs-http = { workspace = true, optional = true } +cstorage = { path = "../cstorage", version = "0.3.0", features = ["userns-helper"], optional = true } env_logger = { version = "0.11.0", default-features = false } hex = { version = "0.4.0", default-features = false } +indicatif = { version = "0.17.0", default-features = false } rustix = { version = "1.0.0", default-features = false, features = ["fs", "process"] } serde = { version = "1.0", default-features = false, features = ["derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } diff --git a/crates/cfsctl/src/lib.rs b/crates/cfsctl/src/lib.rs index eaaf1fca..3a99800f 100644 --- a/crates/cfsctl/src/lib.rs +++ b/crates/cfsctl/src/lib.rs @@ -170,6 +170,30 @@ impl std::fmt::Display for OciReference { } } +/// CLI representation of [`composefs_oci::LocalFetchOpt`]. +#[cfg(feature = "oci")] +#[derive(Debug, Clone, Copy, Default, clap::ValueEnum)] +enum LocalFetchCli { + /// Do not use native containers-storage import; use skopeo. + #[default] + Disabled, + /// Use native import with reflink/hardlink/copy fallback. + Auto, + /// Use native import; error if zero-copy is not possible. 
+ Zerocopy, +} + +#[cfg(feature = "oci")] +impl From for composefs_oci::LocalFetchOpt { + fn from(cli: LocalFetchCli) -> Self { + match cli { + LocalFetchCli::Disabled => Self::Disabled, + LocalFetchCli::Auto => Self::IfPossible, + LocalFetchCli::Zerocopy => Self::ZeroCopy, + } + } +} + /// Common options for operations using OCI config manifest streams that may transform the image rootfs #[cfg(feature = "oci")] #[derive(Debug, Parser)] @@ -226,6 +250,10 @@ enum OciCommand { /// Also generate a bootable EROFS image from the pulled OCI image #[arg(long)] bootable: bool, + /// Controls whether containers-storage: references use the native + /// import path with zero-copy reflink/hardlink support. + #[arg(long, value_enum, default_value_t = LocalFetchCli::Disabled)] + local_fetch: LocalFetchCli, }, /// List all tagged OCI images in the repository #[clap(name = "images")] @@ -925,23 +953,23 @@ where ref image, name, bootable, + local_fetch, } => { // If no explicit name provided, use the image reference as the tag let tag_name = name.as_deref().unwrap_or(image); - let (result, stats) = - composefs_oci::pull_image(&repo, image, Some(tag_name), None).await?; + + let opts = composefs_oci::PullOptions { + local_fetch: local_fetch.into(), + ..Default::default() + }; + + let result = composefs_oci::pull(&repo, image, Some(tag_name), opts).await?; println!("manifest {}", result.manifest_digest); println!("config {}", result.config_digest); println!("verity {}", result.manifest_verity.to_hex()); println!("tagged {tag_name}"); - println!( - "objects {} copied, {} already present, {} bytes copied, {} bytes inlined", - stats.objects_copied, - stats.objects_already_present, - stats.bytes_copied, - stats.bytes_inlined, - ); + println!("objects {}", result.stats); if bootable { let image_verity = diff --git a/crates/cfsctl/src/main.rs b/crates/cfsctl/src/main.rs index 4873e608..83424c12 100644 --- a/crates/cfsctl/src/main.rs +++ b/crates/cfsctl/src/main.rs @@ -9,8 +9,20 @@ use 
cfsctl::App; use anyhow::Result; use clap::Parser; -#[tokio::main] -async fn main() -> Result<()> { +fn main() -> Result<()> { + // If we were spawned as a userns helper process, handle that and exit. + // This MUST be called before the tokio runtime is created. + #[cfg(feature = "containers-storage")] + cstorage::init_if_helper(); + + // Now we can create the tokio runtime for the main application + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()? + .block_on(async_main()) +} + +async fn async_main() -> Result<()> { env_logger::init(); let args = App::parse(); diff --git a/crates/composefs-oci/Cargo.toml b/crates/composefs-oci/Cargo.toml index 7898a68d..2273e39b 100644 --- a/crates/composefs-oci/Cargo.toml +++ b/crates/composefs-oci/Cargo.toml @@ -11,17 +11,21 @@ rust-version.workspace = true version.workspace = true [features] -test = ["tar", "composefs/test"] +default = ["containers-storage"] +test = ["tar", "rand", "composefs/test"] boot = ["composefs-boot"] +containers-storage = ["dep:cstorage", "dep:base64", "cstorage/userns-helper"] [dependencies] anyhow = { version = "1.0.87", default-features = false } fn-error-context = "0.2" async-compression = { version = "0.4.0", default-features = false, features = ["tokio", "zstd", "gzip"] } +base64 = { version = "0.22", default-features = false, features = ["std"], optional = true } bytes = { version = "1", default-features = false } composefs = { workspace = true } composefs-boot = { workspace = true, optional = true } containers-image-proxy = { version = "0.9.2", default-features = false } +cstorage = { path = "../cstorage", version = "0.3.0", optional = true } hex = { version = "0.4.0", default-features = false } indicatif = { version = "0.18.0", default-features = false, features = ["tokio"] } rustix = { version = "1.0.0", features = ["fs"] } @@ -29,16 +33,19 @@ serde = { version = "1.0", default-features = false, features = ["derive"] } thiserror = { version = "2.0.0", default-features = 
false } serde_json = { version = "1.0", default-features = false, features = ["std"] } sha2 = { version = "0.11.0", default-features = false } +rand = { version = "0.10.0", default-features = false, optional = true } tar = { version = "0.4.38", default-features = false, optional = true } tar-core = "0.1.0" tokio = { version = "1.24.2", features = ["macros", "rt-multi-thread"] } tokio-util = { version = "0.7", default-features = false, features = ["io"] } +tracing = { version = "0.1", default-features = false } [dev-dependencies] cap-std = { version = "4.0.0", default-features = false } cap-tempfile = { version = "4.0.0", default-features = false } similar-asserts = "1.7.0" tar = { version = "0.4.38", default-features = false } +rand = { version = "0.10.0", default-features = false } composefs = { workspace = true, features = ["test"] } composefs-boot = { workspace = true } once_cell = "1.21.3" diff --git a/crates/composefs-oci/src/cstor.rs b/crates/composefs-oci/src/cstor.rs new file mode 100644 index 00000000..00cdc768 --- /dev/null +++ b/crates/composefs-oci/src/cstor.rs @@ -0,0 +1,727 @@ +//! containers-storage integration for zero-copy layer import. +//! +//! This module provides functionality to import container images directly from +//! containers-storage (as used by podman/buildah) into composefs repositories. +//! It uses the cstorage crate to access the storage and leverages reflinks when +//! available to avoid copying file data, enabling efficient zero-copy extraction. +//! +//! This module requires the `containers-storage` feature to be enabled. +//! +//! The main entry point is [`import_from_containers_storage`], which takes an +//! image ID and imports all layers into the repository. +//! +//! # Overview +//! +//! When importing from containers-storage, we: +//! 1. Open the storage and locate the image +//! 2. For each layer, iterate through the tar-split metadata +//! 3. For large files (> INLINE_CONTENT_MAX_V0), reflink directly to objects/ +//! 4. 
For small files, embed inline in the splitstream +//! 5. Handle overlay whiteouts properly +//! +//! # Rootless Support +//! +//! When running as an unprivileged user, files in containers-storage may have +//! restrictive permissions (e.g., `/etc/shadow` with mode 0600 owned by remapped +//! UIDs). In this case, we spawn a helper process via `podman unshare` that can +//! read all files, and it streams the content back to us via a Unix socket with +//! file descriptor passing. +//! +//! # Example +//! +//! ```ignore +//! use composefs_oci::cstor::import_from_containers_storage; +//! +//! let repo = Arc::new(Repository::open_user()?); +//! // No explicit storage root and no additional image stores: auto-discover. +//! let (result, stats) = import_from_containers_storage(&repo, "sha256:abc123...", None, false, None, &[]).await?; +//! let (_manifest, (config_digest, _config_verity)) = result; +//! println!("Imported config: {}", config_digest); +//! println!("Stats: {:?}", stats); +//! ``` + +use std::os::unix::fs::FileExt; +use std::os::unix::io::OwnedFd; +use std::sync::Arc; + +use anyhow::{Context, Result}; +use base64::Engine; +use indicatif::{ProgressBar, ProgressStyle}; + +use composefs::{ + INLINE_CONTENT_MAX_V0, + fsverity::FsVerityHashValue, + repository::{ImportContext, ObjectStoreMethod, Repository}, +}; + +use cstorage::{ + Image, Layer, ProxiedTarSplitItem, Storage, StorageProxy, TarSplitFdStream, TarSplitItem, + can_bypass_file_permissions, +}; + +// Re-export init_if_helper for consumers that need userns helper support +pub use cstorage::init_if_helper; + +use crate::oci_image::manifest_identifier; +use crate::skopeo::{OCI_CONFIG_CONTENT_TYPE, OCI_MANIFEST_CONTENT_TYPE, TAR_LAYER_CONTENT_TYPE}; +use crate::{ContentAndVerity, ImportStats, OciDigest, config_identifier, layer_identifier}; + +/// Full result of a cstor import: manifest and config digests + verities. +type CstorImportResult = (ContentAndVerity, ContentAndVerity); + +/// Zero padding buffer for tar block alignment (512 bytes max needed). 
+const ZERO_PADDING: [u8; 512] = [0u8; 512]; + +/// Import a container image from containers-storage into the composefs repository. +/// +/// This function reads an image from the local containers-storage (podman/buildah) +/// and imports all layers using reflinks when possible, avoiding data duplication. +/// It creates full OCI structure (manifest + config + layers) matching the skopeo +/// import path. +/// +/// For rootless access, this function will automatically spawn a userns helper +/// process via `podman unshare` to read files with restrictive permissions. +/// +/// # Arguments +/// * `repo` - The composefs repository to import into +/// * `image_id` - The image ID (sha256 digest or name) to import +/// * `reference` - Optional reference name to assign to the imported image +/// * `zerocopy` - If true, error instead of falling back to copy (reflink or hardlink required) +/// * `storage_root` - Explicit storage root; skips auto-discovery when set +/// * `additional_image_stores` - Additional read-only image stores (appended after the primary store) +/// +/// # Returns +/// A tuple of ((manifest_digest, manifest_verity), (config_digest, config_verity)) +/// plus import stats. 
+pub async fn import_from_containers_storage( + repo: &Arc>, + image_id: &str, + reference: Option<&str>, + zerocopy: bool, + storage_root: Option<&std::path::Path>, + additional_image_stores: &[&std::path::Path], +) -> Result<(CstorImportResult, ImportStats)> { + // Check if we can access files directly or need a proxy + if can_bypass_file_permissions() { + // Direct access - use blocking implementation + let repo = Arc::clone(repo); + let image_id = image_id.to_owned(); + let reference = reference.map(|s| s.to_owned()); + let storage_root = storage_root.map(|p| p.to_path_buf()); + let additional_image_stores: Vec = additional_image_stores + .iter() + .map(|p| p.to_path_buf()) + .collect(); + + tokio::task::spawn_blocking(move || { + import_from_containers_storage_direct( + &repo, + &image_id, + reference.as_deref(), + zerocopy, + storage_root.as_deref(), + &additional_image_stores, + ) + }) + .await + .context("spawn_blocking failed")? + } else { + // The proxied (rootless) path uses a userns helper process that does + // its own storage discovery. Explicit storage paths are not yet + // plumbed through the proxy protocol. + if storage_root.is_some() || !additional_image_stores.is_empty() { + anyhow::bail!( + "storage_root and additional_image_stores are not supported in rootless mode" + ); + } + import_from_containers_storage_proxied(repo, image_id, reference, zerocopy).await + } +} + +/// Direct (privileged) implementation of containers-storage import. +/// +/// All file I/O operations in this function are blocking, so it must be called +/// from a blocking context (e.g., via `spawn_blocking`). 
+fn import_from_containers_storage_direct( + repo: &Arc>, + image_id: &str, + reference: Option<&str>, + zerocopy: bool, + storage_root: Option<&std::path::Path>, + additional_image_stores: &[std::path::PathBuf], +) -> Result<(CstorImportResult, ImportStats)> { + let mut stats = ImportStats::default(); + let mut ctx = ImportContext::default(); + + // Build the list of stores to search. When an explicit root is given, + // skip auto-discovery entirely; otherwise discover from the standard + // locations and $STORAGE_OPTS. + let mut stores = if let Some(root) = storage_root { + vec![ + Storage::open(root) + .with_context(|| format!("Failed to open storage root at {}", root.display()))?, + ] + } else { + // Auto-discover; tolerate failure only if extra stores are provided. + match Storage::discover_all() { + Ok(s) => s, + Err(e) if !additional_image_stores.is_empty() => { + tracing::warn!( + "containers-storage auto-discovery failed ({e:#}), \ + using additional image stores only" + ); + Vec::new() + } + Err(e) => return Err(e).context("Failed to discover containers-storage"), + } + }; + for path in additional_image_stores { + stores.push(Storage::open(path).with_context(|| { + format!( + "Failed to open additional image store at {}", + path.display() + ) + })?); + } + + // Search all stores for the image (primary first, then additional image stores) + let (_image_store, image) = stores + .iter() + .find_map(|s| { + Image::open(s, image_id) + .or_else(|_| s.find_image_by_name(image_id)) + .ok() + .map(|img| (s, img)) + }) + .with_context(|| format!("Failed to find image {image_id} in any storage"))?; + + // Get the storage layer IDs — layers may span stores (e.g. 
base layers + // in an additional image store, new layers in the primary) + let storage_layer_ids = image + .storage_layer_ids(&stores) + .context("Failed to get storage layer IDs from image")?; + + // Get the config to access diff_ids + let config = image.config().context("Failed to read image config")?; + let diff_ids: Vec = config + .rootfs() + .diff_ids() + .iter() + .map(|s| s.parse::().context("parsing diff_id")) + .collect::>()?; + + // Ensure layer count matches + anyhow::ensure!( + storage_layer_ids.len() == diff_ids.len(), + "Layer count mismatch: {} layers in storage, {} diff_ids in config", + storage_layer_ids.len(), + diff_ids.len() + ); + + stats.layers = storage_layer_ids.len() as u64; + + // Import each layer with progress bar + let progress = ProgressBar::new(storage_layer_ids.len() as u64); + progress.set_style( + ProgressStyle::default_bar() + .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} {msg}") + .expect("valid template") + .progress_chars("=>-"), + ); + + let mut layer_refs = Vec::with_capacity(storage_layer_ids.len()); + for (storage_layer_id, diff_id) in storage_layer_ids.iter().zip(diff_ids.iter()) { + let content_id = layer_identifier(diff_id); + let diff_id_str: &str = diff_id.as_ref(); + let short_id = diff_id_str.get(..19).unwrap_or(diff_id_str); + + let layer_verity = if let Some(existing) = repo.has_stream(&content_id)? 
{ + progress.set_message(format!("Already have {short_id}...")); + stats.layers_already_present += 1; + existing + } else { + progress.set_message(format!("Importing {short_id}...")); + let (layer_store, layer) = stores + .iter() + .find_map(|s| Layer::open(s, storage_layer_id).ok().map(|l| (s, l))) + .with_context(|| format!("Failed to open layer {}", storage_layer_id))?; + let (verity, layer_stats) = + import_layer_direct(repo, layer_store, &layer, diff_id, zerocopy, &mut ctx)?; + stats.merge(&layer_stats); + verity + }; + + layer_refs.push((diff_id.clone(), layer_verity)); + progress.inc(1); + } + progress.finish_with_message("Layers imported"); + + finalize_import(repo, &image, &layer_refs, reference, &progress, stats) +} + +/// Proxied (rootless) implementation of containers-storage import. +/// +/// This spawns a helper process via `podman unshare` that can read all files +/// in containers-storage, and communicates with it via Unix socket + fd passing. +async fn import_from_containers_storage_proxied( + repo: &Arc>, + image_id: &str, + reference: Option<&str>, + zerocopy: bool, +) -> Result<(CstorImportResult, ImportStats)> { + let mut stats = ImportStats::default(); + let mut ctx = ImportContext::default(); + + // Spawn the proxy helper + let mut proxy = StorageProxy::spawn() + .await + .context("Failed to spawn userns helper")? 
+ .context("Expected proxy but got None")?; + + // Discover storage paths (primary + additional from $STORAGE_OPTS) + let storage_paths = discover_storage_paths()?; + + // Search all storage paths for the image + let mut image_info = None; + let mut found_storage_path = String::new(); + for path in &storage_paths { + match proxy.get_image(path, image_id).await { + Ok(info) => { + found_storage_path = path.clone(); + image_info = Some(info); + break; + } + Err(_) => continue, + } + } + let image_info = + image_info.with_context(|| format!("Failed to find image {} in any storage", image_id))?; + let storage_path = found_storage_path; + + // Ensure layer count matches + anyhow::ensure!( + image_info.storage_layer_ids.len() == image_info.layer_diff_ids.len(), + "Layer count mismatch: {} layers in storage, {} diff_ids in config", + image_info.storage_layer_ids.len(), + image_info.layer_diff_ids.len() + ); + + stats.layers = image_info.storage_layer_ids.len() as u64; + + // Import each layer with progress bar + let progress = ProgressBar::new(image_info.storage_layer_ids.len() as u64); + progress.set_style( + ProgressStyle::default_bar() + .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} {msg}") + .expect("valid template") + .progress_chars("=>-"), + ); + + let mut layer_refs = Vec::with_capacity(image_info.storage_layer_ids.len()); + + for (storage_layer_id, diff_id) in image_info + .storage_layer_ids + .iter() + .zip(image_info.layer_diff_ids.iter()) + { + let content_id = layer_identifier(diff_id); + let diff_id_str: &str = diff_id.as_ref(); + let short_id = diff_id_str.get(..19).unwrap_or(diff_id_str); + + let layer_verity = if let Some(existing) = repo.has_stream(&content_id)? 
{ + progress.set_message(format!("Already have {short_id}...")); + stats.layers_already_present += 1; + existing + } else { + progress.set_message(format!("Importing {short_id}...")); + let (verity, layer_stats) = import_layer_proxied( + repo, + &mut proxy, + &storage_path, + storage_layer_id, + diff_id, + zerocopy, + &mut ctx, + ) + .await?; + stats.merge(&layer_stats); + verity + }; + + layer_refs.push((diff_id.clone(), layer_verity)); + progress.inc(1); + } + progress.finish_with_message("Layers imported"); + + // Config and manifest metadata don't have restrictive file permissions, + // so we can read them directly without the proxy. + let stores = Storage::discover_all().context("Failed to discover containers-storage")?; + let (_, image) = stores + .iter() + .find_map(|s| Image::open(s, &image_info.id).ok().map(|img| (s, img))) + .with_context(|| format!("Failed to open image {}", image_info.id))?; + + // Shutdown the proxy before the blocking finalization + proxy.shutdown().await.context("Failed to shutdown proxy")?; + + finalize_import(repo, &image, &layer_refs, reference, &progress, stats) +} + +/// Create config + manifest splitstreams, generate the EROFS image, and tag. +/// +/// This is the shared finalization step for both direct and proxied import +/// paths. By this point all layers are already imported; this function: +/// 1. Creates the config splitstream with layer references +/// 2. Creates the manifest splitstream +/// 3. Generates the composefs EROFS image and links it to the config +/// 4. 
Tags the manifest if a reference was provided +fn finalize_import( + repo: &Arc>, + image: &Image, + layer_refs: &[(OciDigest, ObjectID)], + reference: Option<&str>, + progress: &ProgressBar, + stats: ImportStats, +) -> Result<(CstorImportResult, ImportStats)> { + // Read the raw config JSON bytes from metadata + let config_key = format!("sha256:{}", image.id()); + let encoded_key = base64::engine::general_purpose::STANDARD.encode(config_key.as_bytes()); + let config_json = image + .read_metadata(&encoded_key) + .context("Failed to read config bytes")?; + let config_digest = crate::sha256_content_digest(&config_json); + let content_id = config_identifier(&config_digest); + + let config_verity = if let Some(existing) = repo.has_stream(&content_id)? { + progress.println(format!("Already have config {config_digest}")); + existing + } else { + progress.println(format!("Creating config splitstream {config_digest}")); + let mut writer = repo.create_stream(OCI_CONFIG_CONTENT_TYPE)?; + + for (diff_id, verity) in layer_refs { + let key: &str = diff_id.as_ref(); + writer.add_named_stream_ref(key, verity); + } + + writer.write_external(&config_json)?; + repo.write_stream(writer, &content_id, None)? + }; + + // Create the manifest splitstream (matching the skopeo path) + let manifest_json = image + .read_manifest_raw() + .context("Failed to read manifest bytes")?; + let manifest_digest = crate::sha256_content_digest(&manifest_json); + + let manifest_content_id = manifest_identifier(&manifest_digest); + let manifest_verity = if let Some(existing) = repo.has_stream(&manifest_content_id)? 
{ + progress.println(format!("Already have manifest {manifest_digest}")); + existing + } else { + progress.println(format!("Creating manifest splitstream {manifest_digest}")); + let mut writer = repo.create_stream(OCI_MANIFEST_CONTENT_TYPE)?; + + let config_ref_key = format!("config:{config_digest}"); + writer.add_named_stream_ref(&config_ref_key, &config_verity); + + for (diff_id, verity) in layer_refs { + let key: &str = diff_id.as_ref(); + writer.add_named_stream_ref(key, verity); + } + + writer.write_external(&manifest_json)?; + repo.write_stream(writer, &manifest_content_id, None)? + }; + + // Generate the composefs EROFS image and tag the manifest. + // Skip if the image already has an EROFS ref (idempotent re-import). + let existing_erofs = + crate::composefs_erofs_for_manifest(repo, &manifest_digest, Some(&manifest_verity))?; + if existing_erofs.is_none() { + let erofs = crate::ensure_oci_composefs_erofs( + repo, + &manifest_digest, + Some(&manifest_verity), + reference, + )?; + if erofs.is_none() { + // Not a container image (unlikely for cstor, but handle consistently) + if let Some(name) = reference { + crate::oci_image::tag_image(repo, &manifest_digest, name)?; + } + } + } else if let Some(name) = reference { + crate::oci_image::tag_image(repo, &manifest_digest, name)?; + } + + // Re-read verities: ensure_oci_composefs_erofs rewrites config and + // manifest splitstreams (adding the EROFS ref), so the verities captured + // above may be stale. + let config_verity = repo + .has_stream(&content_id)? + .context("config splitstream missing after finalization")?; + let manifest_verity = repo + .has_stream(&manifest_content_id)? + .context("manifest splitstream missing after finalization")?; + + Ok(( + ( + (manifest_digest, manifest_verity), + (config_digest, config_verity), + ), + stats, + )) +} + +/// Import a single layer directly (privileged mode). 
+fn import_layer_direct( + repo: &Arc>, + storage: &Storage, + layer: &Layer, + diff_id: &OciDigest, + zerocopy: bool, + ctx: &mut ImportContext, +) -> Result<(ObjectID, ImportStats)> { + let mut stats = ImportStats::default(); + let mut inline_buf = Vec::new(); + + let mut stream = TarSplitFdStream::new(storage, layer) + .with_context(|| format!("Failed to create tar-split stream for layer {}", layer.id()))?; + + let mut writer = repo.create_stream(TAR_LAYER_CONTENT_TYPE)?; + let content_id = layer_identifier(diff_id); + + // Track padding from previous file - tar-split bundles padding with the NEXT + // file's header in Segment entries, but we need to write padding immediately + // after file content (like tar.rs does) for consistent splitstream output. + let mut prev_file_padding: usize = 0; + + while let Some(item) = stream.next()? { + match item { + TarSplitItem::Segment(bytes) => { + // Skip the leading padding bytes (we already wrote them after prev file) + let header_bytes = &bytes[prev_file_padding..]; + stats.bytes_inlined += header_bytes.len() as u64; + writer.write_inline(header_bytes); + prev_file_padding = 0; + } + TarSplitItem::FileContent { fd, size, name } => { + process_file_content( + repo, + &mut writer, + &mut stats, + ctx, + fd, + size, + &name, + zerocopy, + &mut inline_buf, + )?; + + // Write padding inline immediately after file content + let padding_size = (size as usize).next_multiple_of(512) - size as usize; + if padding_size > 0 { + stats.bytes_inlined += padding_size as u64; + writer.write_inline(&ZERO_PADDING[..padding_size]); + } + prev_file_padding = padding_size; + } + } + } + + // Write the stream with the content identifier + let verity = repo.write_stream(writer, &content_id, None)?; + Ok((verity, stats)) +} + +/// Import a single layer via the proxy (rootless mode). 
+async fn import_layer_proxied( + repo: &Arc>, + proxy: &mut StorageProxy, + storage_path: &str, + layer_id: &str, + diff_id: &OciDigest, + zerocopy: bool, + ctx: &mut ImportContext, +) -> Result<(ObjectID, ImportStats)> { + let mut stats = ImportStats::default(); + let mut inline_buf = Vec::new(); + + let mut writer = repo.create_stream(TAR_LAYER_CONTENT_TYPE)?; + let content_id = layer_identifier(diff_id); + + // Track padding from previous file - tar-split bundles padding with the NEXT + // file's header in Segment entries, but we need to write padding immediately + // after file content (like tar.rs does) for consistent splitstream output. + let mut prev_file_padding: usize = 0; + + // Stream the layer via the proxy + let mut stream = proxy + .stream_layer(storage_path, layer_id) + .await + .with_context(|| format!("Failed to start streaming layer {}", layer_id))?; + + while let Some(item) = stream + .next() + .await + .with_context(|| format!("Failed to receive stream item for layer {}", layer_id))? 
+ { + match item { + ProxiedTarSplitItem::Segment(bytes) => { + // Skip the leading padding bytes (we already wrote them after prev file) + let header_bytes = &bytes[prev_file_padding..]; + stats.bytes_inlined += header_bytes.len() as u64; + writer.write_inline(header_bytes); + prev_file_padding = 0; + } + ProxiedTarSplitItem::FileContent { fd, size, name } => { + process_file_content( + repo, + &mut writer, + &mut stats, + ctx, + fd, + size, + &name, + zerocopy, + &mut inline_buf, + )?; + + // Write padding inline immediately after file content + let padding_size = (size as usize).next_multiple_of(512) - size as usize; + if padding_size > 0 { + stats.bytes_inlined += padding_size as u64; + writer.write_inline(&ZERO_PADDING[..padding_size]); + } + prev_file_padding = padding_size; + } + } + } + + // Write the stream with the content identifier + let verity = repo.write_stream(writer, &content_id, None)?; + Ok((verity, stats)) +} + +/// Process file content (shared between direct and proxied modes). 
+#[allow(clippy::too_many_arguments)] +fn process_file_content( + repo: &Arc>, + writer: &mut composefs::splitstream::SplitStreamWriter, + stats: &mut ImportStats, + ctx: &mut ImportContext, + fd: OwnedFd, + size: u64, + name: &str, + zerocopy: bool, + inline_buf: &mut Vec, +) -> Result<()> { + // Convert fd to File for operations + let file = std::fs::File::from(fd); + + if size as usize > INLINE_CONTENT_MAX_V0 { + // Large file: store as external object + let (object_id, method) = if zerocopy { + repo.ensure_object_from_file_zerocopy(&file, size, ctx) + } else { + repo.ensure_object_from_file(&file, size, ctx) + } + .with_context(|| format!("Failed to store object for {}", name))?; + + match method { + ObjectStoreMethod::Reflinked => { + stats.objects_reflinked += 1; + stats.bytes_reflinked += size; + } + ObjectStoreMethod::Hardlinked => { + stats.objects_hardlinked += 1; + stats.bytes_hardlinked += size; + } + ObjectStoreMethod::Copied => { + stats.objects_copied += 1; + stats.bytes_copied += size; + } + ObjectStoreMethod::AlreadyPresent => { + stats.objects_already_present += 1; + } + } + + writer.add_external_size(size); + writer.write_reference(object_id)?; + } else { + // Small file: read and embed inline (reuse buffer across calls) + inline_buf.resize(size as usize, 0); + file.read_exact_at(inline_buf, 0)?; + stats.bytes_inlined += size; + writer.write_inline(inline_buf); + } + + Ok(()) +} + +/// Discover storage paths: the primary store plus any additional image stores +/// from `$STORAGE_OPTS`. 
+fn discover_storage_paths() -> Result> { + let mut paths = Vec::new(); + + // Try user storage first (rootless podman) + if let Ok(home) = std::env::var("HOME") { + let user_path = format!("{}/.local/share/containers/storage", home); + if std::path::Path::new(&user_path).exists() { + paths.push(user_path); + } + } + + // Fall back to system storage + let system_path = "/var/lib/containers/storage"; + if std::path::Path::new(system_path).exists() { + paths.push(system_path.to_string()); + } + + // Also check $STORAGE_OPTS for additional image stores + if let Ok(opts) = std::env::var("STORAGE_OPTS") { + for item in opts.split(',') { + let item = item.trim(); + if let Some(path) = item.strip_prefix("additionalimagestore=") + && std::path::Path::new(path).exists() + { + paths.push(path.to_string()); + } + } + } + + anyhow::ensure!( + !paths.is_empty(), + "Could not find containers-storage at standard locations" + ); + Ok(paths) +} + +/// Check if an image reference uses the containers-storage transport. +/// +/// Returns the image ID portion if the reference starts with "containers-storage:", +/// otherwise returns None. 
+pub fn parse_containers_storage_ref(imgref: &str) -> Option<&str> { + imgref.strip_prefix("containers-storage:") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_containers_storage_ref() { + assert_eq!( + parse_containers_storage_ref("containers-storage:sha256:abc123"), + Some("sha256:abc123") + ); + assert_eq!( + parse_containers_storage_ref("containers-storage:quay.io/fedora:latest"), + Some("quay.io/fedora:latest") + ); + assert_eq!( + parse_containers_storage_ref("docker://quay.io/fedora:latest"), + None + ); + assert_eq!(parse_containers_storage_ref("sha256:abc123"), None); + } +} diff --git a/crates/composefs-oci/src/image.rs b/crates/composefs-oci/src/image.rs index 615c849c..14a8ae2f 100644 --- a/crates/composefs-oci/src/image.rs +++ b/crates/composefs-oci/src/image.rs @@ -557,4 +557,267 @@ mod test { Ok(()) } + + // --- Whiteout-specific tests --- + + #[test] + fn test_whiteout_file_removes_entry() -> Result<()> { + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/etc"))?; + process_entry(&mut fs, file_entry("/etc/hosts"))?; + process_entry(&mut fs, file_entry("/etc/passwd"))?; + assert_files(&fs, &["/", "/etc", "/etc/hosts", "/etc/passwd"])?; + + // Whiteout hosts — only hosts should be removed + process_entry(&mut fs, file_entry("/etc/.wh.hosts"))?; + assert_files(&fs, &["/", "/etc", "/etc/passwd"])?; + + Ok(()) + } + + #[test] + fn test_whiteout_nonexistent_file_is_noop() -> Result<()> { + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/etc"))?; + process_entry(&mut fs, file_entry("/etc/hosts"))?; + assert_files(&fs, &["/", "/etc", "/etc/hosts"])?; + + // Whiteout a file that doesn't exist — should be a no-op + process_entry(&mut fs, file_entry("/etc/.wh.nosuchfile"))?; + assert_files(&fs, &["/", "/etc", "/etc/hosts"])?; + + Ok(()) + } + + #[test] + fn test_whiteout_directory() -> Result<()> { + let mut fs = 
FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/usr"))?; + process_entry(&mut fs, dir_entry("/usr/local"))?; + process_entry(&mut fs, file_entry("/usr/local/bin"))?; + process_entry(&mut fs, dir_entry("/etc"))?; + assert_files(&fs, &["/", "/etc", "/usr", "/usr/local", "/usr/local/bin"])?; + + // Whiteout the directory /usr/local (removes the entire subtree) + process_entry(&mut fs, file_entry("/usr/.wh.local"))?; + assert_files(&fs, &["/", "/etc", "/usr"])?; + + Ok(()) + } + + #[test] + fn test_whiteout_in_root_directory() -> Result<()> { + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/mydir"))?; + process_entry(&mut fs, file_entry("/toplevel"))?; + assert_files(&fs, &["/", "/mydir", "/toplevel"])?; + + // Whiteout in root (no leading dir component) + process_entry(&mut fs, file_entry("/.wh.toplevel"))?; + assert_files(&fs, &["/", "/mydir"])?; + + // Also works without leading slash + process_entry(&mut fs, file_entry(".wh.mydir"))?; + assert_files(&fs, &["/"])?; + + Ok(()) + } + + #[test] + fn test_whiteout_in_nested_directory() -> Result<()> { + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/a"))?; + process_entry(&mut fs, dir_entry("/a/b"))?; + process_entry(&mut fs, dir_entry("/a/b/c"))?; + process_entry(&mut fs, file_entry("/a/b/c/deep"))?; + assert_files(&fs, &["/", "/a", "/a/b", "/a/b/c", "/a/b/c/deep"])?; + + process_entry(&mut fs, file_entry("/a/b/c/.wh.deep"))?; + assert_files(&fs, &["/", "/a", "/a/b", "/a/b/c"])?; + + Ok(()) + } + + #[test] + fn test_opaque_whiteout_clears_directory() -> Result<()> { + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/etc"))?; + process_entry(&mut fs, file_entry("/etc/hosts"))?; + process_entry(&mut fs, file_entry("/etc/passwd"))?; + process_entry(&mut fs, file_entry("/etc/resolv.conf"))?; + assert_files( + &fs, + &["/", "/etc", "/etc/hosts", 
"/etc/passwd", "/etc/resolv.conf"], + )?; + + // Opaque whiteout — clears all entries in /etc + process_entry(&mut fs, file_entry("/etc/.wh..wh..opq"))?; + assert_files(&fs, &["/", "/etc"])?; + + Ok(()) + } + + #[test] + fn test_opaque_whiteout_then_add_new_entries() -> Result<()> { + // This is a very common pattern in container images: the layer + // marks a dir opaque (hiding all lower-layer contents), then + // adds new entries in the same directory. + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/etc"))?; + process_entry(&mut fs, file_entry("/etc/old_config"))?; + process_entry(&mut fs, file_entry("/etc/another_old"))?; + assert_files(&fs, &["/", "/etc", "/etc/another_old", "/etc/old_config"])?; + + // Opaque whiteout clears everything + process_entry(&mut fs, file_entry("/etc/.wh..wh..opq"))?; + assert_files(&fs, &["/", "/etc"])?; + + // Then re-add new entries + process_entry(&mut fs, file_entry("/etc/new_config"))?; + process_entry(&mut fs, file_entry("/etc/new_other"))?; + assert_files(&fs, &["/", "/etc", "/etc/new_config", "/etc/new_other"])?; + + Ok(()) + } + + #[test] + fn test_multiple_whiteouts_in_single_layer() -> Result<()> { + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/usr"))?; + process_entry(&mut fs, file_entry("/usr/a"))?; + process_entry(&mut fs, file_entry("/usr/b"))?; + process_entry(&mut fs, file_entry("/usr/c"))?; + process_entry(&mut fs, file_entry("/usr/d"))?; + assert_files(&fs, &["/", "/usr", "/usr/a", "/usr/b", "/usr/c", "/usr/d"])?; + + // Multiple whiteouts in the same directory + process_entry(&mut fs, file_entry("/usr/.wh.a"))?; + process_entry(&mut fs, file_entry("/usr/.wh.c"))?; + assert_files(&fs, &["/", "/usr", "/usr/b", "/usr/d"])?; + + Ok(()) + } + + #[test] + fn test_double_whiteout_is_idempotent() -> Result<()> { + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/d"))?; + 
process_entry(&mut fs, file_entry("/d/target"))?; + assert_files(&fs, &["/", "/d", "/d/target"])?; + + // Whiteout the same file twice — the second is a no-op + process_entry(&mut fs, file_entry("/d/.wh.target"))?; + assert_files(&fs, &["/", "/d"])?; + + process_entry(&mut fs, file_entry("/d/.wh.target"))?; + assert_files(&fs, &["/", "/d"])?; + + Ok(()) + } + + #[test] + fn test_whiteout_unusual_name_dot_wh_dot() -> Result<()> { + // ".wh..wh." (without trailing "opq") is a whiteout for a file + // literally named ".wh." — it is NOT an opaque whiteout. + // The code checks `whiteout == b".wh..opq"` for the complete + // filename ".wh..wh..opq", so ".wh..wh." won't match. + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/d"))?; + process_entry(&mut fs, file_entry("/d/real_file"))?; + assert_files(&fs, &["/", "/d", "/d/real_file"])?; + + // ".wh..wh." is interpreted as a whiteout for the file named ".wh." + // (strip ".wh." prefix → ".wh." remainder). Since no file named ".wh." + // exists, it's a no-op. Crucially, it is NOT treated as an opaque + // whiteout — those require the exact name ".wh..wh..opq". + process_entry(&mut fs, file_entry("/d/.wh..wh."))?; + assert_files(&fs, &["/", "/d", "/d/real_file"])?; + + // Note: a tar entry named ".wh." is consumed as a whiteout for "" (empty + // name), which is effectively a no-op — the file is never stored. 
+ process_entry(&mut fs, file_entry("/d/.wh."))?; + assert_files(&fs, &["/", "/d", "/d/real_file"])?; + + Ok(()) + } + + #[test] + fn test_whiteout_across_multiple_directories() -> Result<()> { + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/a"))?; + process_entry(&mut fs, dir_entry("/b"))?; + process_entry(&mut fs, file_entry("/a/file1"))?; + process_entry(&mut fs, file_entry("/a/file2"))?; + process_entry(&mut fs, file_entry("/b/file1"))?; + process_entry(&mut fs, file_entry("/b/file2"))?; + assert_files( + &fs, + &[ + "/", "/a", "/a/file1", "/a/file2", "/b", "/b/file1", "/b/file2", + ], + )?; + + // Whiteout file1 in /a and file2 in /b independently + process_entry(&mut fs, file_entry("/a/.wh.file1"))?; + process_entry(&mut fs, file_entry("/b/.wh.file2"))?; + assert_files(&fs, &["/", "/a", "/a/file2", "/b", "/b/file1"])?; + + Ok(()) + } + + #[test] + fn test_opaque_whiteout_with_subdirectories() -> Result<()> { + // Opaque whiteout should clear subdirectories too + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/parent"))?; + process_entry(&mut fs, dir_entry("/parent/child"))?; + process_entry(&mut fs, file_entry("/parent/child/deep"))?; + process_entry(&mut fs, file_entry("/parent/sibling"))?; + assert_files( + &fs, + &[ + "/", + "/parent", + "/parent/child", + "/parent/child/deep", + "/parent/sibling", + ], + )?; + + process_entry(&mut fs, file_entry("/parent/.wh..wh..opq"))?; + assert_files(&fs, &["/", "/parent"])?; + + Ok(()) + } + + #[test] + fn test_whiteout_then_recreate() -> Result<()> { + // Delete a file with whiteout, then re-add it in the same layer + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/etc"))?; + process_entry(&mut fs, file_entry("/etc/config"))?; + assert_files(&fs, &["/", "/etc", "/etc/config"])?; + + // Whiteout and then re-add + process_entry(&mut fs, file_entry("/etc/.wh.config"))?; + 
assert_files(&fs, &["/", "/etc"])?; + + process_entry(&mut fs, file_entry("/etc/config"))?; + assert_files(&fs, &["/", "/etc", "/etc/config"])?; + + Ok(()) + } } diff --git a/crates/composefs-oci/src/lib.rs b/crates/composefs-oci/src/lib.rs index 4dfaa98d..8d59b35c 100644 --- a/crates/composefs-oci/src/lib.rs +++ b/crates/composefs-oci/src/lib.rs @@ -8,9 +8,13 @@ //! - Pulling container images from registries using skopeo //! - Converting OCI image layers from tar format to composefs split streams //! - Creating mountable filesystems from OCI image configurations +//! - Importing from containers-storage with zero-copy reflinks (optional feature) + #![forbid(unsafe_code)] pub mod boot; +#[cfg(feature = "containers-storage")] +pub mod cstor; pub mod image; pub mod oci_image; pub mod skopeo; @@ -67,27 +71,55 @@ pub use skopeo::pull_image; /// Statistics from an image import operation. #[derive(Debug, Clone, Default)] pub struct ImportStats { - /// Number of objects stored via copy. + /// Number of layers in the image. + pub layers: u64, + /// Number of layers that were already present (skipped). + pub layers_already_present: u64, + /// Number of objects stored via regular copy. pub objects_copied: u64, + /// Number of objects stored via reflink (zero-copy). + pub objects_reflinked: u64, + /// Number of objects stored via hardlink (zero-copy). + pub objects_hardlinked: u64, /// Number of objects that already existed (deduplicated). pub objects_already_present: u64, - /// Total bytes stored as new objects. + /// Total bytes stored via regular copy. pub bytes_copied: u64, + /// Total bytes stored via reflink. + pub bytes_reflinked: u64, + /// Total bytes stored via hardlink. + pub bytes_hardlinked: u64, /// Total bytes inlined in splitstreams (small files + headers). pub bytes_inlined: u64, } impl ImportStats { + /// Total number of new objects stored (copied + reflinked + hardlinked). 
+ pub fn new_objects(&self) -> u64 { + self.objects_copied + self.objects_reflinked + self.objects_hardlinked + } + /// Total number of objects processed (new + already present). pub fn total_objects(&self) -> u64 { - self.objects_copied + self.objects_already_present + self.new_objects() + self.objects_already_present + } + + /// Total bytes stored as new objects (copied + reflinked + hardlinked). + pub fn new_bytes(&self) -> u64 { + self.bytes_copied + self.bytes_reflinked + self.bytes_hardlinked } /// Merge another `ImportStats` into this one. pub fn merge(&mut self, other: &ImportStats) { + self.layers += other.layers; + self.layers_already_present += other.layers_already_present; self.objects_copied += other.objects_copied; + self.objects_reflinked += other.objects_reflinked; + self.objects_hardlinked += other.objects_hardlinked; self.objects_already_present += other.objects_already_present; self.bytes_copied += other.bytes_copied; + self.bytes_reflinked += other.bytes_reflinked; + self.bytes_hardlinked += other.bytes_hardlinked; self.bytes_inlined += other.bytes_inlined; } @@ -103,6 +135,14 @@ impl ImportStats { stats.objects_copied += 1; stats.bytes_copied += size; } + ObjectStoreMethod::Reflinked => { + stats.objects_reflinked += 1; + stats.bytes_reflinked += size; + } + ObjectStoreMethod::Hardlinked => { + stats.objects_hardlinked += 1; + stats.bytes_hardlinked += size; + } ObjectStoreMethod::AlreadyPresent => { stats.objects_already_present += 1; } @@ -114,20 +154,105 @@ impl ImportStats { impl std::fmt::Display for ImportStats { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{} new + {} already present objects; {} stored, {} inlined", - self.objects_copied, - self.objects_already_present, - indicatif::HumanBytes(self.bytes_copied), - indicatif::HumanBytes(self.bytes_inlined), - ) + let has_zerocopy = self.objects_reflinked > 0 || self.objects_hardlinked > 0; + if has_zerocopy { + // Show detailed breakdown when 
zero-copy methods were used + let mut parts = Vec::new(); + if self.objects_reflinked > 0 { + parts.push(format!("{} reflinked", self.objects_reflinked)); + } + if self.objects_hardlinked > 0 { + parts.push(format!("{} hardlinked", self.objects_hardlinked)); + } + parts.push(format!("{} copied", self.objects_copied)); + parts.push(format!("{} already present", self.objects_already_present)); + write!(f, "{} objects; ", parts.join(" + "))?; + + let mut byte_parts = Vec::new(); + if self.objects_reflinked > 0 { + byte_parts.push(format!( + "{} reflinked", + indicatif::HumanBytes(self.bytes_reflinked) + )); + } + if self.objects_hardlinked > 0 { + byte_parts.push(format!( + "{} hardlinked", + indicatif::HumanBytes(self.bytes_hardlinked) + )); + } + byte_parts.push(format!( + "{} copied", + indicatif::HumanBytes(self.bytes_copied) + )); + byte_parts.push(format!( + "{} inlined", + indicatif::HumanBytes(self.bytes_inlined) + )); + write!(f, "{}", byte_parts.join(", ")) + } else { + write!( + f, + "{} new + {} already present objects; {} stored, {} inlined", + self.objects_copied, + self.objects_already_present, + indicatif::HumanBytes(self.bytes_copied), + indicatif::HumanBytes(self.bytes_inlined), + ) + } } } +/// Controls whether and how the `containers-storage:` native import path +/// is used when pulling images. +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] +pub enum LocalFetchOpt { + /// Do not use the native containers-storage import path; fall through + /// to skopeo. + #[default] + Disabled, + /// Use native containers-storage import with reflink → hardlink → copy + /// fallback chain. + IfPossible, + /// Use native containers-storage import but error if zero-copy + /// (reflink or hardlink) is not possible. + ZeroCopy, +} + +/// Options for a [`pull`] operation. +/// +/// Use `Default::default()` for the common case (skopeo transport, no +/// containers-storage import). 
+#[derive(Debug, Default)] +pub struct PullOptions<'a> { + /// Image proxy configuration passed to skopeo (ignored for + /// `containers-storage:` references when `local_fetch` is not + /// [`Disabled`](LocalFetchOpt::Disabled)). + pub img_proxy_config: Option, + + /// Controls whether the native containers-storage import path is used. + /// See [`LocalFetchOpt`] for details. + pub local_fetch: LocalFetchOpt, + + /// Explicit containers-storage root. When set, auto-discovery is skipped + /// and only this path (plus any `additional_image_stores`) is searched. + /// Only relevant when `local_fetch` is not [`Disabled`](LocalFetchOpt::Disabled). + pub storage_root: Option<&'a std::path::Path>, + + /// Additional read-only image stores to search beyond the primary + /// (auto-discovered or explicit) store. Equivalent to the + /// `additionalimagestore=` option in containers/storage. + /// Only relevant when `local_fetch` is not [`Disabled`](LocalFetchOpt::Disabled). + pub additional_image_stores: &'a [&'a std::path::Path], +} + /// Result of a pull operation. #[derive(Debug)] pub struct PullResult { + /// The manifest digest (sha256:...). + pub manifest_digest: OciDigest, + /// The fs-verity hash of the manifest splitstream. + pub manifest_verity: ObjectID, /// The config digest (sha256:...). pub config_digest: OciDigest, /// The fs-verity hash of the config splitstream. @@ -136,7 +261,8 @@ pub struct PullResult { pub stats: ImportStats, } -type ContentAndVerity = (OciDigest, ObjectID); +/// A tuple of (content digest, fs-verity ObjectID). +pub type ContentAndVerity = (OciDigest, ObjectID); /// Parsed OCI config and its associated references. 
pub struct OpenConfig { @@ -160,11 +286,11 @@ impl std::fmt::Debug for OpenConfig { } } -fn layer_identifier(diff_id: &OciDigest) -> String { +pub(crate) fn layer_identifier(diff_id: &OciDigest) -> String { format!("oci-layer-{diff_id}") } -fn config_identifier(config: &OciDigest) -> String { +pub(crate) fn config_identifier(config: &OciDigest) -> String { format!("oci-config-{config}") } @@ -223,17 +349,51 @@ pub fn ls_layer( /// Pull the target image, and add the provided tag. If this is a mountable /// image (i.e. not an artifact), it is *not* unpacked by default. +/// +/// When the `containers-storage` feature is enabled, the image reference +/// starts with `containers-storage:`, **and** [`PullOptions::local_fetch`] +/// is not [`LocalFetchOpt::Disabled`], this uses the native cstor import path +/// which supports zero-copy reflinks/hardlinks. Otherwise, it uses skopeo. +/// +/// See [`PullOptions`] for tunable knobs (local-copy mode, extra storage +/// roots, image proxy configuration). 
pub async fn pull( repo: &Arc>, imgref: &str, reference: Option<&str>, - img_proxy_config: Option, + opts: PullOptions<'_>, ) -> Result> { - let (config_digest, config_verity, stats) = - skopeo::pull(repo, imgref, reference, img_proxy_config).await?; + #[cfg(feature = "containers-storage")] + if opts.local_fetch != LocalFetchOpt::Disabled + && let Some(image_id) = cstor::parse_containers_storage_ref(imgref) + { + let zerocopy = opts.local_fetch == LocalFetchOpt::ZeroCopy; + let (((manifest_digest, manifest_verity), (config_digest, config_verity)), stats) = + cstor::import_from_containers_storage( + repo, + image_id, + reference, + zerocopy, + opts.storage_root, + opts.additional_image_stores, + ) + .await?; + return Ok(PullResult { + manifest_digest, + manifest_verity, + config_digest, + config_verity, + stats, + }); + } + + let (result, stats) = + skopeo::pull_image(repo, imgref, reference, opts.img_proxy_config).await?; Ok(crate::PullResult { - config_digest, - config_verity, + manifest_digest: result.manifest_digest, + manifest_verity: result.manifest_verity, + config_digest: result.config_digest, + config_verity: result.config_verity, stats, }) } @@ -868,25 +1028,27 @@ mod test { "\ / 0 40755 6 0 0 0 0.0 - - - /etc 0 40755 2 0 0 0 0.0 - - - -/etc/hostname 9 100644 1 0 0 0 0.0 - testhost\\n - +/etc/hostname 9 100644 1 0 0 0 0.0 - test-host - /etc/os-release 23 100644 1 0 0 0 0.0 - ID=test\\nVERSION_ID=1.0\\n - -/etc/passwd 34 100644 1 0 0 0 0.0 - root:x:0:0:root:/root:/usr/bin/sh\\n - +/etc/passwd 100 100644 1 0 0 0 0.0 f2/c4fd5735bd46db3b18d402ae87c5086c97c0e1321901cfd30f320b73ef25aa - f2c4fd5735bd46db3b18d402ae87c5086c97c0e1321901cfd30f320b73ef25aa /tmp 0 40755 2 0 0 0 0.0 - - - /usr 0 40755 5 0 0 0 0.0 - - - /usr/bin 0 40755 2 0 0 0 0.0 - - - -/usr/bin/busybox 22 100755 1 0 0 0 0.0 - busybox-binary-content - +/usr/bin/busybox 4096 100755 1 0 0 0 0.0 f0/f7e1e58fdd31f5792222087377a4a976760c416ecdf5f426193e608681b7a1 - 
f0f7e1e58fdd31f5792222087377a4a976760c416ecdf5f426193e608681b7a1 /usr/bin/cat 7 120777 1 0 0 0 0.0 busybox - - /usr/bin/cp 7 120777 1 0 0 0 0.0 busybox - - /usr/bin/ls 7 120777 1 0 0 0 0.0 busybox - - /usr/bin/mv 7 120777 1 0 0 0 0.0 busybox - - -/usr/bin/myapp 25 100755 1 0 0 0 0.0 - #!/usr/bin/sh\\necho\\x20hello\\n - +/usr/bin/ping 7 120777 1 0 0 0 0.0 busybox - - security.capability=\\x02\\x00\\x00\\x02\\x00\\x20\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00 /usr/bin/rm 7 120777 1 0 0 0 0.0 busybox - - /usr/bin/sh 7 120777 1 0 0 0 0.0 busybox - - /usr/lib 0 40755 2 0 0 0 0.0 - - - /usr/share 0 40755 3 0 0 0 0.0 - - - -/usr/share/myapp 0 40755 2 0 0 0 0.0 - - - -/usr/share/myapp/data.txt 16 100644 1 0 0 0 0.0 - application-data - -/var 0 40755 2 0 0 0 0.0 - - - +/usr/share/doc 0 40755 2 0 0 0 0.0 - - - +/usr/share/doc/README 512 100644 1 0 0 0 0.0 51/44b8f80be57c3518f410d930e18c4e405387c82e4993c18265a1ba4a80263b - 5144b8f80be57c3518f410d930e18c4e405387c82e4993c18265a1ba4a80263b +/var 0 40755 3 0 0 0 0.0 - - - +/var/data 0 40755 2 0 0 0 0.0 - - - +/var/data/app.json 256 100644 1 0 0 0 0.0 c9/21965b74ac1780bc437cec640b27186d85317b9afdb3dbb68626aed5ecd2b6 - c921965b74ac1780bc437cec640b27186d85317b9afdb3dbb68626aed5ecd2b6 " ); } @@ -936,9 +1098,10 @@ mod test { // Untag and GC — everything gets collected oci_image::untag_image(repo, "gctest:v1").unwrap(); let gc2 = repo.gc(&[]).unwrap(); - // 10 objects: 5 layer splitstreams + config JSON + manifest JSON - // + EROFS image + new config splitstream + new manifest splitstream - assert_eq!(gc2.objects_removed, 10, "all objects collected after untag"); + // 14 objects: 5 layer splitstreams + 4 external file objects + // + config JSON + manifest JSON + EROFS image + // + new config splitstream + new manifest splitstream + assert_eq!(gc2.objects_removed, 14, "all objects collected after untag"); // 7 streams: 5 layers + 1 config + 1 manifest (tag ref removed by untag) assert_eq!(gc2.streams_pruned, 
7, "all stream symlinks pruned"); // 1 image: the EROFS symlink under images/ @@ -1072,16 +1235,77 @@ mod test { #[test] fn test_import_stats_display() { + // Copy-only stats (no reflinks) let stats = ImportStats { objects_copied: 42, objects_already_present: 100, bytes_copied: 1_500_000, bytes_inlined: 800, + ..Default::default() }; assert_eq!( stats.to_string(), "42 new + 100 already present objects; 1.43 MiB stored, 800 B inlined" ); + assert_eq!(stats.total_objects(), 142); + assert_eq!(stats.new_objects(), 42); + assert_eq!(stats.new_bytes(), 1_500_000); + + // Stats with reflinks + let reflink_stats = ImportStats { + objects_reflinked: 30, + objects_copied: 12, + objects_already_present: 100, + bytes_reflinked: 1_000_000, + bytes_copied: 500_000, + bytes_inlined: 800, + ..Default::default() + }; + assert_eq!( + reflink_stats.to_string(), + "30 reflinked + 12 copied + 100 already present objects; 976.56 KiB reflinked, 488.28 KiB copied, 800 B inlined" + ); + assert_eq!(reflink_stats.total_objects(), 142); + assert_eq!(reflink_stats.new_objects(), 42); + assert_eq!(reflink_stats.new_bytes(), 1_500_000); + + // Stats with hardlinks only + let hardlink_stats = ImportStats { + objects_hardlinked: 20, + objects_copied: 5, + objects_already_present: 50, + bytes_hardlinked: 800_000, + bytes_copied: 200_000, + bytes_inlined: 400, + ..Default::default() + }; + assert_eq!( + hardlink_stats.to_string(), + "20 hardlinked + 5 copied + 50 already present objects; 781.25 KiB hardlinked, 195.31 KiB copied, 400 B inlined" + ); + assert_eq!(hardlink_stats.total_objects(), 75); + assert_eq!(hardlink_stats.new_objects(), 25); + assert_eq!(hardlink_stats.new_bytes(), 1_000_000); + + // Stats with both reflinks and hardlinks + let mixed_stats = ImportStats { + objects_reflinked: 10, + objects_hardlinked: 15, + objects_copied: 5, + objects_already_present: 70, + bytes_reflinked: 500_000, + bytes_hardlinked: 750_000, + bytes_copied: 250_000, + bytes_inlined: 600, + 
..Default::default() + }; + assert_eq!( + mixed_stats.to_string(), + "10 reflinked + 15 hardlinked + 5 copied + 70 already present objects; 488.28 KiB reflinked, 732.42 KiB hardlinked, 244.14 KiB copied, 600 B inlined" + ); + assert_eq!(mixed_stats.total_objects(), 100); + assert_eq!(mixed_stats.new_objects(), 30); + assert_eq!(mixed_stats.new_bytes(), 1_500_000); let empty = ImportStats::default(); assert_eq!( @@ -1089,6 +1313,247 @@ mod test { "0 new + 0 already present objects; 0 B stored, 0 B inlined" ); assert_eq!(empty.total_objects(), 0); - assert_eq!(stats.total_objects(), 142); + } + + /// End-to-end test: multi-layer OCI image with nontrivial whiteout usage. + /// + /// Builds three tar layers exercising individual file whiteouts (`.wh.`) + /// and opaque directory whiteouts (`.wh..wh..opq`), imports them through the + /// full OCI pipeline (tar → splitstream → OCI config/manifest → EROFS), and + /// verifies the resulting filesystem contains exactly the expected files. + #[tokio::test] + async fn test_whiteout_multi_layer_import() { + use composefs::test::TestRepo; + use containers_image_proxy::oci_spec::image::{ + ConfigBuilder, DescriptorBuilder, ImageConfigurationBuilder, ImageManifestBuilder, + MediaType, RootFsBuilder, + }; + + // --- Tar builder helpers (local to this test) --- + + fn tar_dir(builder: &mut ::tar::Builder>, name: &str) { + let mut header = ::tar::Header::new_ustar(); + header.set_uid(0); + header.set_gid(0); + header.set_mode(0o755); + header.set_entry_type(::tar::EntryType::Directory); + header.set_size(0); + builder + .append_data(&mut header, name, std::io::empty()) + .unwrap(); + } + + fn tar_file(builder: &mut ::tar::Builder>, name: &str, content: &[u8]) { + let mut header = ::tar::Header::new_ustar(); + header.set_uid(0); + header.set_gid(0); + header.set_mode(0o644); + header.set_entry_type(::tar::EntryType::Regular); + header.set_size(content.len() as u64); + builder.append_data(&mut header, name, content).unwrap(); + } + + 
/// Zero-length regular file — used for `.wh.` and `.wh..wh..opq` entries. + fn tar_whiteout(builder: &mut ::tar::Builder>, name: &str) { + tar_file(builder, name, &[]); + } + + // --- Build the three layers --- + + // Layer 1 (base): create initial filesystem + let layer1 = { + let mut b = ::tar::Builder::new(vec![]); + tar_dir(&mut b, "etc"); + tar_file(&mut b, "etc/config.toml", b"[server]\nport = 8080\n"); + tar_file(&mut b, "etc/hosts", b"127.0.0.1 localhost\n"); + tar_dir(&mut b, "usr"); + tar_dir(&mut b, "usr/bin"); + tar_file(&mut b, "usr/bin/app", b"#!/bin/sh\necho hello\n"); + tar_dir(&mut b, "usr/lib"); + tar_file(&mut b, "usr/lib/old-lib.so", b"fake-old-lib-content"); + tar_file(&mut b, "usr/lib/shared.so", b"fake-shared-lib-content"); + tar_dir(&mut b, "tmp"); + tar_dir(&mut b, "tmp/cache"); + tar_file(&mut b, "tmp/cache/data.bin", b"cached-data-payload"); + tar_file(&mut b, "tmp/cache/index.db", b"cached-index-payload"); + b.into_inner().unwrap() + }; + + // Layer 2 (whiteout + modify): + // - delete /etc/hosts (file whiteout) + // - delete /usr/lib/old-lib.so (file whiteout) + // - add /etc/hosts.new (replacement) + // - opaque whiteout on /tmp/cache (clears data.bin + index.db) + // - add /tmp/cache/fresh.bin (re-populate after opaque) + let layer2 = { + let mut b = ::tar::Builder::new(vec![]); + tar_dir(&mut b, "etc"); + tar_whiteout(&mut b, "etc/.wh.hosts"); + tar_file(&mut b, "etc/hosts.new", b"127.0.0.1 localhost.new\n"); + tar_dir(&mut b, "usr"); + tar_dir(&mut b, "usr/lib"); + tar_whiteout(&mut b, "usr/lib/.wh.old-lib.so"); + tar_dir(&mut b, "tmp"); + tar_dir(&mut b, "tmp/cache"); + tar_whiteout(&mut b, "tmp/cache/.wh..wh..opq"); + tar_file(&mut b, "tmp/cache/fresh.bin", b"fresh-cache-content"); + b.into_inner().unwrap() + }; + + // Layer 3 (more whiteouts): + // - delete /usr/bin/app (file whiteout) + // - add /usr/bin/app-v2 (replacement) + let layer3 = { + let mut b = ::tar::Builder::new(vec![]); + tar_dir(&mut b, "usr"); + tar_dir(&mut b, 
"usr/bin"); + tar_whiteout(&mut b, "usr/bin/.wh.app"); + tar_file(&mut b, "usr/bin/app-v2", b"#!/bin/sh\necho hello v2\n"); + b.into_inner().unwrap() + }; + + // --- Import layers and build OCI image --- + + let test_repo = TestRepo::::new(); + let repo = &test_repo.repo; + + let layers_data = [&layer1[..], &layer2[..], &layer3[..]]; + let mut layer_digests = Vec::new(); + let mut layer_verities_map: HashMap, composefs::fsverity::Sha256HashValue> = + HashMap::new(); + let mut layer_descriptors = Vec::new(); + + for tar_data in &layers_data { + let digest = hash_sha256(tar_data); + let (verity, _stats) = import_layer(repo, &digest, None, *tar_data).await.unwrap(); + + let descriptor = DescriptorBuilder::default() + .media_type(MediaType::ImageLayerGzip) + .digest(digest.clone()) + .size(tar_data.len() as u64) + .build() + .unwrap(); + + layer_verities_map.insert(digest.to_string().into_boxed_str(), verity); + layer_digests.push(digest.to_string()); + layer_descriptors.push(descriptor); + } + + // Build OCI config + let rootfs = RootFsBuilder::default() + .typ("layers") + .diff_ids(layer_digests.clone()) + .build() + .unwrap(); + + let cfg = ConfigBuilder::default().build().unwrap(); + + let config = ImageConfigurationBuilder::default() + .architecture("amd64") + .os("linux") + .rootfs(rootfs) + .config(cfg) + .build() + .unwrap(); + + let config_json = config.to_string().unwrap(); + let config_digest = hash_sha256(config_json.as_bytes()); + + let mut config_stream = repo.create_stream(skopeo::OCI_CONFIG_CONTENT_TYPE).unwrap(); + for (digest, verity) in &layer_verities_map { + config_stream.add_named_stream_ref(digest, verity); + } + config_stream + .write_external(config_json.as_bytes()) + .unwrap(); + let config_verity = repo + .write_stream(config_stream, &config_identifier(&config_digest), None) + .unwrap(); + + // Build OCI manifest + let config_descriptor = DescriptorBuilder::default() + .media_type(MediaType::ImageConfig) + .digest(config_digest.clone()) + 
.size(config_json.len() as u64) + .build() + .unwrap(); + + let manifest = ImageManifestBuilder::default() + .schema_version(2u32) + .media_type(MediaType::ImageManifest) + .config(config_descriptor) + .layers(layer_descriptors) + .build() + .unwrap(); + + let manifest_json = manifest.to_string().unwrap(); + let manifest_digest = hash_sha256(manifest_json.as_bytes()); + + let (_stored_digest, manifest_verity) = oci_image::write_manifest( + repo, + &manifest, + &manifest_digest, + &config_verity, + &layer_verities_map, + Some("whiteout-test:v1"), + ) + .unwrap(); + + // --- Create the EROFS image --- + + let erofs_id = ensure_oci_composefs_erofs( + repo, + &manifest_digest, + Some(&manifest_verity), + Some("whiteout-test:v1"), + ) + .unwrap() + .expect("container image should produce EROFS"); + + // --- Verify the flattened filesystem --- + + let erofs_data = repo.read_object(&erofs_id).unwrap(); + let fs = + composefs::erofs::reader::erofs_to_filesystem::(&erofs_data).unwrap(); + let mut dump = Vec::new(); + composefs::dumpfile::write_dumpfile(&mut dump, &fs).unwrap(); + let dump = String::from_utf8(dump).unwrap(); + + // Extract just the paths from the dumpfile for structural verification + let paths: Vec<&str> = dump.lines().map(|l| l.split_once(' ').unwrap().0).collect(); + + // Files that SHOULD exist after all three layers + let expected_present = [ + "/", + "/etc", + "/etc/config.toml", // layer 1, survived + "/etc/hosts.new", // layer 2 addition + "/tmp", + "/tmp/cache", + "/tmp/cache/fresh.bin", // layer 2, after opaque whiteout + "/usr", + "/usr/bin", + "/usr/bin/app-v2", // layer 3 replacement + "/usr/lib", + "/usr/lib/shared.so", // layer 1, survived + ]; + + // Files that MUST NOT exist (removed by whiteouts) + let must_not_exist = [ + "/etc/hosts", // deleted by layer 2 file whiteout + "/usr/lib/old-lib.so", // deleted by layer 2 file whiteout + "/usr/bin/app", // deleted by layer 3 file whiteout + "/tmp/cache/data.bin", // cleared by layer 2 opaque 
whiteout + "/tmp/cache/index.db", // cleared by layer 2 opaque whiteout + ]; + + similar_asserts::assert_eq!(paths, expected_present); + + for path in &must_not_exist { + assert!( + !paths.contains(path), + "{path} should have been removed by whiteout but is still present" + ); + } } } diff --git a/crates/composefs-oci/src/oci_image.rs b/crates/composefs-oci/src/oci_image.rs index 3c2c5b9e..da624254 100644 --- a/crates/composefs-oci/src/oci_image.rs +++ b/crates/composefs-oci/src/oci_image.rs @@ -574,7 +574,7 @@ pub fn list_images( }); } Err(e) => { - eprintln!("Warning: skipping image {name}: {e:#}"); + tracing::warn!("skipping image {name}: {e:#}"); continue; } } diff --git a/crates/composefs-oci/src/skopeo.rs b/crates/composefs-oci/src/skopeo.rs index 8827f453..977f0a60 100644 --- a/crates/composefs-oci/src/skopeo.rs +++ b/crates/composefs-oci/src/skopeo.rs @@ -242,6 +242,14 @@ impl ImageOp { stats.objects_copied += 1; stats.bytes_copied += size; } + ObjectStoreMethod::Reflinked => { + stats.objects_reflinked += 1; + stats.bytes_reflinked += size; + } + ObjectStoreMethod::Hardlinked => { + stats.objects_hardlinked += 1; + stats.bytes_hardlinked += size; + } ObjectStoreMethod::AlreadyPresent => { stats.objects_already_present += 1; } diff --git a/crates/composefs-oci/src/test_util.rs b/crates/composefs-oci/src/test_util.rs index 9953350e..8ff04b13 100644 --- a/crates/composefs-oci/src/test_util.rs +++ b/crates/composefs-oci/src/test_util.rs @@ -18,7 +18,6 @@ /// /usr/bin/sh 0 120777 1 0 0 0 0.0 busybox - - /// ``` use std::collections::HashMap; -use std::io::Read as _; use std::sync::Arc; use crate::oci_image::write_manifest; @@ -41,12 +40,50 @@ fn hash(bytes: &[u8]) -> OciDigest { .unwrap() } +/// Write a PAX extended header entry followed by the real entry to a +/// [`tar::Builder`]. +/// +/// The `xattrs` are encoded as `SCHILY.xattr.=` PAX records, +/// which is the de-facto standard used by GNU tar, BSD tar, and all +/// container runtimes. 
+fn append_with_xattrs( + builder: &mut ::tar::Builder, + header: &mut ::tar::Header, + path: &str, + data: &[u8], + xattrs: &[(String, Vec)], +) { + // Build the PAX extended header payload using tar_core's PaxBuilder. + let mut pax = tar_core::builder::PaxBuilder::new(); + for (key, value) in xattrs { + pax.add(&format!("SCHILY.xattr.{key}"), value); + } + let pax_data = pax.finish(); + + // Write the PAX header entry (type 'x'). + let mut pax_header = ::tar::Header::new_ustar(); + pax_header.set_entry_type(::tar::EntryType::XHeader); + pax_header.set_size(pax_data.len() as u64); + pax_header.set_mode(0o644); + let pax_path = format!("PaxHeader/{path}"); + builder + .append_data(&mut pax_header, &pax_path, &pax_data[..]) + .unwrap(); + + // Write the actual entry immediately after (same archive stream). + builder.append_data(header, path, data).unwrap(); +} + /// Convert composefs dumpfile lines into tar bytes. /// /// Parses each line as a composefs [`Entry`] and builds the corresponding /// tar entry. The root directory (`/`) is skipped since tar archives don't -/// include it. Only regular files (inline), directories, and symlinks are -/// supported — this is sufficient for test images. +/// include it. Regular files (inline and external), directories, and +/// symlinks are supported. Any xattrs present (e.g. `security.capability`) +/// are emitted as PAX extended headers. +/// +/// External files (`Item::Regular` with no inline content) get +/// deterministic pseudo-random data from a seeded RNG (keyed on file size). fn dumpfile_to_tar(dumpfile: &str) -> Vec { let mut builder = ::tar::Builder::new(vec![]); @@ -71,6 +108,20 @@ fn dumpfile_to_tar(dumpfile: &str) -> Vec { .expect("non-UTF8 path") .trim_start_matches('/'); + // Collect xattrs for PAX headers. 
+ let xattrs: Vec<(String, Vec)> = entry + .xattrs + .iter() + .map(|x| { + let key = x + .key + .to_str() + .unwrap_or_else(|| panic!("non-UTF8 xattr key: {:?}", x.key)); + (key.to_owned(), x.value.to_vec()) + }) + .collect(); + let has_xattrs = !xattrs.is_empty(); + let ty = FileType::from_raw_mode(entry.mode); match ty { FileType::Directory => { @@ -80,36 +131,40 @@ fn dumpfile_to_tar(dumpfile: &str) -> Vec { header.set_mode(entry.mode & 0o7777); header.set_entry_type(::tar::EntryType::Directory); header.set_size(0); - builder - .append_data(&mut header, path, std::io::empty()) - .unwrap(); - } - FileType::RegularFile => match &entry.item { - Item::RegularInline { content, .. } => { - let mut header = ::tar::Header::new_ustar(); - header.set_uid(entry.uid.into()); - header.set_gid(entry.gid.into()); - header.set_mode(entry.mode & 0o7777); - header.set_entry_type(::tar::EntryType::Regular); - header.set_size(content.len() as u64); + if has_xattrs { + append_with_xattrs(&mut builder, &mut header, path, &[], &xattrs); + } else { builder - .append_data(&mut header, path, &content[..]) + .append_data(&mut header, path, std::io::empty()) .unwrap(); } - Item::Regular { size, .. } => { - // External file with no inline content — create sized entry - let mut header = ::tar::Header::new_ustar(); - header.set_uid(entry.uid.into()); - header.set_gid(entry.gid.into()); - header.set_mode(entry.mode & 0o7777); - header.set_entry_type(::tar::EntryType::Regular); - header.set_size(*size); + } + FileType::RegularFile => { + let content: Vec = match &entry.item { + Item::RegularInline { content, .. } => content.to_vec(), + Item::Regular { size, .. 
} => { + use rand::{RngExt, SeedableRng, rngs::SmallRng}; + let mut rng = SmallRng::seed_from_u64(*size); + let mut buf = vec![0u8; *size as usize]; + rng.fill(&mut buf[..]); + buf + } + other => panic!("unexpected regular file item variant: {other:?}"), + }; + let mut header = ::tar::Header::new_ustar(); + header.set_uid(entry.uid.into()); + header.set_gid(entry.gid.into()); + header.set_mode(entry.mode & 0o7777); + header.set_entry_type(::tar::EntryType::Regular); + header.set_size(content.len() as u64); + if has_xattrs { + append_with_xattrs(&mut builder, &mut header, path, &content, &xattrs); + } else { builder - .append_data(&mut header, path, std::io::repeat(0u8).take(*size)) + .append_data(&mut header, path, &content[..]) .unwrap(); } - other => panic!("unexpected regular file item variant: {other:?}"), - }, + } FileType::Symlink => { let target = match &entry.item { Item::Symlink { target, .. } => target, @@ -124,9 +179,13 @@ fn dumpfile_to_tar(dumpfile: &str) -> Vec { header .set_link_name(target.as_ref()) .expect("failed to set symlink target"); - builder - .append_data(&mut header, path, std::io::empty()) - .unwrap(); + if has_xattrs { + append_with_xattrs(&mut builder, &mut header, path, &[], &xattrs); + } else { + builder + .append_data(&mut header, path, std::io::empty()) + .unwrap(); + } } other => panic!("unsupported file type in test dumpfile: {other:?}"), } @@ -252,12 +311,16 @@ async fn create_multi_layer_image( // --------------------------------------------------------------------------- // Layer definitions in composefs dumpfile format // -// Format: /path size mode nlink uid gid rdev mtime payload content digest +// Format: /path size mode nlink uid gid rdev mtime payload content digest [xattr=val ...] 
// // Directories: /path 0 40755 2 0 0 0 0.0 - - - // Inline files: /path 100644 1 0 0 0 0.0 - - +// External: /path 100644 1 0 0 0 0.0 / - - (data is auto-generated) // Executables: /path 100755 1 0 0 0 0.0 - - // Symlinks: /path 120777 1 0 0 0 0.0 - - +// +// Xattrs are appended as space-separated key=value pairs after the digest +// (e.g. security.capability for file capabilities). // --------------------------------------------------------------------------- const LAYER_ROOT_STRUCTURE: &str = "\ @@ -271,14 +334,18 @@ const LAYER_ROOT_STRUCTURE: &str = "\ /tmp 0 40755 2 0 0 0 0.0 - - - "; +/// Busybox layer with a 4 KiB binary (external, > INLINE_CONTENT_MAX_V0). const LAYER_BUSYBOX: &str = "\ / 0 40755 2 0 0 0 0.0 - - - /usr 0 40755 2 0 0 0 0.0 - - - /usr/bin 0 40755 2 0 0 0 0.0 - - - -/usr/bin/busybox 22 100755 1 0 0 0 0.0 - busybox-binary-content - +/usr/bin/busybox 4096 100755 1 0 0 0 0.0 / - - /usr/bin/sh 7 120777 1 0 0 0 0.0 busybox - - "; +/// Core utils layer. `/usr/bin/ping` carries a `security.capability` xattr +/// granting CAP_NET_RAW (VFS_CAP_REVISION_2), which is the realistic pattern +/// seen in real container images. const LAYER_CORE_UTILS: &str = "\ / 0 40755 2 0 0 0 0.0 - - - /usr 0 40755 2 0 0 0 0.0 - - - @@ -287,25 +354,29 @@ const LAYER_CORE_UTILS: &str = "\ /usr/bin/cat 7 120777 1 0 0 0 0.0 busybox - - /usr/bin/cp 7 120777 1 0 0 0 0.0 busybox - - /usr/bin/mv 7 120777 1 0 0 0 0.0 busybox - - +/usr/bin/ping 7 120777 1 0 0 0 0.0 busybox - - security.capability=\\x02\\x00\\x00\\x02\\x00\\x20\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00 /usr/bin/rm 7 120777 1 0 0 0 0.0 busybox - - "; +/// Config layer with /etc/passwd as a 100-byte external file. 
const LAYER_CONFIG: &str = "\ / 0 40755 2 0 0 0 0.0 - - - /etc 0 40755 2 0 0 0 0.0 - - - /etc/os-release 26 100644 1 0 0 0 0.0 - ID=test\\nVERSION_ID=1.0\\n - -/etc/hostname 9 100644 1 0 0 0 0.0 - testhost\\n - -/etc/passwd 36 100644 1 0 0 0 0.0 - root:x:0:0:root:/root:/usr/bin/sh\\n - +/etc/hostname 9 100644 1 0 0 0 0.0 - test-host - +/etc/passwd 100 100644 1 0 0 0 0.0 / - - "; +/// App layer with a 512-byte README and 256-byte JSON (both external). const LAYER_APP: &str = "\ / 0 40755 2 0 0 0 0.0 - - - /usr 0 40755 2 0 0 0 0.0 - - - /usr/share 0 40755 2 0 0 0 0.0 - - - -/usr/share/myapp 0 40755 2 0 0 0 0.0 - - - -/usr/share/myapp/data.txt 16 100644 1 0 0 0 0.0 - application-data - -/usr/bin 0 40755 2 0 0 0 0.0 - - - -/usr/bin/myapp 26 100755 1 0 0 0 0.0 - #!/usr/bin/sh\\necho\\x20hello\\n - +/usr/share/doc 0 40755 2 0 0 0 0.0 - - - +/usr/share/doc/README 512 100644 1 0 0 0 0.0 / - - +/var 0 40755 2 0 0 0 0.0 - - - +/var/data 0 40755 2 0 0 0 0.0 - - - +/var/data/app.json 256 100644 1 0 0 0 0.0 / - - "; const LAYER_BOOT_DIRS: &str = "\ @@ -420,16 +491,16 @@ const LAYER_LIBS_1: &str = "\ / 0 40755 2 0 0 0 0.0 - - - /usr 0 40755 2 0 0 0 0.0 - - - /usr/lib 0 40755 2 0 0 0 0.0 - - - -/usr/lib/libc.so.6 16 100644 1 0 0 0 0.0 - fake-libc-content - -/usr/lib/libm.so.6 16 100644 1 0 0 0 0.0 - fake-libm-content - +/usr/lib/libc.so.6 8192 100644 1 0 0 0 0.0 / - - +/usr/lib/libm.so.6 4096 100644 1 0 0 0 0.0 / - - "; const LAYER_LIBS_2: &str = "\ / 0 40755 2 0 0 0 0.0 - - - /usr 0 40755 2 0 0 0 0.0 - - - /usr/lib 0 40755 2 0 0 0 0.0 - - - -/usr/lib/libpthread.so.0 22 100644 1 0 0 0 0.0 - fake-libpthread-content - -/usr/lib/libdl.so.2 16 100644 1 0 0 0 0.0 - fake-libdl-content - +/usr/lib/libpthread.so.0 4096 100644 1 0 0 0 0.0 / - - +/usr/lib/libdl.so.2 2048 100644 1 0 0 0 0.0 / - - "; const LAYER_LOCALE: &str = "\ diff --git a/crates/composefs/Cargo.toml b/crates/composefs/Cargo.toml index 0e732372..d8c78357 100644 --- a/crates/composefs/Cargo.toml +++ 
b/crates/composefs/Cargo.toml @@ -23,7 +23,7 @@ fn-error-context = "0.2" hex = { version = "0.4.0", default-features = false, features = ["std"] } log = { version = "0.4.8", default-features = false } once_cell = { version = "1.21.3", default-features = false, features = ["std"] } -rustix = { version = "1.0.0", default-features = false, features = ["fs", "mount", "process", "std"] } +rustix = { version = "1.0.0", default-features = false, features = ["fs", "mount", "process", "std", "thread"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } sha2 = { version = "0.11.0", default-features = false } thiserror = { version = "2.0.0", default-features = false } diff --git a/crates/composefs/src/repository.rs b/crates/composefs/src/repository.rs index e402a82c..9bd84c9a 100644 --- a/crates/composefs/src/repository.rs +++ b/crates/composefs/src/repository.rs @@ -523,16 +523,57 @@ pub(crate) fn write_repo_metadata( /// How an object was stored in the repository. /// -/// Returned by [`Repository::ensure_object_from_file_with_stats`] to indicate -/// whether the operation used a regular copy or found an existing object. +/// Returned by [`Repository::ensure_object_from_file`] to indicate +/// whether the operation used zero-copy reflinks, a regular copy, or found +/// an existing object. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ObjectStoreMethod { - /// Object was stored via regular file copy. + /// Object was stored via reflink (zero-copy, FICLONE ioctl). + Reflinked, + /// Object was stored via hardlink (zero-copy, source file linked directly). + Hardlinked, + /// Object was stored via regular file copy (reflink not supported). Copied, /// Object already existed in the repository (deduplicated). AlreadyPresent, } +/// Per-operation context for [`Repository::ensure_object_from_file`]. +/// +/// Create one of these at the start of a bulk import operation (e.g. 
importing +/// all layers of a container image) and pass it to every +/// `ensure_object_from_file` call. The context caches which +/// `(source_device, dest_device)` pairs do not support reflinks, so that +/// after the first `EOPNOTSUPP` / `EXDEV` on a given pair, subsequent +/// calls skip the FICLONE probe entirely. +/// +/// This correctly handles multi-store imports where layers may come from +/// different filesystems: a reflink failure on ext4→xfs does not suppress +/// reflink attempts for a later xfs→xfs pair. +#[derive(Debug, Default)] +pub struct ImportContext { + /// Device-ID pairs where FICLONE has already failed. Stored as + /// `(source_dev, dest_dev)` from `fstat().st_dev`. + reflink_unsupported_devs: Vec<(u64, u64)>, +} + +impl ImportContext { + /// Check whether reflinks are known to be unsupported for this + /// source→destination device pair. + pub(crate) fn is_reflink_unsupported(&self, src_dev: u64, dst_dev: u64) -> bool { + self.reflink_unsupported_devs + .iter() + .any(|&(s, d)| s == src_dev && d == dst_dev) + } + + /// Record that reflinks are unsupported for this device pair. + pub(crate) fn mark_reflink_unsupported(&mut self, src_dev: u64, dst_dev: u64) { + if !self.is_reflink_unsupported(src_dev, dst_dev) { + self.reflink_unsupported_devs.push((src_dev, dst_dev)); + } + } +} + /// Call openat() on the named subdirectory of "dirfd", possibly creating it first. /// /// We assume that the directory will probably exist (ie: we try the open first), and on ENOENT, we @@ -558,6 +599,17 @@ fn ensure_dir_and_openat(dirfd: impl AsFd, filename: &str, flags: OFlags) -> Err } } +/// Create a directory under `dirfd` if it doesn't already exist. +/// +/// Returns `Ok(())` on success or if the directory already exists. +/// Propagates all other errors from `mkdirat`. 
+fn ensure_dir_at(dirfd: impl AsFd, path: &str, mode: Mode) -> ErrnoResult<()> { + match mkdirat(dirfd, path, mode) { + Ok(()) | Err(Errno::EXIST) => Ok(()), + Err(e) => Err(e), + } +} + /// A zero-sized proof token confirming that a [`Repository`] is writable. /// /// Obtained by calling [`Repository::ensure_writable`], which performs a fast @@ -1133,6 +1185,215 @@ impl Repository { Ok(fd) } + /// Ensure an object exists by reflinking or hardlinking from a source file. + /// + /// The fallback chain is: reflink -> hardlink -> copy. + /// + /// - **Reflink** (FICLONE): zero-copy clone on btrfs/XFS. Uses a tmpfile. + /// - **Hardlink**: enables fs-verity on the source file in-place, then + /// hardlinks it directly into the objects directory. This avoids all data + /// copying on filesystems like ext4 that don't support reflinks. + /// - **Copy**: regular data copy into a tmpfile as last resort. + /// + /// The `ctx` argument accumulates knowledge across calls in the same + /// import operation. After the first reflink attempt fails with + /// `EOPNOTSUPP` / `EXDEV`, the context records this so that subsequent + /// calls skip straight to the hardlink path. + /// + /// This is particularly useful for importing from containers-storage where + /// we already have the file on disk and want to avoid copying data. + pub fn ensure_object_from_file( + &self, + src: &std::fs::File, + size: u64, + ctx: &mut ImportContext, + ) -> Result<(ObjectID, ObjectStoreMethod)> { + self.ensure_object_from_file_inner(src, size, true, ctx) + } + + /// Like [`ensure_object_from_file`](Self::ensure_object_from_file) but + /// errors if neither reflink nor hardlink succeeds, instead of falling back + /// to a regular copy. + /// + /// Intended for bootc's unified storage path where the composefs repo and + /// containers-storage are always on the same filesystem, so zero-copy + /// should always be possible. 
+ pub fn ensure_object_from_file_zerocopy( + &self, + src: &std::fs::File, + size: u64, + ctx: &mut ImportContext, + ) -> Result<(ObjectID, ObjectStoreMethod)> { + self.ensure_object_from_file_inner(src, size, false, ctx) + } + + /// Inner implementation for [`ensure_object_from_file`](Self::ensure_object_from_file) and + /// [`ensure_object_from_file_zerocopy`](Self::ensure_object_from_file_zerocopy). + /// + /// When `allow_copy` is false, the copy fallback returns an error instead. + fn ensure_object_from_file_inner( + &self, + src: &std::fs::File, + size: u64, + allow_copy: bool, + ctx: &mut ImportContext, + ) -> Result<(ObjectID, ObjectStoreMethod)> { + use rustix::fs::{fstat, ioctl_ficlone}; + + let writable = self.ensure_writable_token()?; + + // Determine the source and destination device IDs so we can look up + // whether this particular filesystem pair supports reflinks. + let src_dev = fstat(src)?.st_dev; + let dst_dev = fstat(self.objects_dir()?)?.st_dev; + + // Try reflink first, unless a previous call on this device pair + // already discovered that FICLONE is unsupported. + if !ctx.is_reflink_unsupported(src_dev, dst_dev) { + let tmpfile_fd = self.create_object_tmpfile_impl(&writable)?; + let tmpfile = File::from(tmpfile_fd); + + match ioctl_ficlone(&tmpfile, src) { + Ok(()) => { + // Reflink succeeded — verify size matches + let stat = fstat(&tmpfile)?; + anyhow::ensure!( + stat.st_size as u64 == size, + "Reflink size mismatch: expected {}, got {}", + size, + stat.st_size + ); + + let (object_id, method) = self.finalize_object_tmpfile(tmpfile, size)?; + let method = match method { + ObjectStoreMethod::Copied => ObjectStoreMethod::Reflinked, + other => other, + }; + return Ok((object_id, method)); + } + Err(Errno::OPNOTSUPP | Errno::XDEV) => { + // Record for this device pair so subsequent calls skip. 
+ ctx.mark_reflink_unsupported(src_dev, dst_dev); + drop(tmpfile); + } + Err(e) => { + return Err(e).context("Reflinking source file to objects directory")?; + } + } + } + + // Try hardlink: enable verity on the source in-place, then link it + // directly into objects/. This avoids all data copying. + match self.try_hardlink_object(src, size) { + Ok(result) => return Ok(result), + Err(_) if allow_copy => { + // Hardlink failed, fall through to copy. + // Common causes: cross-mount (overlay bind mount), EPERM, + // or verity enablement failure. + } + Err(e) => { + return Err(e).context( + "reflink and hardlink both failed; copy fallback is disabled (zerocopy mode)", + ); + } + } + + // Final fallback: copy data into a new tmpfile. + let tmpfile_fd = self.create_object_tmpfile_impl(&writable)?; + let mut tmpfile = File::from(tmpfile_fd); + { + use std::io::{Seek, SeekFrom}; + let mut src_clone = src.try_clone()?; + src_clone.seek(SeekFrom::Start(0))?; + std::io::copy(&mut src_clone, &mut tmpfile)?; + } + + let (object_id, method) = self.finalize_object_tmpfile(tmpfile, size)?; + Ok((object_id, method)) + } + + /// Try to hardlink a source file directly into the objects directory. + /// + /// Enables fs-verity on the source file in-place, measures the digest to + /// determine the object ID, then hardlinks the source into `objects/`. + /// + /// Returns an error if verity cannot be enabled, the digest cannot be + /// measured, or the hardlink fails (e.g. cross-device). + fn try_hardlink_object( + &self, + src: &std::fs::File, + size: u64, + ) -> Result<(ObjectID, ObjectStoreMethod)> { + use crate::fsverity::enable_verity_with_retry; + use rustix::thread::{CapabilitySet, capabilities}; + + // AT_EMPTY_PATH linkat requires CAP_DAC_READ_SEARCH. Check upfront + // so callers get a clear error instead of a confusing ENOENT. 
+ let has_cap = capabilities(None) + .map(|caps| caps.effective.contains(CapabilitySet::DAC_READ_SEARCH)) + .unwrap_or(false); + if !has_cap { + anyhow::bail!( + "hardlinking objects requires CAP_DAC_READ_SEARCH \ + (run as root or use the copy fallback)" + ); + } + + let objects_dir = self.objects_dir()?; + + // Enable fs-verity on the source file in-place. + // This is safe because the caller (bootc/containers-storage) owns the + // source files and they are immutable image data. + // AlreadyEnabled is fine — the file was already verity-protected. + let verity_enabled = match enable_verity_with_retry::(src) { + Ok(()) => true, + Err(EnableVerityError::AlreadyEnabled) => true, + Err(EnableVerityError::FilesystemNotSupported) if self.insecure => false, + Err(e) => { + return Err(e).context("enabling verity on source file for hardlink")?; + } + }; + + // Get the object ID from the verity digest (kernel-measured or userspace-computed) + let id: ObjectID = if verity_enabled { + measure_verity(src).context("measuring verity digest on source file")? + } else { + // Insecure mode on a filesystem without verity: compute digest in userspace + let mut reader = std::io::BufReader::new( + src.try_clone() + .context("cloning fd for digest computation")?, + ); + Self::compute_verity_digest(&mut reader) + .context("computing verity digest in insecure mode")? + }; + + // Check if object already exists (dedup) + let path = id.to_object_pathname(); + match statat(objects_dir, &path, AtFlags::empty()) { + Ok(stat) if stat.st_size as u64 == size => { + return Ok((id, ObjectStoreMethod::AlreadyPresent)); + } + _ => {} + } + + // Ensure parent directory exists (e.g. objects/4e/) + let parent_dir = id.to_object_dir(); + ensure_dir_at(objects_dir, &parent_dir, Mode::from_raw_mode(0o755)) + .context("creating object parent directory")?; + + // Hardlink the source file directly into objects/. 
+ // Use AT_EMPTY_PATH to link by fd, which avoids the kernel's + // may_linkat() restriction that rejects AT_SYMLINK_FOLLOW on + // /proc/self/fd/ magic symlinks for non-root mounts. + // AT_EMPTY_PATH requires CAP_DAC_READ_SEARCH, which is available + // when running as root (the expected case for containers-storage). + match linkat(src, "", objects_dir, &path, AtFlags::EMPTY_PATH) { + Ok(()) => Ok((id, ObjectStoreMethod::Hardlinked)), + Err(Errno::EXIST) => Ok((id, ObjectStoreMethod::AlreadyPresent)), + Err(e) => Err(e).context("hardlinking source file into objects directory")?, + } + } + /// Finalize a tmpfile as an object. /// /// This method should be called from a blocking context (e.g., `spawn_blocking`) @@ -1222,7 +1483,8 @@ impl Repository { } let parent_dir = id.to_object_dir(); - let _ = mkdirat(objects_dir, &parent_dir, Mode::from_raw_mode(0o755)); + ensure_dir_at(objects_dir, &parent_dir, Mode::from_raw_mode(0o755)) + .context("creating object parent directory")?; match linkat( CWD, @@ -3454,7 +3716,7 @@ mod tests { Ok(()) } - use crate::tree::{FileSystem, Inode, Leaf, LeafContent, RegularFile, Stat}; + use crate::tree::{FileSystem, Inode, LeafContent, RegularFile, Stat}; /// Create a default root stat for test filesystems fn test_root_stat() -> Stat { @@ -3717,6 +3979,39 @@ mod tests { Ok(()) } + #[test] + fn test_ensure_object_from_file() -> Result<()> { + use std::io::{Seek, SeekFrom, Write}; + + let tmp = tempdir(); + let repo = create_test_repo(&tmp.path().join("repo"))?; + let mut ctx = ImportContext::default(); + + let test_data = generate_test_data(64 * 1024, 0xBE); + let mut temp_file = crate::test::tempfile(); + temp_file.write_all(&test_data)?; + temp_file.seek(SeekFrom::Start(0))?; + + // First store should return Copied or Reflinked (depending on fs) + let (object_id, method) = + repo.ensure_object_from_file(&temp_file, test_data.len() as u64, &mut ctx)?; + assert_ne!(method, ObjectStoreMethod::AlreadyPresent); + 
assert!(test_object_exists(&tmp, &object_id)?); + + // Read back and verify contents match + let stored_data = repo.read_object(&object_id)?; + assert_eq!(stored_data, test_data); + + // Second store of same data should return AlreadyPresent + temp_file.seek(SeekFrom::Start(0))?; + let (object_id_2, method_2) = + repo.ensure_object_from_file(&temp_file, test_data.len() as u64, &mut ctx)?; + assert_eq!(object_id, object_id_2); + assert_eq!(method_2, ObjectStoreMethod::AlreadyPresent); + + Ok(()) + } + // ==================== Fsck Tests ==================== #[tokio::test] @@ -4451,4 +4746,28 @@ mod tests { FeatureCheck::ReadWrite ); } + + #[test] + fn test_object_store_method_variants() { + // Verify all variants exist and are distinct + let methods = [ + ObjectStoreMethod::Reflinked, + ObjectStoreMethod::Hardlinked, + ObjectStoreMethod::Copied, + ObjectStoreMethod::AlreadyPresent, + ]; + + for (i, a) in methods.iter().enumerate() { + for (j, b) in methods.iter().enumerate() { + if i == j { + assert_eq!(a, b); + } else { + assert_ne!(a, b); + } + } + } + + // Verify Debug impl works + assert_eq!(format!("{:?}", ObjectStoreMethod::Hardlinked), "Hardlinked"); + } } diff --git a/crates/composefs/src/splitstream.rs b/crates/composefs/src/splitstream.rs index ea4c6d1a..19d3b57f 100644 --- a/crates/composefs/src/splitstream.rs +++ b/crates/composefs/src/splitstream.rs @@ -969,7 +969,7 @@ mod tests { use super::*; use crate::fsverity::{Sha256HashValue, compute_verity}; use crate::test::tempdir; - use rustix::fs::{CWD, Mode, mkdirat}; + use rustix::fs::CWD; use std::io::Cursor; use std::path::Path; diff --git a/crates/composefs/tests/mkfs.rs b/crates/composefs/tests/mkfs.rs index ae8d83a8..b2896c69 100644 --- a/crates/composefs/tests/mkfs.rs +++ b/crates/composefs/tests/mkfs.rs @@ -14,7 +14,7 @@ use composefs::{ dumpfile::write_dumpfile, erofs::{debug::debug_img, writer::mkfs_erofs}, fsverity::{FsVerityHashValue, Sha256HashValue}, - tree::{FileSystem, Inode, Leaf, 
LeafContent, RegularFile, Stat}, + tree::{FileSystem, Inode, LeafContent, RegularFile, Stat}, }; fn default_stat() -> Stat { diff --git a/crates/cstorage/Cargo.toml b/crates/cstorage/Cargo.toml new file mode 100644 index 00000000..e4e524d8 --- /dev/null +++ b/crates/cstorage/Cargo.toml @@ -0,0 +1,41 @@ +[package] +name = "cstorage" +description = "Read-only access to containers-storage (overlay driver)" +keywords = ["containers", "storage", "overlay", "podman", "buildah"] + +edition.workspace = true +license.workspace = true +readme.workspace = true +repository.workspace = true +rust-version.workspace = true +version.workspace = true + +[dependencies] +anyhow = { version = "1.0", default-features = false, features = ["std"] } +base64 = { version = "0.22", default-features = false, features = ["std"] } +cap-std = { version = "4.0", default-features = false } +cap-std-ext = { version = "4.0", default-features = false } +crc = { version = "3.0", default-features = false } +flate2 = { version = "1.0", default-features = false, features = ["rust_backend"] } +jsonrpc-fdpass = { workspace = true, optional = true } +oci-spec = { version = "0.8", default-features = false, features = ["image"] } +rustix = { version = "1.0", default-features = false, features = ["fs", "std", "process", "thread"] } +serde = { version = "1.0", default-features = false, features = ["derive"] } +serde_json = { version = "1.0", default-features = false, features = ["std"] } +sha2 = { version = "0.10", default-features = false, features = ["std"] } +tar-core = "0.1.0" +thiserror = { version = "2.0", default-features = false } +tokio = { version = "1.40", default-features = false, features = ["rt", "net", "sync"], optional = true } +toml = { version = "0.8", default-features = false, features = ["parse"] } +tracing = { version = "0.1", default-features = false, optional = true } +zstd = { version = "0.13", default-features = false } + +[features] +default = [] +userns-helper = ["dep:jsonrpc-fdpass", 
"dep:tokio", "dep:tracing"] + +[dev-dependencies] +tempfile = { version = "3.8", default-features = false } + +[lints] +workspace = true diff --git a/crates/cstorage/src/config.rs b/crates/cstorage/src/config.rs new file mode 100644 index 00000000..8d8d14a2 --- /dev/null +++ b/crates/cstorage/src/config.rs @@ -0,0 +1,119 @@ +//! Configuration parsing for container storage. +//! +//! This module provides structures for parsing storage.conf files used by +//! containers-storage. Configuration files define storage locations, drivers, +//! and additional read-only image stores. +//! +//! # Overview +//! +//! Container storage configuration is typically found in: +//! - System-wide: `/etc/containers/storage.conf` +//! - User-specific: `~/.config/containers/storage.conf` +//! +//! The configuration uses TOML format and specifies the storage driver +//! (overlay, btrfs, etc.), root paths, and additional layer/image stores. +//! +//! # Configuration Structure +//! +//! A typical storage.conf file looks like: +//! ```toml +//! [storage] +//! driver = "overlay" +//! root = "/var/lib/containers/storage" +//! run_root = "/run/containers/storage" +//! +//! # Additional read-only image stores +//! image_stores = [ +//! "/usr/share/containers/storage" +//! ] +//! +//! # Additional layer stores configuration +//! [[storage.layer_stores]] +//! path = "/mnt/layers" +//! with_reference = true +//! ``` + +use serde::Deserialize; +use std::path::PathBuf; + +/// Storage configuration, typically parsed from storage.conf files. +/// +/// Configuration files are searched in: +/// - `/etc/containers/storage.conf` +/// - `$HOME/.config/containers/storage.conf` +#[derive(Debug, Clone, Deserialize)] +pub struct StorageConfig { + /// Storage driver name (should be "overlay" for this library). + #[serde(default)] + pub driver: String, + + /// Primary storage root path. + #[serde(default)] + pub root: PathBuf, + + /// Runtime root for transient data. 
+    #[serde(default)]
+    pub run_root: PathBuf,
+
+    /// Additional read-only image stores.
+    #[serde(default)]
+    pub image_stores: Vec<PathBuf>,
+
+    /// Additional layer stores configuration.
+    #[serde(default)]
+    pub layer_stores: Vec<AdditionalLayerStore>,
+}
+
+/// Configuration for an additional layer store.
+#[derive(Debug, Clone, Deserialize)]
+pub struct AdditionalLayerStore {
+    /// Path to the additional layer store.
+    pub path: PathBuf,
+
+    /// Whether to use base64-encoded references in paths.
+    #[serde(default)]
+    pub with_reference: bool,
+}
+
+impl StorageConfig {
+    /// Parse storage configuration from TOML content.
+    ///
+    /// Keys are read from the top level of the document (`driver = ...`,
+    /// `root = ...`, `[[layer_stores]]`). Every field is `#[serde(default)]`
+    /// and unknown keys are not rejected, so a document that nests these keys
+    /// under a `[storage]` table parses successfully but yields default
+    /// (empty) values; callers must pass the contents of that table.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the TOML content is invalid.
+    pub fn from_toml(content: &str) -> Result<Self, toml::de::Error> {
+        toml::from_str(content)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_basic_config() {
+        let config_str = r#"
+driver = "overlay"
+root = "/var/lib/containers/storage"
+"#;
+        let config = StorageConfig::from_toml(config_str).unwrap();
+        assert_eq!(config.driver, "overlay");
+        assert_eq!(config.root, PathBuf::from("/var/lib/containers/storage"));
+    }
+
+    #[test]
+    fn test_parse_with_layer_stores() {
+        let config_str = r#"
+driver = "overlay"
+root = "/var/lib/containers/storage"
+
+[[layer_stores]]
+path = "/mnt/layers"
+with_reference = true
+"#;
+        let config = StorageConfig::from_toml(config_str).unwrap();
+        assert_eq!(config.layer_stores.len(), 1);
+        assert_eq!(config.layer_stores[0].path, PathBuf::from("/mnt/layers"));
+        assert!(config.layer_stores[0].with_reference);
+    }
+}
diff --git a/crates/cstorage/src/error.rs b/crates/cstorage/src/error.rs
new file mode 100644
index 00000000..959042f3
--- /dev/null
+++ b/crates/cstorage/src/error.rs
@@ -0,0 +1,64 @@
+//! Error types for the cstorage library.
+//!
+//! This module defines the error types used throughout the library. All operations
+//! that can fail return a [`Result`] which is an alias for `Result<T, StorageError>`.
+//!
+//! # Error Categories
+//!
+//! Errors are organized into several categories:
+//!
+//! - **Storage errors**: [`RootNotFound`], [`InvalidStorage`]
+//! - **Entity errors**: [`LayerNotFound`], [`ImageNotFound`]
+//! - **Link resolution**: [`LinkReadError`]
+//! - **Tar-split processing**: [`TarSplitError`]
+//! - **System errors**: [`Io`], [`JsonParse`]
+//!
+//! [`RootNotFound`]: StorageError::RootNotFound
+//! [`InvalidStorage`]: StorageError::InvalidStorage
+//! [`LayerNotFound`]: StorageError::LayerNotFound
+//! [`ImageNotFound`]: StorageError::ImageNotFound
+//! [`LinkReadError`]: StorageError::LinkReadError
+//! [`TarSplitError`]: StorageError::TarSplitError
+//! [`Io`]: StorageError::Io
+//! [`JsonParse`]: StorageError::JsonParse
+
+use std::path::PathBuf;
+
+/// Result type alias for operations that may return a StorageError.
+pub type Result<T> = std::result::Result<T, StorageError>;
+
+/// Error types for storage operations.
+#[derive(Debug, thiserror::Error)]
+pub enum StorageError {
+    /// Storage root directory was not found at the specified path.
+    #[error("storage root not found at {0}")]
+    RootNotFound(PathBuf),
+
+    /// Storage validation failed with the provided reason.
+    #[error("invalid storage: {0}")]
+    InvalidStorage(String),
+
+    /// The requested layer was not found.
+    #[error("layer not found: {0}")]
+    LayerNotFound(String),
+
+    /// The requested image was not found.
+    #[error("image not found: {0}")]
+    ImageNotFound(String),
+
+    /// Failed to read a link file.
+    #[error("failed to read link file: {0}")]
+    LinkReadError(String),
+
+    /// Error related to tar-split processing.
+    #[error("tar-split error: {0}")]
+    TarSplitError(String),
+
+    /// I/O error occurred during file operations.
+    #[error("I/O error: {0}")]
+    Io(#[from] std::io::Error),
+
+    /// JSON parsing error occurred.
+ #[error("JSON parse error: {0}")] + JsonParse(#[from] serde_json::Error), +} diff --git a/crates/cstorage/src/image.rs b/crates/cstorage/src/image.rs new file mode 100644 index 00000000..51ea6bdc --- /dev/null +++ b/crates/cstorage/src/image.rs @@ -0,0 +1,288 @@ +//! Image reading and manifest parsing. +//! +//! This module provides access to OCI image manifests and metadata stored in +//! the `overlay-images/` directory. All operations use fd-relative access via +//! cap-std Dir handles. +//! +//! # Overview +//! +//! The [`Image`] struct represents a container image stored in the overlay driver. +//! It provides access to: +//! - OCI image manifests ([`oci_spec::image::ImageManifest`]) +//! - OCI image configurations ([`oci_spec::image::ImageConfiguration`]) +//! - Layer information (diff_ids that map to storage layer IDs) +//! - Additional metadata stored in base64-encoded files +//! +//! # Image Directory Structure +//! +//! Each image is stored in `overlay-images//`: +//! ```text +//! overlay-images// +//! +-- manifest # OCI image manifest (JSON) +//! +-- = # Additional metadata files +//! ``` + +use base64::{Engine, engine::general_purpose::STANDARD}; +use cap_std::fs::Dir; +use oci_spec::image::{ImageConfiguration, ImageManifest}; +use std::io::Read; + +use crate::error::{Result, StorageError}; +use crate::storage::Storage; + +/// Filename for OCI image manifest in the image directory. +const MANIFEST_FILENAME: &str = "manifest"; + +/// Represents an OCI image with its metadata and manifest. +#[derive(Debug)] +pub struct Image { + /// Image ID (typically a 64-character hex digest). + id: String, + + /// Directory handle for overlay-images/\/. + image_dir: Dir, +} + +impl Image { + /// Open an image by ID using fd-relative operations. + /// + /// The ID can be provided with or without a `sha256:` prefix - the prefix + /// will be stripped if present, since containers-storage directories use + /// just the hex digest. 
+ /// + /// # Errors + /// + /// Returns an error if the image directory doesn't exist or cannot be opened. + pub fn open(storage: &Storage, id: &str) -> Result { + // Strip the sha256: prefix if present - containers-storage directories + // use just the hex digest, but image IDs from podman (e.g. via --iidfile) + // include the prefix. See https://github.com/containers/skopeo/issues/2750 + let id = id.strip_prefix("sha256:").unwrap_or(id); + + // Open overlay-images directory from storage root + let images_dir = storage.root_dir().open_dir("overlay-images")?; + + // Open specific image directory + let image_dir = images_dir + .open_dir(id) + .map_err(|_| StorageError::ImageNotFound(id.to_string()))?; + + Ok(Self { + id: id.to_string(), + image_dir, + }) + } + + /// Get the image ID. + pub fn id(&self) -> &str { + &self.id + } + + /// Read the raw manifest JSON bytes. + /// + /// Returns the original manifest bytes as stored on disk, preserving + /// whitespace and field ordering for content-addressed hashing. + /// + /// # Errors + /// + /// Returns an error if the manifest file cannot be read. + pub fn read_manifest_raw(&self) -> Result> { + let mut file = self.image_dir.open(MANIFEST_FILENAME)?; + let mut data = Vec::new(); + file.read_to_end(&mut data)?; + Ok(data) + } + + /// Read and parse the image manifest. + /// + /// The manifest is stored as a JSON file named "manifest" in the image directory. + /// + /// # Errors + /// + /// Returns an error if the manifest file cannot be read or parsed. + pub fn manifest(&self) -> Result { + let file = self.image_dir.open(MANIFEST_FILENAME)?; + serde_json::from_reader(file) + .map_err(|e| StorageError::InvalidStorage(format!("Invalid manifest JSON: {}", e))) + } + + /// Read and parse the image configuration. + /// + /// The image config is stored with a base64-encoded key based on the image digest. + /// + /// # Errors + /// + /// Returns an error if the config file cannot be read or parsed. 
+ pub fn config(&self) -> Result { + // The config is stored with key: sha256: + // Base64 encode: "sha256:" + let key = format!("sha256:{}", self.id); + let encoded_key = STANDARD.encode(key.as_bytes()); + + let config_data = self.read_metadata(&encoded_key).map_err(|e| { + StorageError::Io(std::io::Error::other(format!( + "reading config metadata ={} for image {}: {}", + encoded_key, self.id, e + ))) + })?; + serde_json::from_slice(&config_data) + .map_err(|e| StorageError::InvalidStorage(format!("Invalid config JSON: {}", e))) + } + + /// Get the OCI diff_ids for this image in order (base to top). + /// + /// This returns the diff_ids from the image config, which are the uncompressed + /// tar digests. Note that these are **not** the same as the storage layer IDs! + /// To get the actual storage layer IDs, use [`storage_layer_ids()`](Self::storage_layer_ids). + /// + /// # Errors + /// + /// Returns an error if the config cannot be read or parsed. + pub fn layers(&self) -> Result> { + let config = self.config()?; + + // Extract diff_ids from config - these are NOT the storage layer IDs + let diff_ids: Vec = config + .rootfs() + .diff_ids() + .iter() + .map(|digest| { + // Remove the "sha256:" prefix if present + let diff_id = digest.to_string(); + diff_id + .strip_prefix("sha256:") + .unwrap_or(&diff_id) + .to_string() + }) + .collect(); + + Ok(diff_ids) + } + + /// Get the storage layer IDs for this image in order (base to top). + /// + /// Unlike [`layers()`](Self::layers) which returns OCI diff_ids, this method + /// returns the actual storage layer directory names by resolving diff_ids + /// through the `layers.json` mapping file. + /// + /// # Errors + /// + /// Returns an error if the config cannot be read, parsed, or if any layer + /// cannot be resolved. + pub fn storage_layer_ids(&self, stores: &[Storage]) -> Result> { + let diff_ids = self.layers()?; + + // Try to resolve all diff_ids from each store (batch parse of layers.json). 
+ // Layers may span stores (e.g. base layers in an additional image store, + // new layers in the primary), so we merge results across stores. + let mut resolved: Vec> = vec![None; diff_ids.len()]; + for store in stores { + if resolved.iter().all(|r| r.is_some()) { + break; + } + // resolve_diff_ids parses layers.json once for all diff_ids + if let Ok(found) = store.resolve_diff_ids(&diff_ids) { + for (i, id) in found.into_iter().enumerate() { + if resolved[i].is_none() { + resolved[i] = id; + } + } + } + } + + resolved + .into_iter() + .enumerate() + .map(|(i, opt)| opt.ok_or_else(|| StorageError::LayerNotFound(diff_ids[i].clone()))) + .collect() + } + + /// Read additional metadata files. + /// + /// Metadata files are stored with base64-encoded keys as filenames, + /// prefixed with '='. + /// + /// # Errors + /// + /// Returns an error if the metadata file doesn't exist or cannot be read. + pub fn read_metadata(&self, key: &str) -> Result> { + let filename = format!("={}", key); + let mut file = self.image_dir.open(&filename)?; + let mut data = Vec::new(); + file.read_to_end(&mut data)?; + Ok(data) + } + + /// Get a reference to the image directory handle. + pub fn image_dir(&self) -> &Dir { + &self.image_dir + } + + /// Get the repository names/tags for this image. + /// + /// Reads from the `overlay-images/images.json` index file to find the + /// names associated with this image. + /// + /// # Errors + /// + /// Returns an error if the images.json file cannot be read or parsed. 
+ pub fn names(&self, storage: &Storage) -> Result> { + let images_dir = storage.root_dir().open_dir("overlay-images")?; + let mut file = images_dir.open("images.json")?; + let mut contents = String::new(); + file.read_to_string(&mut contents)?; + + let entries: Vec = serde_json::from_str(&contents) + .map_err(|e| StorageError::InvalidStorage(format!("Invalid images.json: {}", e)))?; + + for entry in entries { + if entry.id == self.id { + return Ok(entry.names.unwrap_or_default()); + } + } + + // Image not found in images.json - return empty names + Ok(Vec::new()) + } +} + +/// Entry in images.json for image name lookups. +#[derive(Debug, serde::Deserialize)] +pub(crate) struct ImageJsonEntry { + pub(crate) id: String, + pub(crate) names: Option>, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_manifest_parsing() { + let manifest_json = r#"{ + "schemaVersion": 2, + "mediaType": "application/vnd.oci.image.manifest.v1+json", + "config": { + "mediaType": "application/vnd.oci.image.config.v1+json", + "digest": "sha256:0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef", + "size": 1234 + }, + "layers": [ + { + "mediaType": "application/vnd.oci.image.layer.v1.tar+gzip", + "digest": "sha256:1111111111111111111111111111111111111111111111111111111111111111", + "size": 5678 + }, + { + "mediaType": "application/vnd.oci.image.layer.v1.tar+gzip", + "digest": "sha256:2222222222222222222222222222222222222222222222222222222222222222", + "size": 9012 + } + ] + }"#; + + let manifest: ImageManifest = serde_json::from_str(manifest_json).unwrap(); + assert_eq!(manifest.schema_version(), 2); + assert_eq!(manifest.layers().len(), 2); + } +} diff --git a/crates/cstorage/src/layer.rs b/crates/cstorage/src/layer.rs new file mode 100644 index 00000000..9a05a6b9 --- /dev/null +++ b/crates/cstorage/src/layer.rs @@ -0,0 +1,489 @@ +//! Layer reading and metadata handling. +//! +//! This module provides access to individual overlay layers and their metadata. 
+//! Layers are the fundamental storage units in the overlay driver, representing +//! filesystem changes that are stacked to form complete container images. +//! +//! # Overview +//! +//! The [`Layer`] struct represents a single layer in the overlay filesystem. +//! Each layer contains: +//! - A `diff/` directory with the actual file contents +//! - A `link` file containing a short 26-character identifier +//! - A `lower` file listing parent layers (if not a base layer) +//! - Metadata for whiteouts and opaque directories +//! +//! # Layer Structure +//! +//! Each layer is stored in `overlay//`: +//! ```text +//! overlay// +//! +-- diff/ # Layer file contents +//! | +-- etc/ +//! | | +-- hosts +//! | +-- usr/ +//! | +-- bin/ +//! +-- link # Short link ID (26 chars) +//! +-- lower # Parent references: "l/:l/:..." +//! ``` +//! +//! # Whiteouts and Opaque Directories +//! +//! The overlay driver uses special markers to indicate file deletions: +//! - `.wh.` - Whiteout file (marks `` as deleted) +//! - `.wh..wh..opq` - Opaque directory marker (hides lower layer contents) + +use crate::error::{Result, StorageError}; +use crate::storage::Storage; +use cap_std::fs::Dir; + +/// Represents an overlay layer with its metadata and content. +#[derive(Debug)] +pub struct Layer { + /// Layer ID (typically a 64-character hex digest). + id: String, + + /// Directory handle for the layer directory (overlay/\/). + layer_dir: Dir, + + /// Directory handle for the diff/ subdirectory containing layer content. + diff_dir: Dir, + + /// Short link identifier from the link file (26 characters). + link_id: String, + + /// Parent layer link IDs from the lower file. + parent_links: Vec, +} + +impl Layer { + /// Open a layer by ID using fd-relative operations. + /// + /// # Errors + /// + /// Returns an error if the layer directory doesn't exist or cannot be opened. 
+ pub fn open(storage: &Storage, id: &str) -> Result { + // Open overlay directory from storage root + let overlay_dir = storage.root_dir().open_dir("overlay")?; + + // Open layer directory relative to overlay + let layer_dir = overlay_dir + .open_dir(id) + .map_err(|_| StorageError::LayerNotFound(id.to_string()))?; + + // Open diff directory for content access + let diff_dir = layer_dir.open_dir("diff")?; + + // Read metadata files using fd-relative operations + let link_id = Self::read_link(&layer_dir)?; + let parent_links = Self::read_lower(&layer_dir)?; + + Ok(Self { + id: id.to_string(), + layer_dir, + diff_dir, + link_id, + parent_links, + }) + } + + /// Get the layer ID. + pub fn id(&self) -> &str { + &self.id + } + + /// Read the link file (26-char identifier) via Dir handle. + fn read_link(layer_dir: &Dir) -> Result { + let content = layer_dir.read_to_string("link")?; + Ok(content.trim().to_string()) + } + + /// Read the lower file (colon-separated parent links) via Dir handle. + fn read_lower(layer_dir: &Dir) -> Result> { + match layer_dir.read_to_string("lower") { + Ok(content) => { + // Format is "l/:l/:..." + let links: Vec = content + .trim() + .split(':') + .filter_map(|s| s.strip_prefix("l/")) + .map(|s| s.to_string()) + .collect(); + Ok(links) + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Vec::new()), // Base layer has no lower file + Err(e) => Err(StorageError::Io(e)), + } + } + + /// Get the short link ID for this layer. + pub fn link_id(&self) -> &str { + &self.link_id + } + + /// Get the parent link IDs for this layer. + pub fn parent_links(&self) -> &[String] { + &self.parent_links + } + + /// Get parent layer IDs (resolved from link IDs). + /// + /// This resolves the short link IDs from the `lower` file to full layer IDs + /// by reading the symlinks in the `overlay/l/` directory. + /// + /// # Errors + /// + /// Returns an error if any link cannot be resolved. 
+    pub fn parents(&self, storage: &Storage) -> Result<Vec<String>> {
+        self.parent_links
+            .iter()
+            .map(|link_id| storage.resolve_link(link_id))
+            .collect()
+    }
+
+    /// Get a reference to the layer directory handle.
+    pub fn layer_dir(&self) -> &Dir {
+        &self.layer_dir
+    }
+
+    /// Get a reference to the diff directory handle.
+    pub fn diff_dir(&self) -> &Dir {
+        &self.diff_dir
+    }
+
+    /// Get the complete chain of layers from this layer to the base.
+    ///
+    /// Returns layers in order: [self, parent, grandparent, ..., base]
+    ///
+    /// Each layer appears at most once. A layer's `lower` file can name more
+    /// than one ancestor, so the same layer may be reachable through several
+    /// links; already-visited layer IDs are skipped, which also guards
+    /// against reference cycles.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if a parent link cannot be resolved, if a parent layer
+    /// cannot be opened, or if the layer chain exceeds the maximum depth of
+    /// 500 layers.
+    pub fn layer_chain(self, storage: &Storage) -> Result<Vec<Layer>> {
+        // Upper bound on distinct layers, as a backstop against corrupt storage
+        const MAX_DEPTH: usize = 500;
+
+        let mut seen = std::collections::HashSet::new();
+        seen.insert(self.id.clone());
+        let mut chain = vec![self];
+        let mut current_idx = 0;
+
+        while current_idx < chain.len() {
+            let parent_ids = chain[current_idx].parents(storage)?;
+
+            // Add every not-yet-visited parent to the chain
+            for parent_id in parent_ids {
+                if seen.contains(&parent_id) {
+                    continue;
+                }
+                // Check before pushing so exactly MAX_DEPTH layers is still valid
+                if chain.len() >= MAX_DEPTH {
+                    return Err(StorageError::InvalidStorage(
+                        "Layer chain exceeds maximum depth of 500".to_string(),
+                    ));
+                }
+                chain.push(Layer::open(storage, &parent_id)?);
+                seen.insert(parent_id);
+            }
+
+            current_idx += 1;
+        }
+
+        Ok(chain)
+    }
+
+    /// Open a file in the layer's diff directory using fd-relative operations.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the file doesn't exist or cannot be opened.
+    pub fn open_file(&self, path: impl AsRef<std::path::Path>) -> Result<cap_std::fs::File> {
+        self.diff_dir.open(path).map_err(StorageError::Io)
+    }
+
+    /// Open a file and return a standard library File.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the file doesn't exist or cannot be opened.
+    pub fn open_file_std(&self, path: impl AsRef<std::path::Path>) -> Result<std::fs::File> {
+        let file = self.diff_dir.open(path).map_err(StorageError::Io)?;
+        Ok(file.into_std())
+    }
+
+    /// Get metadata for a file in the layer's diff directory.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the file doesn't exist.
+ pub fn metadata(&self, path: impl AsRef) -> Result { + self.diff_dir.metadata(path).map_err(StorageError::Io) + } + + /// Read directory entries using Dir handle. + /// + /// # Errors + /// + /// Returns an error if the directory doesn't exist. + pub fn read_dir(&self, path: impl AsRef) -> Result { + self.diff_dir.read_dir(path).map_err(StorageError::Io) + } + + /// Check if a whiteout file exists for the given filename. + /// + /// Whiteout format: `.wh.` + /// + /// # Arguments + /// + /// * `parent_path` - The directory path containing the file (empty string or "." for root) + /// * `filename` - The name of the file to check for whiteout + /// + /// # Errors + /// + /// Returns an error if the directory cannot be accessed. + pub fn has_whiteout(&self, parent_path: &str, filename: &str) -> Result { + let whiteout_name = format!(".wh.{}", filename); + + // Handle root directory case + if parent_path.is_empty() || parent_path == "." { + Ok(self.diff_dir.try_exists(&whiteout_name)?) + } else { + match self.diff_dir.open_dir(parent_path) { + Ok(parent_dir) => Ok(parent_dir.try_exists(&whiteout_name)?), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), + Err(e) => Err(StorageError::Io(e)), + } + } + } + + /// Check if a directory is marked as opaque (hides lower layers). + /// + /// Opaque marker: `.wh..wh..opq` + /// + /// # Errors + /// + /// Returns an error if the directory cannot be accessed. + pub fn is_opaque_dir(&self, path: &str) -> Result { + const OPAQUE_MARKER: &str = ".wh..wh..opq"; + + if path.is_empty() || path == "." { + Ok(self.diff_dir.try_exists(OPAQUE_MARKER)?) 
+ } else { + match self.diff_dir.open_dir(path) { + Ok(dir) => Ok(dir.try_exists(OPAQUE_MARKER)?), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), + Err(e) => Err(StorageError::Io(e)), + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::path::Path; + + #[test] + fn test_parse_lower_format() { + // Test that we correctly parse the lower file format + let content = "l/ABCDEFGHIJKLMNOPQRSTUVWXY:l/BCDEFGHIJKLMNOPQRSTUVWXYZ"; + let links: Vec = content + .trim() + .split(':') + .filter_map(|s| s.strip_prefix("l/")) + .map(|s| s.to_string()) + .collect(); + + assert_eq!(links.len(), 2); + assert_eq!(links[0], "ABCDEFGHIJKLMNOPQRSTUVWXY"); + assert_eq!(links[1], "BCDEFGHIJKLMNOPQRSTUVWXYZ"); + } + + /// Create a minimal mock storage + layer on disk so that `Layer::open()` succeeds. + /// Returns the opened `Layer`. + fn create_mock_layer(root: &Path) -> Layer { + // Storage validation requires these three directories + for d in ["overlay", "overlay-layers", "overlay-images"] { + std::fs::create_dir_all(root.join(d)).unwrap(); + } + + let layer_id = "test-layer-001"; + let layer_dir = root.join("overlay").join(layer_id); + std::fs::create_dir_all(layer_dir.join("diff")).unwrap(); + std::fs::write(layer_dir.join("link"), "ABCDEFGHIJKLMNOPQRSTUVWXYZ").unwrap(); + + let storage = Storage::open(root).unwrap(); + Layer::open(&storage, layer_id).unwrap() + } + + // --- has_whiteout tests --- + + #[test] + fn test_has_whiteout_in_root() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + + // No whiteout yet + assert!(!layer.has_whiteout("", "somefile").unwrap()); + assert!(!layer.has_whiteout(".", "somefile").unwrap()); + + // Create whiteout marker (regular file — has_whiteout uses try_exists) + std::fs::write( + dir.path().join("overlay/test-layer-001/diff/.wh.somefile"), + "", + ) + .unwrap(); + + assert!(layer.has_whiteout("", "somefile").unwrap()); + assert!(layer.has_whiteout(".", 
"somefile").unwrap()); + // Different name still returns false + assert!(!layer.has_whiteout("", "otherfile").unwrap()); + } + + #[test] + fn test_has_whiteout_in_subdirectory() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + let diff = dir.path().join("overlay/test-layer-001/diff"); + + std::fs::create_dir_all(diff.join("etc")).unwrap(); + std::fs::write(diff.join("etc/.wh.hosts"), "").unwrap(); + + assert!(layer.has_whiteout("etc", "hosts").unwrap()); + // Root doesn't have this whiteout + assert!(!layer.has_whiteout("", "hosts").unwrap()); + } + + #[test] + fn test_has_whiteout_in_nested_subdirectory() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + let diff = dir.path().join("overlay/test-layer-001/diff"); + + std::fs::create_dir_all(diff.join("usr/local/bin")).unwrap(); + std::fs::write(diff.join("usr/local/bin/.wh.myapp"), "").unwrap(); + + assert!(layer.has_whiteout("usr/local/bin", "myapp").unwrap()); + assert!(!layer.has_whiteout("usr/local", "myapp").unwrap()); + assert!(!layer.has_whiteout("usr", "myapp").unwrap()); + } + + #[test] + fn test_has_whiteout_nonexistent_parent() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + + // Parent directory doesn't exist — should return false, not error + assert!(!layer.has_whiteout("no/such/dir", "file").unwrap()); + } + + #[test] + fn test_has_whiteout_multiple() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + let diff = dir.path().join("overlay/test-layer-001/diff"); + + std::fs::write(diff.join(".wh.file_a"), "").unwrap(); + std::fs::write(diff.join(".wh.file_b"), "").unwrap(); + + assert!(layer.has_whiteout("", "file_a").unwrap()); + assert!(layer.has_whiteout("", "file_b").unwrap()); + assert!(!layer.has_whiteout("", "file_c").unwrap()); + } + + // --- is_opaque_dir tests --- + + #[test] + fn test_is_opaque_dir_in_root() { + let dir = 
tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + let diff = dir.path().join("overlay/test-layer-001/diff"); + + assert!(!layer.is_opaque_dir("").unwrap()); + assert!(!layer.is_opaque_dir(".").unwrap()); + + std::fs::write(diff.join(".wh..wh..opq"), "").unwrap(); + + assert!(layer.is_opaque_dir("").unwrap()); + assert!(layer.is_opaque_dir(".").unwrap()); + } + + #[test] + fn test_is_opaque_dir_in_subdirectory() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + let diff = dir.path().join("overlay/test-layer-001/diff"); + + std::fs::create_dir_all(diff.join("etc")).unwrap(); + std::fs::write(diff.join("etc/.wh..wh..opq"), "").unwrap(); + + assert!(layer.is_opaque_dir("etc").unwrap()); + // Root is not opaque + assert!(!layer.is_opaque_dir("").unwrap()); + } + + #[test] + fn test_is_opaque_dir_false_for_normal_dir() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + let diff = dir.path().join("overlay/test-layer-001/diff"); + + // Create a subdirectory with a regular file, but no opaque marker + std::fs::create_dir_all(diff.join("var/log")).unwrap(); + std::fs::write(diff.join("var/log/syslog"), "log data").unwrap(); + + assert!(!layer.is_opaque_dir("var").unwrap()); + assert!(!layer.is_opaque_dir("var/log").unwrap()); + } + + #[test] + fn test_is_opaque_dir_nonexistent_path() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + + // Non-existent directory should return false, not error + assert!(!layer.is_opaque_dir("no/such/path").unwrap()); + } + + #[test] + fn test_is_opaque_dir_nested() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + let diff = dir.path().join("overlay/test-layer-001/diff"); + + // Only the nested dir is opaque, not its parents + std::fs::create_dir_all(diff.join("a/b/c")).unwrap(); + std::fs::write(diff.join("a/b/c/.wh..wh..opq"), "").unwrap(); + + 
assert!(!layer.is_opaque_dir("a").unwrap()); + assert!(!layer.is_opaque_dir("a/b").unwrap()); + assert!(layer.is_opaque_dir("a/b/c").unwrap()); + } + + // --- Interaction between whiteout and opaque --- + + #[test] + fn test_whiteout_and_opaque_coexist() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + let diff = dir.path().join("overlay/test-layer-001/diff"); + + std::fs::create_dir_all(diff.join("mydir")).unwrap(); + // Opaque marker in mydir + std::fs::write(diff.join("mydir/.wh..wh..opq"), "").unwrap(); + // Also a file whiteout in mydir + std::fs::write(diff.join("mydir/.wh.oldfile"), "").unwrap(); + + assert!(layer.is_opaque_dir("mydir").unwrap()); + assert!(layer.has_whiteout("mydir", "oldfile").unwrap()); + } + + #[test] + fn test_whiteout_of_dotdot_prefix_name() { + // .wh..wh. is NOT an opaque whiteout — it's a whiteout for a file + // literally named ".wh." (the has_whiteout logic just prepends ".wh.") + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + let diff = dir.path().join("overlay/test-layer-001/diff"); + + // Create .wh..wh. (whiteout for file named ".wh.") + std::fs::write(diff.join(".wh..wh."), "").unwrap(); + + assert!(layer.has_whiteout("", ".wh.").unwrap()); + // This is NOT an opaque marker — the opaque marker is ".wh..wh..opq" + assert!(!layer.is_opaque_dir("").unwrap()); + } +} diff --git a/crates/cstorage/src/lib.rs b/crates/cstorage/src/lib.rs new file mode 100644 index 00000000..1863004c --- /dev/null +++ b/crates/cstorage/src/lib.rs @@ -0,0 +1,76 @@ +//! Read-only access to containers-storage overlay driver. +//! +//! This library provides efficient, capability-based access to container image +//! storage using the overlay driver. All file operations are performed using +//! file descriptor-relative operations via cap-std, providing security against +//! path traversal attacks and TOCTOU race conditions. +//! +//! # Overview +//! +//! 
The library is designed to access containers-storage (overlay driver) without +//! requiring tar serialization. Instead, it provides direct file descriptor access +//! to layer content, enabling zero-copy operations. +//! +//! # Key Features +//! +//! - **Capability-based security**: All file access via `cap_std::fs::Dir` handles +//! - **Zero-copy access**: File descriptors instead of data copies +//! - **Safe by design**: No path traversal vulnerabilities +//! - **Tar-split integration**: Bit-for-bit identical TAR reconstruction +//! - **OCI compatibility**: Uses oci-spec for standard image formats +//! +//! # Example +//! +//! ```no_run +//! use cstorage::Storage; +//! +//! // Discover storage from default locations +//! let storage = Storage::discover()?; +//! +//! // Or open storage at a specific path +//! let storage = Storage::open("/var/lib/containers/storage")?; +//! +//! // List images +//! for image in storage.list_images()? { +//! println!("Image: {}", image.id()); +//! } +//! # Ok::<(), cstorage::StorageError>(()) +//! ``` +//! +//! # Architecture +//! +//! The library uses cap-std for all file operations: +//! - `Storage` holds a `Dir` handle to the storage root +//! - All file access is relative to `Dir` handles +//! - No absolute paths are constructed during operations +//! 
- SQLite database accessed via fd-relative path + +// Core storage access +pub mod config; +pub mod error; +pub mod image; +pub mod layer; +pub mod storage; +pub mod tar_split; + +// User namespace support for rootless access +pub mod userns; +#[cfg(feature = "userns-helper")] +pub mod userns_helper; + +// Re-export commonly used types +pub use config::{AdditionalLayerStore, StorageConfig}; +pub use error::{Result, StorageError}; +pub use image::Image; +pub use layer::Layer; +pub use storage::{LayerMetadata, Storage}; +pub use tar_split::{TarHeader, TarSplitFdStream, TarSplitItem}; +pub use userns::can_bypass_file_permissions; +#[cfg(feature = "userns-helper")] +pub use userns_helper::{ + GetImageResult, HelperError, ImageInfo, ProxiedLayerStream, ProxiedTarSplitItem, StorageProxy, + init_if_helper, +}; + +// Re-export OCI spec types for convenience +pub use oci_spec::image::{Descriptor, ImageConfiguration, ImageManifest}; diff --git a/crates/cstorage/src/storage.rs b/crates/cstorage/src/storage.rs new file mode 100644 index 00000000..f00b5a75 --- /dev/null +++ b/crates/cstorage/src/storage.rs @@ -0,0 +1,623 @@ +//! Storage access for container overlay filesystem. +//! +//! This module provides the main [`Storage`] struct for accessing containers-storage +//! overlay driver data. All file access uses cap-std for fd-relative operations, +//! providing security against path traversal attacks and TOCTOU race conditions. +//! +//! # Overview +//! +//! The `Storage` struct is the primary entry point for interacting with container +//! storage. It holds a capability-based directory handle to the storage root. +//! +//! # Storage Structure +//! +//! Container storage on disk follows this layout: +//! ```text +//! /var/lib/containers/storage/ +//! +-- overlay/ # Layer data +//! | +-- / # Individual layer directories +//! | | +-- diff/ # Layer file contents +//! | | +-- link # Short link ID (26 chars) +//! | | +-- lower # Parent layer references +//! 
| +-- l/ # Short link directory (symlinks) +//! +-- overlay-layers/ # Tar-split metadata +//! | +-- .tar-split.gz +//! +-- overlay-images/ # Image metadata +//! +-- / +//! +-- manifest # OCI image manifest +//! +-- = # Base64-encoded metadata files +//! ``` +//! +//! # Security Model +//! +//! All file operations are performed via [`cap_std::fs::Dir`] handles, which provide: +//! - Protection against path traversal attacks +//! - Prevention of TOCTOU race conditions +//! - Guarantee that all access stays within the storage directory tree + +use crate::error::{Result, StorageError}; +use cap_std::ambient_authority; +use cap_std::fs::Dir; +use std::env; +use std::io::Read; +use std::path::{Path, PathBuf}; + +/// Main storage handle providing read-only access to container storage. +/// +/// The Storage struct holds a `Dir` handle to the storage root for fd-relative +/// file operations. +#[derive(Debug)] +pub struct Storage { + /// Directory handle for the storage root, used for all fd-relative operations. + root_dir: Dir, +} + +impl Storage { + /// Open storage at the given root path. + /// + /// This validates that the path points to a valid container storage directory + /// by checking for required subdirectories and the database file. + /// + /// # Errors + /// + /// Returns an error if: + /// - The path does not exist or is not a directory + /// - Required subdirectories are missing + /// - The database file is missing or invalid + pub fn open>(root: P) -> Result { + let root_path = root.as_ref(); + + // Open the directory handle for fd-relative operations + let root_dir = Dir::open_ambient_dir(root_path, ambient_authority()).map_err(|e| { + if e.kind() == std::io::ErrorKind::NotFound { + StorageError::RootNotFound(root_path.to_path_buf()) + } else { + StorageError::Io(e) + } + })?; + + // Validate storage structure + Self::validate_storage(&root_dir)?; + + Ok(Self { root_dir }) + } + + /// Discover storage root from default locations. 
+ /// + /// Searches for container storage in the following order: + /// 1. `$CONTAINERS_STORAGE_ROOT` environment variable + /// 2. Rootless storage: `$XDG_DATA_HOME/containers/storage` or `~/.local/share/containers/storage` + /// 3. Root storage: `/var/lib/containers/storage` + /// + /// # Errors + /// + /// Returns an error if no valid storage location is found. + pub fn discover() -> Result { + let search_paths = Self::default_search_paths(); + + for path in search_paths { + if path.exists() { + match Self::open(&path) { + Ok(storage) => return Ok(storage), + Err(_) => continue, + } + } + } + + Err(StorageError::InvalidStorage( + "No valid storage location found. Searched default locations.".to_string(), + )) + } + + /// Discover all storage locations: the primary store plus any additional + /// image stores from `$STORAGE_OPTS`. + /// + /// The `containers/storage` library supports + /// `STORAGE_OPTS=additionalimagestore=/path` to add read-only image stores + /// (used by e.g. `bcvk` to expose the host's containers-storage inside a VM). + /// + /// Returns a non-empty vec with the primary store first (if it exists), + /// followed by any additional stores. Returns an error only if no stores + /// are found at all. + pub fn discover_all() -> Result> { + let mut stores = Vec::new(); + if let Ok(primary) = Self::discover() { + stores.push(primary); + } + stores.extend(Self::additional_image_stores_from_env()); + if stores.is_empty() { + return Err(StorageError::InvalidStorage( + "No valid storage location found. Searched default locations and $STORAGE_OPTS." + .to_string(), + )); + } + Ok(stores) + } + + /// Parse `$STORAGE_OPTS` for `additionalimagestore=` entries and + /// open any that point to valid overlay storage. + /// + /// Invalid or inaccessible paths are silently skipped. 
+ fn additional_image_stores_from_env() -> Vec { + let opts = match env::var("STORAGE_OPTS") { + Ok(v) => v, + Err(_) => return Vec::new(), + }; + + Self::parse_additional_image_stores(&opts) + } + + /// Parse a `STORAGE_OPTS` value for `additionalimagestore=` entries + /// and open any that point to valid overlay storage. + /// + /// This is separated from [`additional_image_stores_from_env()`] so the + /// parsing logic can be tested without mutating process-global environment + /// variables. + fn parse_additional_image_stores(opts: &str) -> Vec { + let mut stores = Vec::new(); + // STORAGE_OPTS is comma-separated, e.g. + // "additionalimagestore=/run/host-container-storage,additionalimagestore=/other" + for item in opts.split(',') { + let item = item.trim(); + if let Some(path) = item.strip_prefix("additionalimagestore=") + && let Ok(s) = Self::open(path) + { + stores.push(s); + } + } + stores + } + + /// Get the default search paths for storage discovery. + fn default_search_paths() -> Vec { + let mut paths = Vec::new(); + + // 1. Check CONTAINERS_STORAGE_ROOT environment variable + if let Ok(root) = env::var("CONTAINERS_STORAGE_ROOT") { + paths.push(PathBuf::from(root)); + } + + // 2. Check rootless locations + if let Ok(home) = env::var("HOME") { + let home_path = PathBuf::from(home); + + // Try XDG_DATA_HOME first + if let Ok(xdg_data) = env::var("XDG_DATA_HOME") { + paths.push(PathBuf::from(xdg_data).join("containers/storage")); + } + + // Fallback to ~/.local/share/containers/storage + paths.push(home_path.join(".local/share/containers/storage")); + } + + // 3. Check root location + paths.push(PathBuf::from("/var/lib/containers/storage")); + + paths + } + + /// Validate that the directory structure is a valid overlay storage. 
+ fn validate_storage(root_dir: &Dir) -> Result<()> { + // Check for required subdirectories + let required_dirs = ["overlay", "overlay-layers", "overlay-images"]; + + for dir_name in &required_dirs { + match root_dir.try_exists(dir_name) { + Ok(exists) if !exists => { + return Err(StorageError::InvalidStorage(format!( + "Missing required directory: {}", + dir_name + ))); + } + Err(e) => return Err(StorageError::Io(e)), + _ => {} + } + } + + Ok(()) + } + + /// Create storage from an existing root directory handle. + /// + /// # Errors + /// + /// Returns an error if the directory is not a valid container storage. + pub fn from_root_dir(root_dir: Dir) -> Result { + Self::validate_storage(&root_dir)?; + Ok(Self { root_dir }) + } + + /// Get a reference to the root directory handle. + pub fn root_dir(&self) -> &Dir { + &self.root_dir + } + + /// Resolve a link ID to a layer ID using fd-relative symlink reading. + /// + /// # Errors + /// + /// Returns an error if the link doesn't exist or has an invalid format. + pub fn resolve_link(&self, link_id: &str) -> Result { + // Open overlay directory from storage root + let overlay_dir = self.root_dir.open_dir("overlay")?; + + // Open link directory + let link_dir = overlay_dir.open_dir("l")?; + + // Read symlink target using fd-relative operation + let target = link_dir.read_link(link_id).map_err(|e| { + StorageError::LinkReadError(format!("Failed to read link {}: {}", link_id, e)) + })?; + + // Extract layer ID from symlink target + Self::extract_layer_id_from_link(&target) + } + + /// Extract layer ID from symlink target path. 
+ /// + /// Target format: ..//diff + fn extract_layer_id_from_link(target: &Path) -> Result { + // Convert to string for processing + let target_str = target.to_str().ok_or_else(|| { + StorageError::LinkReadError("Invalid UTF-8 in link target".to_string()) + })?; + + // Split by '/' and find the layer ID component + let components: Vec<&str> = target_str.split('/').collect(); + + // Expected format: ..//diff + // So we need the second-to-last component + if components.len() >= 2 { + let layer_id = components[components.len() - 2]; + if !layer_id.is_empty() && layer_id != ".." { + return Ok(layer_id.to_string()); + } + } + + Err(StorageError::LinkReadError(format!( + "Invalid link target format: {}", + target_str + ))) + } + + /// List all images in storage. + /// + /// # Errors + /// + /// Returns an error if the images directory cannot be read. + pub fn list_images(&self) -> Result> { + use crate::image::Image; + + let images_dir = self.root_dir.open_dir("overlay-images")?; + let mut images = Vec::new(); + + for entry in images_dir.entries()? { + let entry = entry?; + if entry.file_type()?.is_dir() { + let id = entry + .file_name() + .to_str() + .ok_or_else(|| { + StorageError::InvalidStorage( + "Invalid UTF-8 in image directory name".to_string(), + ) + })? + .to_string(); + images.push(Image::open(self, &id)?); + } + } + Ok(images) + } + + /// Get an image by ID. + /// + /// # Errors + /// + /// Returns [`StorageError::ImageNotFound`] if the image doesn't exist. + pub fn get_image(&self, id: &str) -> Result { + crate::image::Image::open(self, id) + } + + /// Get layers for an image (in order from base to top). + /// + /// # Errors + /// + /// Returns an error if any layer cannot be opened. + pub fn get_image_layers( + &self, + image: &crate::image::Image, + ) -> Result> { + use crate::layer::Layer; + // image.layers() returns diff_ids, which need to be mapped to storage layer IDs. + // Use the batch method to parse layers.json only once. 
+ let diff_ids = image.layers()?; + let layer_ids: Vec = self + .resolve_diff_ids(&diff_ids)? + .into_iter() + .enumerate() + .map(|(i, opt)| opt.ok_or_else(|| StorageError::LayerNotFound(diff_ids[i].clone()))) + .collect::>()?; + layer_ids + .iter() + .map(|layer_id| Layer::open(self, layer_id)) + .collect() + } + + /// Find an image by name. + /// + /// # Errors + /// + /// Returns [`StorageError::ImageNotFound`] if no image with the given name is found. + pub fn find_image_by_name(&self, name: &str) -> Result { + // Read images.json from overlay-images/ + let images_dir = self.root_dir.open_dir("overlay-images")?; + let mut file = images_dir.open("images.json")?; + let mut contents = String::new(); + file.read_to_string(&mut contents)?; + + // Parse the JSON array + let entries: Vec = serde_json::from_str(&contents) + .map_err(|e| StorageError::InvalidStorage(format!("Invalid images.json: {}", e)))?; + + // Search for matching name + for entry in &entries { + if let Some(names) = &entry.names { + for image_name in names { + if image_name == name { + return self.get_image(&entry.id); + } + } + } + } + + // Try partial matching (e.g., "alpine:latest" matches "docker.io/library/alpine:latest") + for entry in &entries { + if let Some(names) = &entry.names { + for image_name in names { + // Check if name is a suffix (after removing registry/namespace prefix) + if let Some(prefix) = image_name.strip_suffix(name) { + // Verify it's a proper boundary (preceded by '/') + if prefix.is_empty() || prefix.ends_with('/') { + return self.get_image(&entry.id); + } + } + } + } + } + + // Try matching short name without tag (e.g., "busybox" matches "docker.io/library/busybox:latest") + // This handles the common case of just specifying the image name + let name_with_tag = if name.contains(':') { + name.to_string() + } else { + format!("{}:latest", name) + }; + + for entry in &entries { + if let Some(names) = &entry.names { + for image_name in names { + // Check if image_name ends 
with /name:tag pattern + if let Some(prefix) = image_name.strip_suffix(&name_with_tag) + && (prefix.is_empty() || prefix.ends_with('/')) + { + return self.get_image(&entry.id); + } + } + } + } + + Err(StorageError::ImageNotFound(name.to_string())) + } + + /// Parse layers.json and return all entries. + /// + /// This is used internally to avoid re-parsing on every lookup. + fn read_layer_entries(&self) -> Result> { + let layers_dir = self.root_dir.open_dir("overlay-layers").map_err(|e| { + StorageError::Io(std::io::Error::new( + e.kind(), + format!("opening overlay-layers/: {e}"), + )) + })?; + let mut file = layers_dir.open("layers.json").map_err(|e| { + StorageError::Io(std::io::Error::new( + e.kind(), + format!("opening overlay-layers/layers.json: {e}"), + )) + })?; + let mut contents = String::new(); + file.read_to_string(&mut contents)?; + + serde_json::from_str(&contents) + .map_err(|e| StorageError::InvalidStorage(format!("Invalid layers.json: {}", e))) + } + + /// Resolve multiple diff-digests to storage layer IDs in a single pass. + /// + /// Parses `layers.json` once and looks up all diff_ids, avoiding the O(N×M) + /// overhead of calling [`resolve_diff_id()`] in a loop. + /// + /// Returns a `Vec>` with the same length as `diff_digests`, + /// where `Some(id)` means the diff-digest was found and `None` means it was not. + /// This allows callers to merge results across multiple stores without + /// short-circuiting on the first miss. + /// + /// # Errors + /// + /// Returns an error only if `layers.json` cannot be read or parsed. 
+ pub fn resolve_diff_ids(&self, diff_digests: &[String]) -> Result>> { + let entries = self.read_layer_entries()?; + + // Build a map from normalized diff-digest -> layer ID + let mut digest_to_id = std::collections::HashMap::with_capacity(entries.len()); + for entry in &entries { + if let Some(digest) = &entry.diff_digest { + digest_to_id.insert(digest.as_str(), entry.id.as_str()); + } + } + + Ok(diff_digests + .iter() + .map(|diff_digest| { + let normalized = if diff_digest.starts_with("sha256:") { + diff_digest.clone() + } else { + format!("sha256:{}", diff_digest) + }; + digest_to_id + .get(normalized.as_str()) + .map(|id| id.to_string()) + }) + .collect()) + } + + /// Resolve a diff-digest to a storage layer ID. + /// + /// # Errors + /// + /// Returns [`StorageError::LayerNotFound`] if no layer with the given diff-digest exists. + pub fn resolve_diff_id(&self, diff_digest: &str) -> Result { + self.resolve_diff_ids(&[diff_digest.to_string()])? + .into_iter() + .next() + .flatten() + .ok_or_else(|| StorageError::LayerNotFound(diff_digest.to_string())) + } + + /// Get layer metadata including size information. + /// + /// # Errors + /// + /// Returns an error if the layer is not found. + pub fn get_layer_metadata(&self, layer_id: &str) -> Result { + let entries = self.read_layer_entries()?; + + for entry in entries { + if entry.id == layer_id { + return Ok(LayerMetadata { + id: entry.id, + parent: entry.parent, + diff_size: entry.diff_size, + compressed_size: entry.compressed_size, + }); + } + } + + Err(StorageError::LayerNotFound(layer_id.to_string())) + } + + /// Calculate the total uncompressed size of an image. + /// + /// # Errors + /// + /// Returns an error if any layer metadata cannot be read. 
pub fn calculate_image_size(&self, image: &crate::image::Image) -> Result<u64> {
    let layers = self.get_image_layers(image)?;

    // Parse layers.json once and index it by layer ID, instead of re-reading
    // the file for every layer. The first entry for an ID is kept.
    let entries = self.read_layer_entries()?;
    let mut size_by_id = std::collections::HashMap::with_capacity(entries.len());
    for entry in &entries {
        size_by_id.entry(entry.id.as_str()).or_insert(entry.diff_size);
    }

    let mut total_size: u64 = 0;
    for layer in &layers {
        let diff_size = size_by_id
            .get(layer.id())
            .copied()
            .ok_or_else(|| StorageError::LayerNotFound(layer.id().to_string()))?;
        // Layers without a recorded diff size contribute nothing.
        if let Some(size) = diff_size {
            total_size = total_size.saturating_add(size);
        }
    }

    Ok(total_size)
}
}

use crate::image::ImageJsonEntry;

/// Entry in layers.json for layer ID lookups.
///
/// Field names map to kebab-case JSON keys (`diff-digest`, `diff-size`,
/// `compressed-size`).
#[derive(Debug, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
struct LayerEntry {
    /// Layer storage ID.
    id: String,
    /// Parent layer ID, absent for a base layer.
    parent: Option<String>,
    /// Digest of the uncompressed layer diff, e.g. `sha256:...`.
    diff_digest: Option<String>,
    /// Uncompressed diff size in bytes.
    diff_size: Option<u64>,
    /// Compressed size in bytes.
    // NOTE(review): integer width assumed to match `diff_size` — confirm.
    compressed_size: Option<u64>,
}

/// Metadata about a layer from layers.json.
#[derive(Debug, Clone)]
pub struct LayerMetadata {
    /// Layer storage ID.
    pub id: String,
    /// Parent layer ID (if not base layer).
    pub parent: Option<String>,
    /// Uncompressed diff size in bytes.
    pub diff_size: Option<u64>,
    /// Compressed size in bytes.
    // NOTE(review): integer width assumed to match `diff_size` — confirm.
    pub compressed_size: Option<u64>,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_default_search_paths() {
        // The system-wide location is always included, so the list is never empty.
        let paths = Storage::default_search_paths();
        assert!(!paths.is_empty(), "Should have at least one search path");
    }

    #[test]
    fn test_storage_validation() {
        // Create a mock storage directory structure for testing
        let dir = tempfile::tempdir().unwrap();
        let storage_path = dir.path();

        // Create required directories
        for required in ["overlay", "overlay-layers", "overlay-images"] {
            std::fs::create_dir_all(storage_path.join(required)).unwrap();
        }

        let storage = Storage::open(storage_path).unwrap();
        assert!(storage.root_dir().try_exists("overlay").unwrap());
    }

    /// Helper: create a mock overlay storage directory.
+ fn create_mock_storage(path: &Path) { + for d in ["overlay", "overlay-layers", "overlay-images"] { + std::fs::create_dir_all(path.join(d)).unwrap(); + } + } + + #[test] + fn test_parse_additional_image_stores() { + let dir = tempfile::tempdir().unwrap(); + let store_a = dir.path().join("a"); + let store_b = dir.path().join("b"); + create_mock_storage(&store_a); + create_mock_storage(&store_b); + + // Empty string returns empty + assert!(Storage::parse_additional_image_stores("").is_empty()); + + // Single store + let opts = format!("additionalimagestore={}", store_a.display()); + let stores = Storage::parse_additional_image_stores(&opts); + assert_eq!(stores.len(), 1); + + // Multiple stores (comma-separated) + let opts = format!( + "additionalimagestore={},additionalimagestore={}", + store_a.display(), + store_b.display() + ); + let stores = Storage::parse_additional_image_stores(&opts); + assert_eq!(stores.len(), 2); + + // Non-existent path is silently skipped + assert!( + Storage::parse_additional_image_stores("additionalimagestore=/no/such/path").is_empty() + ); + + // Unrelated options are ignored + assert!( + Storage::parse_additional_image_stores("overlay.mount_program=/usr/bin/fuse-overlayfs") + .is_empty() + ); + } +} diff --git a/crates/cstorage/src/tar_split.rs b/crates/cstorage/src/tar_split.rs new file mode 100644 index 00000000..5f6a418c --- /dev/null +++ b/crates/cstorage/src/tar_split.rs @@ -0,0 +1,711 @@ +//! Tar-split integration for reading container layers without full tar serialization. +//! +//! This module provides the `TarSplitFdStream` which reads tar-split metadata files +//! and returns file descriptors for the actual file content, enabling zero-copy +//! access to layer data. +//! +//! # Overview +//! +//! The tar-split format stores tar header metadata separately from file content, +//! allowing reconstruction of tar archives without duplicating the actual file data. +//! 
This implementation uses that metadata to provide file descriptors directly to +//! the files in the overlay diff directory. +//! +//! # Architecture +//! +//! The tar-split format is NDJSON (newline-delimited JSON), gzip-compressed: +//! - Type 1 (FileType): File/directory references with name, optional size, optional CRC64 +//! - Type 2 (SegmentType): Raw TAR header bytes and padding (base64-encoded) +//! - CRC64-ISO algorithm for checksums + +use std::io::{BufRead, BufReader, Read, Seek}; +use std::os::fd::OwnedFd; + +use base64::prelude::*; +use cap_std::fs::{Dir, File}; +use crc::{CRC_64_GO_ISO, Crc}; +use flate2::read::GzDecoder; +use serde::Deserialize; + +use crate::error::{Result, StorageError}; +use crate::layer::Layer; +use crate::storage::Storage; + +/// CRC64-ISO implementation for verifying file checksums. +const CRC64_ISO: Crc = Crc::::new(&CRC_64_GO_ISO); + +/// Item returned from tar-split stream iteration. +#[derive(Debug)] +pub enum TarSplitItem { + /// Raw segment bytes (TAR header + padding) to write directly. + Segment(Vec), + + /// File content to write. + FileContent { + /// File descriptor for reading the content. + /// + /// The caller takes ownership of this file descriptor and is responsible + /// for reading the content and closing it when done. + fd: OwnedFd, + /// Expected file size in bytes. + /// + /// Used for tar padding calculation: TAR files are padded to 512-byte + /// boundaries, so the consumer needs to know the size to write the + /// correct amount of padding after the file content. + size: u64, + /// File path from the tar-split entry. + /// + /// This is the path as recorded in the original tar archive + /// (e.g., "./etc/hosts"). + name: String, + }, +} + +/// Raw tar-split entry from NDJSON format before validation. +#[derive(Debug, Deserialize)] +struct TarSplitEntryRaw { + /// Entry type discriminant: 1 for File, 2 for Segment. + #[serde(rename = "type")] + type_id: u8, + /// File name from TAR header (type 1 only). 
+ #[serde(default)] + name: Option, + /// File size in bytes (type 1 only). + #[serde(default)] + size: Option, + /// CRC64-ISO checksum, base64-encoded (type 1 only). + #[serde(default)] + crc64: Option, + /// Base64-encoded TAR header bytes or padding (type 2 only). + #[serde(default)] + payload: Option, +} + +/// Tar-split entry from NDJSON format. +#[derive(Debug)] +enum TarSplitEntry { + /// File type entry: references a file/directory with metadata. + File { + /// File name from TAR header. + name: Option, + /// File size in bytes. + size: Option, + /// CRC64-ISO checksum (base64-encoded). + crc64: Option, + }, + /// Segment type entry: raw TAR header bytes and padding. + Segment { + /// Base64-encoded TAR header bytes (512 bytes) or padding. + payload: Option, + }, +} + +impl TarSplitEntry { + /// Parse a tar-split entry from raw format with validation. + fn from_raw(raw: TarSplitEntryRaw) -> Result { + match raw.type_id { + 1 => Ok(TarSplitEntry::File { + name: raw.name, + size: raw.size, + crc64: raw.crc64, + }), + 2 => Ok(TarSplitEntry::Segment { + payload: raw.payload, + }), + _ => Err(StorageError::TarSplitError(format!( + "Invalid tar-split entry type: {}", + raw.type_id + ))), + } + } +} + +/// Tar header information extracted from tar-split metadata. 
+#[derive(Debug, Clone)] +pub struct TarHeader { + /// File path in the tar archive (e.g., "./etc/hosts") + pub name: String, + + /// File mode (permissions and type information) + pub mode: u32, + + /// User ID of the file owner + pub uid: u32, + + /// Group ID of the file owner + pub gid: u32, + + /// File size in bytes + pub size: u64, + + /// Modification time (Unix timestamp) + pub mtime: i64, + + /// Tar entry type flag + pub typeflag: u8, + + /// Link target for symbolic links and hard links + pub linkname: String, + + /// User name of the file owner + pub uname: String, + + /// Group name of the file owner + pub gname: String, + + /// Major device number (for device files) + pub devmajor: u32, + + /// Minor device number (for device files) + pub devminor: u32, +} + +impl TarHeader { + /// Parse a TarHeader from a 512-byte TAR header block. + /// + /// # Errors + /// + /// Returns an error if the header is too short or has an invalid checksum. + pub fn from_bytes(header_bytes: &[u8]) -> Result { + let header_array: &[u8; tar_core::HEADER_SIZE] = header_bytes.try_into().map_err(|_| { + StorageError::TarSplitError(format!( + "TAR header wrong size: {} bytes (expected {})", + header_bytes.len(), + tar_core::HEADER_SIZE + )) + })?; + let header = tar_core::Header::from_bytes(header_array); + + let name = String::from_utf8(header.path_bytes().to_vec()).map_err(|e| { + StorageError::TarSplitError(format!("Non-UTF-8 path in TAR header: {}", e)) + })?; + let mode = header + .mode() + .map_err(|e| StorageError::TarSplitError(format!("Invalid mode: {}", e)))?; + let uid = header + .uid() + .map_err(|e| StorageError::TarSplitError(format!("Invalid uid: {}", e)))? + as u32; + let gid = header + .gid() + .map_err(|e| StorageError::TarSplitError(format!("Invalid gid: {}", e)))? 
+ as u32; + let size = header + .entry_size() + .map_err(|e| StorageError::TarSplitError(format!("Invalid size: {}", e)))?; + let mtime = header + .mtime() + .map_err(|e| StorageError::TarSplitError(format!("Invalid mtime: {}", e)))? + as i64; + let typeflag = header.entry_type().as_byte(); + let link_bytes = header.link_name_bytes(); + let linkname = if link_bytes.is_empty() { + String::new() + } else { + String::from_utf8(link_bytes.to_vec()).map_err(|e| { + StorageError::TarSplitError(format!("Non-UTF-8 link name in TAR header: {}", e)) + })? + }; + let uname = header + .username() + .map(|b| { + String::from_utf8(b.to_vec()).map_err(|e| { + StorageError::TarSplitError(format!("Non-UTF-8 username in TAR header: {}", e)) + }) + }) + .transpose()? + .unwrap_or_default(); + let gname = header + .groupname() + .map(|b| { + String::from_utf8(b.to_vec()).map_err(|e| { + StorageError::TarSplitError(format!( + "Non-UTF-8 group name in TAR header: {}", + e + )) + }) + }) + .transpose()? + .unwrap_or_default(); + let devmajor = header + .device_major() + .map_err(|e| StorageError::TarSplitError(format!("Invalid devmajor: {}", e)))? + .unwrap_or(0); + let devminor = header + .device_minor() + .map_err(|e| StorageError::TarSplitError(format!("Invalid devminor: {}", e)))? + .unwrap_or(0); + + Ok(TarHeader { + name, + mode, + uid, + gid, + size, + mtime, + typeflag, + linkname, + uname, + gname, + devmajor, + devminor, + }) + } + + /// Check if this header represents a regular file. + pub fn is_regular_file(&self) -> bool { + self.typeflag == b'0' || self.typeflag == b'\0' + } + + /// Check if this header represents a directory. + pub fn is_directory(&self) -> bool { + self.typeflag == b'5' + } + + /// Check if this header represents a symbolic link. + pub fn is_symlink(&self) -> bool { + self.typeflag == b'2' + } + + /// Check if this header represents a hard link. 
+ pub fn is_hardlink(&self) -> bool { + self.typeflag == b'1' + } + + /// Normalize the path by stripping leading "./" + pub fn normalized_name(&self) -> &str { + self.name.strip_prefix("./").unwrap_or(&self.name) + } +} + +/// Stream that reads tar-split metadata and provides file descriptors for file content. +#[derive(Debug)] +pub struct TarSplitFdStream { + /// The current layer for file lookups. + layer: Layer, + + /// Storage root directory for accessing parent layers on-demand. + storage_root: Dir, + + /// Gzip decompressor reading from the tar-split file. + reader: BufReader>, + + /// Entry counter for debugging and error messages. + entry_count: usize, +} + +impl TarSplitFdStream { + /// Create a new tar-split stream for a layer. + /// + /// # Errors + /// + /// Returns an error if the tar-split file doesn't exist or cannot be opened. + pub fn new(storage: &Storage, layer: &Layer) -> Result { + // Open overlay-layers directory via Dir handle + let layers_dir = storage.root_dir().open_dir("overlay-layers").map_err(|e| { + StorageError::TarSplitError(format!("Failed to open overlay-layers directory: {}", e)) + })?; + + // Open tar-split file relative to layers directory + let filename = format!("{}.tar-split.gz", layer.id()); + let file = layers_dir.open(&filename).map_err(|e| { + StorageError::TarSplitError(format!( + "Failed to open tar-split file {}: {}", + filename, e + )) + })?; + + // Wrap in gzip decompressor + let gz_decoder = GzDecoder::new(file); + let reader = BufReader::new(gz_decoder); + + // Open the layer for on-demand file lookups + let layer = Layer::open(storage, layer.id())?; + + // Clone storage root dir for on-demand parent layer access + let storage_root = storage.root_dir().try_clone()?; + + Ok(Self { + layer, + storage_root, + reader, + entry_count: 0, + }) + } + + /// Open a file in the layer chain, trying current layer first then parents. 
+ fn open_file_in_chain(&self, path: &str) -> Result { + // Normalize path (remove leading ./) + let normalized_path = path.strip_prefix("./").unwrap_or(path); + + // Try to open in current layer first + match self.layer.diff_dir().open(normalized_path) { + Ok(file) => return Ok(file), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + // Continue to search parent layers + } + Err(e) => return Err(StorageError::Io(e)), + } + + // Search parent layers on-demand + self.search_parent_layers(&self.layer, normalized_path, 0) + } + + /// Recursively search parent layers for a file. + fn search_parent_layers( + &self, + current_layer: &Layer, + path: &str, + depth: usize, + ) -> Result { + const MAX_DEPTH: usize = 500; + + if depth >= MAX_DEPTH { + return Err(StorageError::TarSplitError(format!( + "Layer chain exceeds maximum depth of {} while searching for file: {}", + MAX_DEPTH, path + ))); + } + + // Get parent link IDs + let parent_links = current_layer.parent_links(); + + // Try each parent + for link_id in parent_links { + // Resolve link ID to layer ID by reading the symlink directly + let parent_id = self.resolve_link_direct(link_id)?; + + // Try to open file directly in parent's diff directory + match self.open_file_in_layer(&parent_id, path) { + Ok(file) => return Ok(file), + Err(StorageError::Io(e)) if e.kind() == std::io::ErrorKind::NotFound => { + // File not in this parent, recursively search its parents + match self.search_by_layer_id(&parent_id, path, depth + 1) { + Ok(file) => return Ok(file), + Err(StorageError::TarSplitError(_)) => continue, // File not found in this branch, try next parent + Err(e) => return Err(e), + } + } + Err(e) => return Err(e), + } + } + + Err(StorageError::TarSplitError(format!( + "File not found in layer chain: {}", + path + ))) + } + + /// Search for a file starting from a layer ID. 
+ fn search_by_layer_id( + &self, + layer_id: &str, + path: &str, + depth: usize, + ) -> Result { + const MAX_DEPTH: usize = 500; + + if depth >= MAX_DEPTH { + return Err(StorageError::TarSplitError(format!( + "Layer chain exceeds maximum depth of {} while searching for file: {}", + MAX_DEPTH, path + ))); + } + + // Try to open file in this layer + match self.open_file_in_layer(layer_id, path) { + Ok(file) => return Ok(file), + Err(StorageError::Io(e)) if e.kind() == std::io::ErrorKind::NotFound => { + // File not found, check parents + } + Err(e) => return Err(e), + } + + // Read parent links for this layer + let parent_links = self.read_layer_parent_links(layer_id)?; + + // Try each parent + for link_id in parent_links { + let parent_id = self.resolve_link_direct(&link_id)?; + match self.search_by_layer_id(&parent_id, path, depth + 1) { + Ok(file) => return Ok(file), + Err(StorageError::TarSplitError(_)) => continue, // File not found in this branch, try next parent + Err(e) => return Err(e), + } + } + + Err(StorageError::TarSplitError(format!( + "File not found in layer chain: {}", + path + ))) + } + + /// Resolve a link ID to layer ID by directly reading the symlink. + fn resolve_link_direct(&self, link_id: &str) -> Result { + let overlay_dir = self.storage_root.open_dir("overlay")?; + let link_dir = overlay_dir.open_dir("l")?; + let target = link_dir.read_link(link_id).map_err(|e| { + StorageError::LinkReadError(format!("Failed to read link {}: {}", link_id, e)) + })?; + + // Extract layer ID from symlink target (format: ..//diff) + let target_str = target.to_str().ok_or_else(|| { + StorageError::LinkReadError("Invalid UTF-8 in link target".to_string()) + })?; + let components: Vec<&str> = target_str.split('/').collect(); + if components.len() >= 2 { + let layer_id = components[components.len() - 2]; + if !layer_id.is_empty() && layer_id != ".." 
{ + return Ok(layer_id.to_string()); + } + } + Err(StorageError::LinkReadError(format!( + "Invalid link target format: {}", + target_str + ))) + } + + /// Open a file in a specific layer's diff directory. + fn open_file_in_layer(&self, layer_id: &str, path: &str) -> Result { + let overlay_dir = self.storage_root.open_dir("overlay")?; + let layer_dir = overlay_dir.open_dir(layer_id)?; + let diff_dir = layer_dir.open_dir("diff")?; + diff_dir.open(path).map_err(StorageError::Io) + } + + /// Read parent link IDs from a layer's lower file. + fn read_layer_parent_links(&self, layer_id: &str) -> Result> { + let overlay_dir = self.storage_root.open_dir("overlay")?; + let layer_dir = overlay_dir.open_dir(layer_id)?; + + match layer_dir.read_to_string("lower") { + Ok(content) => Ok(content + .trim() + .split(':') + .filter_map(|s| s.strip_prefix("l/")) + .map(|s| s.to_string()) + .collect()), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Vec::new()), // Base layer has no lower file + Err(e) => Err(StorageError::Io(e)), + } + } + + /// Verify CRC64-ISO checksum of a file. 
+ fn verify_crc64( + &self, + file: &mut cap_std::fs::File, + expected_b64: &str, + size: u64, + ) -> Result<()> { + // Decode base64 checksum + let expected_bytes = BASE64_STANDARD.decode(expected_b64).map_err(|e| { + StorageError::TarSplitError(format!("Failed to decode base64 CRC64: {}", e)) + })?; + + if expected_bytes.len() != 8 { + return Err(StorageError::TarSplitError(format!( + "Invalid CRC64 length: {} bytes", + expected_bytes.len() + ))); + } + + // Convert to u64 (big-endian) + let expected = u64::from_be_bytes(expected_bytes.try_into().unwrap()); + + // Compute CRC64 of file content + let mut digest = CRC64_ISO.digest(); + let mut buffer = vec![0u8; 8192]; + let mut bytes_read = 0u64; + + loop { + let n = file.read(&mut buffer).map_err(|e| { + StorageError::TarSplitError(format!( + "Failed to read file for CRC64 verification: {}", + e + )) + })?; + if n == 0 { + break; + } + digest.update(&buffer[..n]); + bytes_read += n as u64; + } + + // Verify size matches + if bytes_read != size { + return Err(StorageError::TarSplitError(format!( + "File size mismatch: expected {}, got {}", + size, bytes_read + ))); + } + + let computed = digest.finalize(); + if computed != expected { + return Err(StorageError::TarSplitError(format!( + "CRC64 mismatch: expected {:016x}, got {:016x}", + expected, computed + ))); + } + + Ok(()) + } + + /// Read the next item from the tar-split stream. 
+ /// + /// Returns: + /// - `Ok(Some(item))` - Next item was read successfully + /// - `Ok(None)` - End of stream reached + /// - `Err(...)` - Error occurred during reading + #[allow(clippy::should_implement_trait)] + pub fn next(&mut self) -> Result> { + loop { + // Read next line from NDJSON stream + let mut line = String::new(); + match self.reader.read_line(&mut line) { + Ok(0) => { + return Ok(None); + } + Ok(_) => { + // Parse NDJSON entry + let raw: TarSplitEntryRaw = serde_json::from_str(&line).map_err(|e| { + StorageError::TarSplitError(format!( + "Failed to parse tar-split entry: {}", + e + )) + })?; + let entry = TarSplitEntry::from_raw(raw)?; + + match entry { + TarSplitEntry::Segment { payload } => { + if let Some(payload_b64) = payload { + let payload_bytes = + BASE64_STANDARD.decode(&payload_b64).map_err(|e| { + StorageError::TarSplitError(format!( + "Failed to decode base64 payload: {}", + e + )) + })?; + + return Ok(Some(TarSplitItem::Segment(payload_bytes))); + } + // Empty segment, continue + } + + TarSplitEntry::File { name, size, crc64 } => { + self.entry_count += 1; + + // Check if this file has content to write + let file_size = size.unwrap_or(0); + if file_size > 0 { + // Regular file with content - open it + let path = name.as_ref().ok_or_else(|| { + StorageError::TarSplitError( + "FileType entry missing name".to_string(), + ) + })?; + + let mut file = self.open_file_in_chain(path)?; + + // Verify CRC64 if provided + if let Some(ref crc64_b64) = crc64 { + self.verify_crc64(&mut file, crc64_b64, file_size)?; + + // Seek back to start after CRC verification consumed the file + file.rewind().map_err(StorageError::Io)?; + } + + // Convert to OwnedFd and return + let std_file = file.into_std(); + let owned_fd: OwnedFd = std_file.into(); + return Ok(Some(TarSplitItem::FileContent { + fd: owned_fd, + size: file_size, + name: path.clone(), + })); + } + // Empty file or directory - header already in preceding Segment + } + } + } + Err(e) => { + 
return Err(StorageError::TarSplitError(format!( + "Failed to read tar-split line: {}", + e + ))); + } + } + } + } + + /// Get the number of entries processed so far. + pub fn entry_count(&self) -> usize { + self.entry_count + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_tar_header_type_checks() { + let mut header = TarHeader { + name: "test.txt".to_string(), + mode: 0o644, + uid: 1000, + gid: 1000, + size: 100, + mtime: 0, + typeflag: b'0', + linkname: String::new(), + uname: "user".to_string(), + gname: "group".to_string(), + devmajor: 0, + devminor: 0, + }; + + assert!(header.is_regular_file()); + assert!(!header.is_directory()); + assert!(!header.is_symlink()); + + header.typeflag = b'5'; + assert!(!header.is_regular_file()); + assert!(header.is_directory()); + + header.typeflag = b'2'; + assert!(header.is_symlink()); + } + + #[test] + fn test_tar_split_entry_deserialization() { + // Test type 2 (Segment) with integer discriminant + let json_segment = r#"{"type":2,"payload":"dXN0YXIAMDA="}"#; + let raw: TarSplitEntryRaw = serde_json::from_str(json_segment).unwrap(); + let entry = TarSplitEntry::from_raw(raw).unwrap(); + match entry { + TarSplitEntry::Segment { payload } => { + assert_eq!(payload, Some("dXN0YXIAMDA=".to_string())); + } + _ => panic!("Expected Segment variant"), + } + + // Test type 1 (File) with integer discriminant + let json_file = r#"{"type":1,"name":"./etc/hosts","size":123,"crc64":"AAAAAAAAAA=="}"#; + let raw: TarSplitEntryRaw = serde_json::from_str(json_file).unwrap(); + let entry = TarSplitEntry::from_raw(raw).unwrap(); + match entry { + TarSplitEntry::File { name, size, crc64 } => { + assert_eq!(name, Some("./etc/hosts".to_string())); + assert_eq!(size, Some(123)); + assert_eq!(crc64, Some("AAAAAAAAAA==".to_string())); + } + _ => panic!("Expected File variant"), + } + + // Test invalid type + let json_invalid = r#"{"type":99}"#; + let raw: TarSplitEntryRaw = serde_json::from_str(json_invalid).unwrap(); + let 
result = TarSplitEntry::from_raw(raw); + assert!(result.is_err()); + } +} diff --git a/crates/cstorage/src/userns.rs b/crates/cstorage/src/userns.rs new file mode 100644 index 00000000..ab44d03c --- /dev/null +++ b/crates/cstorage/src/userns.rs @@ -0,0 +1,67 @@ +//! User namespace utilities for rootless containers-storage access. +//! +//! This module provides utilities for determining when user namespace entry is +//! needed to access overlay storage files that are owned by remapped UIDs/GIDs. +//! +//! # Background +//! +//! When podman runs rootless, it uses user namespaces to remap UIDs. Files in +//! the overlay storage are owned by these remapped UIDs (e.g., UID 100000+N on +//! the host corresponds to UID N inside the container). These files also retain +//! their original permission bits from the container image. +//! +//! Files with restrictive permissions (e.g., `/etc/shadow` with mode 0600) are +//! only readable by their owner - a remapped UID we cannot access as an +//! unprivileged user. +//! +//! # Solution +//! +//! Rather than manually setting up user namespaces (parsing `/etc/subuid`, +//! calling `newuidmap`/`newgidmap`, etc.), we delegate to `podman unshare` +//! which handles all the edge cases. See [`crate::userns_helper`] for the +//! helper process that runs inside the user namespace. + +use rustix::process::getuid; +use rustix::thread::{CapabilitySet, capabilities}; + +/// Check if the current process can read arbitrary files regardless of permissions. +/// +/// This returns `true` if: +/// - The process is running as real root (UID 0), or +/// - The process has `CAP_DAC_OVERRIDE` in its effective capability set +/// +/// When this returns `true`, there's no need to spawn a userns helper for +/// file access - the process can already read any file in the storage. 
+pub fn can_bypass_file_permissions() -> bool { + // Real root can read anything + if getuid().is_root() { + return true; + } + + // Check for CAP_DAC_OVERRIDE capability + if let Ok(caps) = capabilities(None) + && caps.effective.contains(CapabilitySet::DAC_OVERRIDE) + { + return true; + } + + false +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_can_bypass_file_permissions() { + // This function should not panic and should return a consistent result + let result1 = can_bypass_file_permissions(); + let result2 = can_bypass_file_permissions(); + assert_eq!(result1, result2); + + // If we're root, it should return true + if getuid().is_root() { + assert!(result1, "root should be able to bypass permissions"); + } + } +} diff --git a/crates/cstorage/src/userns_helper.rs b/crates/cstorage/src/userns_helper.rs new file mode 100644 index 00000000..339e1293 --- /dev/null +++ b/crates/cstorage/src/userns_helper.rs @@ -0,0 +1,1086 @@ +//! User namespace helper process for privileged storage access. +//! +//! This module provides a mechanism for unprivileged processes to access +//! containers-storage content that has restrictive permissions. It works by +//! spawning a helper process inside a user namespace (via `podman unshare`) +//! that can read any file, and communicating with it via JSON-RPC over a +//! Unix socket with fd-passing. +//! +//! # Why This Is Needed +//! +//! Container images contain files with various permission bits (e.g., `/etc/shadow` +//! with mode 0600). When stored in rootless containers-storage, these files are +//! owned by remapped UIDs that the unprivileged user cannot access. Even though +//! we have tar-split metadata telling us the file structure, we still need to +//! read the actual file content. +//! +//! # Architecture +//! +//! The helper uses stdin (fd 0) for IPC, avoiding the need for unsafe code: +//! +//! ```text +//! ┌─────────────────────────────────────┐ +//! │ Parent Process │ +//! 
│ (unprivileged, library user) │ +//! │ │ +//! │ StorageProxy::spawn() │ +//! │ │ │ +//! │ ├─► Create socketpair │ +//! │ ├─► Spawn: podman unshare │ +//! │ │ /proc/self/exe │ +//! │ │ (child's stdin=socket) │ +//! │ │ │ +//! │ proxy.stream_layer() ───────────► │ +//! │ │ │ +//! │ ◄─── receives OwnedFd via SCM_RIGHTS│ +//! └─────────────────────────────────────┘ +//! ``` +//! +//! # Usage +//! +//! Library users must call [`init_if_helper`] early in their `main()` function: +//! +//! ```no_run +//! // This must be called before any other cstorage operations. +//! // If this process was spawned as a userns helper, it will +//! // serve requests and exit, never returning. +//! cstorage::userns_helper::init_if_helper(); +//! +//! // Normal application code continues here... +//! ``` + +use std::os::fd::AsFd; +use std::os::unix::io::OwnedFd; +use std::os::unix::net::UnixStream as StdUnixStream; +use std::path::Path; +use std::process::{Child, Command, Stdio}; + +use base64::prelude::*; +use jsonrpc_fdpass::transport::UnixSocketTransport; +use jsonrpc_fdpass::{JsonRpcMessage, JsonRpcRequest, JsonRpcResponse, MessageWithFds}; +use rustix::io::dup; +use rustix::process::{Signal, set_parent_process_death_signal}; +use serde::{Deserialize, Serialize}; +use tokio::net::UnixStream as TokioUnixStream; + +use crate::layer::Layer; +use crate::storage::Storage; +use crate::tar_split::{TarSplitFdStream, TarSplitItem}; +use crate::userns::can_bypass_file_permissions; + +/// Environment variable that indicates this process is a userns helper. +const HELPER_ENV: &str = "__CSTORAGE_USERNS_HELPER"; + +/// JSON-RPC 2.0 error codes. +/// +/// These codes follow the JSON-RPC 2.0 specification: +/// - Standard errors: -32700 to -32600 +/// - Server errors: -32099 to -32000 (implementation-defined) +mod error_codes { + /// Invalid params - the params passed to a method are invalid. + pub const INVALID_PARAMS: i32 = -32602; + + /// Method not found - the requested method does not exist. 
+ pub const METHOD_NOT_FOUND: i32 = -32601; + + /// Resource not found - the requested resource (image, layer, etc.) was not found. + pub const RESOURCE_NOT_FOUND: i32 = -32000; + + /// Internal error - a server-side error occurred (I/O, storage access, etc.). + pub const INTERNAL_ERROR: i32 = -32003; +} + +/// JSON-RPC method names. +mod methods { + /// Open a file and return its fd. + pub const OPEN_FILE: &str = "userns.openFile"; + /// Shutdown the helper process. + pub const SHUTDOWN: &str = "userns.shutdown"; + /// List images in storage. + pub const LIST_IMAGES: &str = "userns.listImages"; + /// Get image metadata. + pub const GET_IMAGE: &str = "userns.getImage"; + /// Stream layer as tar-split entries with fds. + pub const STREAM_LAYER: &str = "userns.streamLayer"; +} + +/// Parameters for the open_file method. +#[derive(Debug, Serialize, Deserialize)] +pub struct OpenFileParams { + /// Path to open. + pub path: String, +} + +/// Result for the open_file method. +#[derive(Debug, Serialize, Deserialize)] +pub struct OpenFileResult { + /// True if successful (fd is passed out-of-band). + pub success: bool, +} + +/// Parameters for list_images method. +#[derive(Debug, Serialize, Deserialize)] +pub struct ListImagesParams { + /// Storage root path. + pub storage_path: String, +} + +/// Image info returned by list_images. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ImageInfo { + /// Image ID. + pub id: String, + /// Image names/tags. + pub names: Vec, +} + +/// Result for list_images method. +#[derive(Debug, Serialize, Deserialize)] +pub struct ListImagesResult { + /// List of images. + pub images: Vec, +} + +/// Parameters for get_image method. +#[derive(Debug, Serialize, Deserialize)] +pub struct GetImageParams { + /// Storage root path. + pub storage_path: String, + /// Image ID or name. + pub image_ref: String, +} + +/// Result for get_image method. +#[derive(Debug, Serialize, Deserialize)] +pub struct GetImageResult { + /// Image ID. 
+ pub id: String, + /// Image names. + pub names: Vec, + /// Layer diff IDs (sha256:...). + pub layer_diff_ids: Vec, + /// Storage layer IDs (internal IDs used by containers-storage). + pub storage_layer_ids: Vec, +} + +/// Parameters for stream_layer method. +#[derive(Debug, Serialize, Deserialize)] +pub struct StreamLayerParams { + /// Storage root path. + pub storage_path: String, + /// Layer ID (storage layer ID, not diff ID). + pub layer_id: String, +} + +/// Streaming notification for a segment. +#[derive(Debug, Serialize, Deserialize)] +pub struct StreamSegmentNotification { + /// Base64-encoded segment data. + pub data: String, +} + +/// Streaming notification for a file (fd is passed out-of-band). +#[derive(Debug, Serialize, Deserialize)] +pub struct StreamFileNotification { + /// File path in the tar. + pub name: String, + /// File size. + pub size: u64, +} + +/// Result for stream_layer method (sent after all notifications). +#[derive(Debug, Serialize, Deserialize)] +pub struct StreamLayerResult { + /// Number of items streamed. + pub items_sent: usize, +} + +/// Error type for userns helper operations. +#[derive(Debug, thiserror::Error)] +pub enum HelperError { + /// Failed to create socket. + #[error("failed to create socket: {0}")] + Socket(#[source] std::io::Error), + + /// Failed to spawn helper process. + #[error("failed to spawn helper process: {0}")] + Spawn(#[source] std::io::Error), + + /// IPC error. + #[error("IPC error: {0}")] + Ipc(String), + + /// Helper returned an error. + #[error("helper error: {0}")] + HelperError(String), + + /// I/O error. + #[error("I/O error: {0}")] + Io(#[from] std::io::Error), + + /// JSON-RPC error from the helper. + #[error("RPC error: code={code}, message={message}")] + RpcError { + /// JSON-RPC error code. + code: i32, + /// Error message. + message: String, + }, +} + +/// Check if this process was spawned as a userns helper and run the helper loop if so. 
+/// +/// This function **must** be called early in `main()`, before any other cstorage +/// operations. If this process was spawned as a helper, this function will: +/// +/// 1. Read from stdin (which is a Unix socket from the parent) +/// 2. Serve JSON-RPC requests for file operations +/// 3. Exit when the parent closes the connection +/// +/// If this is not a helper process, this function returns immediately. +pub fn init_if_helper() { + // Check if we're a helper via environment variable + if std::env::var(HELPER_ENV).is_err() { + return; // Not a helper, continue normal execution + } + + // Ensure we exit if parent dies (avoids orphan helper processes) + if let Err(e) = set_parent_process_death_signal(Some(Signal::TERM)) { + eprintln!("cstorage helper: failed to set parent death signal: {}", e); + // Continue anyway - this is a nice-to-have, not critical + } + + // We're a helper - stdin is our IPC socket. + // Use dup() to get a new owned fd from stdin (fd 0). + // This is safe because: + // 1. We were spawned with stdin set to a socket + // 2. dup() gives us a new fd that we own + // 3. We use std::io::stdin().as_fd() which is the safe way to get the fd + let stdin_fd = match dup(std::io::stdin().as_fd()) { + Ok(fd) => fd, + Err(e) => { + eprintln!("cstorage helper: failed to dup stdin: {}", e); + std::process::exit(1); + } + }; + let std_socket = StdUnixStream::from(stdin_fd); + + // Run the helper loop (never returns on success) + if let Err(e) = run_helper_loop_blocking(std_socket) { + eprintln!("cstorage helper: error in helper loop: {}", e); + std::process::exit(1); + } + std::process::exit(0); +} + +/// Run the helper loop synchronously by creating a tokio runtime. 
+fn run_helper_loop_blocking(std_socket: StdUnixStream) -> std::result::Result<(), HelperError> { + // Set non-blocking for tokio + std_socket.set_nonblocking(true)?; + + // Create a tokio runtime for the helper + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .map_err(|e| HelperError::Ipc(format!("failed to create tokio runtime: {}", e)))?; + + rt.block_on(run_helper_loop_async(std_socket)) +} + +/// Run the helper loop, serving requests from the parent. +async fn run_helper_loop_async(std_socket: StdUnixStream) -> std::result::Result<(), HelperError> { + // Convert std socket to tokio socket + let tokio_socket = TokioUnixStream::from_std(std_socket) + .map_err(|e| HelperError::Ipc(format!("failed to convert socket: {}", e)))?; + + let transport = UnixSocketTransport::new(tokio_socket); + let (mut sender, mut receiver) = transport.split(); + + tracing::debug!("userns helper: starting request loop"); + + loop { + let msg_with_fds = match receiver.receive().await { + Ok(m) => m, + Err(jsonrpc_fdpass::Error::ConnectionClosed) => { + tracing::debug!("userns helper: connection closed"); + return Ok(()); + } + Err(e) => { + return Err(HelperError::Ipc(format!( + "failed to receive message: {}", + e + ))); + } + }; + + match msg_with_fds.message { + JsonRpcMessage::Request(request) => { + let id = request.id.clone(); + + // Handle stream_layer specially since it needs to send multiple messages + if request.method == methods::STREAM_LAYER { + if let Err((code, msg)) = handle_stream_layer(&request, &mut sender).await { + let error = jsonrpc_fdpass::JsonRpcError::owned(code, msg, None::<()>); + let response = JsonRpcResponse::error(error, id); + let message = + MessageWithFds::new(JsonRpcMessage::Response(response), vec![]); + sender.send(message).await.map_err(|e| { + HelperError::Ipc(format!("failed to send error response: {}", e)) + })?; + } + // Success response is sent by handle_stream_layer + continue; + } + + let (result, fds) = 
handle_request(&request); + + match result { + Ok(response_value) => { + let response = JsonRpcResponse::success(response_value, id); + let message = MessageWithFds::new(JsonRpcMessage::Response(response), fds); + sender.send(message).await.map_err(|e| { + HelperError::Ipc(format!("failed to send response: {}", e)) + })?; + } + Err((code, message_str)) => { + let error = + jsonrpc_fdpass::JsonRpcError::owned(code, message_str, None::<()>); + let response = JsonRpcResponse::error(error, id); + let message = + MessageWithFds::new(JsonRpcMessage::Response(response), vec![]); + sender.send(message).await.map_err(|e| { + HelperError::Ipc(format!("failed to send error response: {}", e)) + })?; + } + } + + // Check for shutdown request (handle after sending response) + if request.method == methods::SHUTDOWN { + tracing::debug!("userns helper: received shutdown request"); + return Ok(()); + } + } + JsonRpcMessage::Notification(notif) => { + if notif.method == methods::SHUTDOWN { + tracing::debug!("userns helper: received shutdown notification"); + return Ok(()); + } + // Ignore other notifications + } + JsonRpcMessage::Response(_) => { + // Unexpected response - ignore + } + } + } +} + +/// Handle stream_layer request - sends multiple notifications with fds. 
+async fn handle_stream_layer( + request: &JsonRpcRequest, + sender: &mut jsonrpc_fdpass::transport::Sender, +) -> std::result::Result<(), (i32, String)> { + let params: StreamLayerParams = request + .params + .as_ref() + .and_then(|p| serde_json::from_value(p.clone()).ok()) + .ok_or(( + error_codes::INVALID_PARAMS, + "invalid params for streamLayer".to_string(), + ))?; + + let storage = Storage::open(¶ms.storage_path).map_err(|e| { + ( + error_codes::INTERNAL_ERROR, + format!("failed to open storage: {}", e), + ) + })?; + + let layer = Layer::open(&storage, ¶ms.layer_id).map_err(|e| { + ( + error_codes::RESOURCE_NOT_FOUND, + format!("layer not found: {}", e), + ) + })?; + + let mut stream = TarSplitFdStream::new(&storage, &layer).map_err(|e| { + ( + error_codes::INTERNAL_ERROR, + format!("failed to create tar-split stream: {}", e), + ) + })?; + + let mut items_sent = 0usize; + + // Stream all items as notifications + while let Some(item) = stream + .next() + .map_err(|e| (error_codes::INTERNAL_ERROR, format!("stream error: {}", e)))? 
+ { + match item { + TarSplitItem::Segment(bytes) => { + // Send segment as base64-encoded notification + let params = StreamSegmentNotification { + data: BASE64_STANDARD.encode(&bytes), + }; + let notif = jsonrpc_fdpass::JsonRpcNotification::new( + "stream.segment".to_string(), + Some(serde_json::to_value(¶ms).unwrap()), + ); + let message = MessageWithFds::new(JsonRpcMessage::Notification(notif), vec![]); + sender.send(message).await.map_err(|e| { + ( + error_codes::INTERNAL_ERROR, + format!("failed to send segment: {}", e), + ) + })?; + items_sent += 1; + } + TarSplitItem::FileContent { fd, size, name } => { + // Send file notification with fd + let params = StreamFileNotification { name, size }; + let notif = jsonrpc_fdpass::JsonRpcNotification::new( + "stream.file".to_string(), + Some(serde_json::to_value(¶ms).unwrap()), + ); + let message = MessageWithFds::new(JsonRpcMessage::Notification(notif), vec![fd]); + sender.send(message).await.map_err(|e| { + ( + error_codes::INTERNAL_ERROR, + format!("failed to send file: {}", e), + ) + })?; + items_sent += 1; + } + } + } + + // Send success response + let result = StreamLayerResult { items_sent }; + let response = + JsonRpcResponse::success(serde_json::to_value(result).unwrap(), request.id.clone()); + let message = MessageWithFds::new(JsonRpcMessage::Response(response), vec![]); + sender.send(message).await.map_err(|e| { + ( + error_codes::INTERNAL_ERROR, + format!("failed to send response: {}", e), + ) + })?; + + Ok(()) +} + +/// Handle a JSON-RPC request. 
+fn handle_request( + request: &JsonRpcRequest, +) -> ( + std::result::Result, + Vec, +) { + match request.method.as_str() { + methods::OPEN_FILE => { + let params: OpenFileParams = match request + .params + .as_ref() + .and_then(|p| serde_json::from_value(p.clone()).ok()) + { + Some(p) => p, + None => { + return ( + Err(( + error_codes::INVALID_PARAMS, + "invalid params: missing 'path' field".to_string(), + )), + vec![], + ); + } + }; + + match std::fs::File::open(¶ms.path) { + Ok(file) => { + let fd: OwnedFd = file.into(); + let result = OpenFileResult { success: true }; + (Ok(serde_json::to_value(result).unwrap()), vec![fd]) + } + Err(e) => ( + Err(( + error_codes::INTERNAL_ERROR, + format!("failed to open file: {}", e), + )), + vec![], + ), + } + } + methods::LIST_IMAGES => handle_list_images(request), + methods::GET_IMAGE => handle_get_image(request), + methods::SHUTDOWN => { + // Just return success - the loop will exit after sending the response + (Ok(serde_json::json!({"success": true})), vec![]) + } + _ => ( + Err(( + error_codes::METHOD_NOT_FOUND, + format!("method not found: {}", request.method), + )), + vec![], + ), + } +} + +/// Handle list_images request. 
+fn handle_list_images( + request: &JsonRpcRequest, +) -> ( + std::result::Result, + Vec, +) { + let params: ListImagesParams = match request + .params + .as_ref() + .and_then(|p| serde_json::from_value(p.clone()).ok()) + { + Some(p) => p, + None => { + return ( + Err(( + error_codes::INVALID_PARAMS, + "invalid params for listImages".to_string(), + )), + vec![], + ); + } + }; + + let storage = match Storage::open(¶ms.storage_path) { + Ok(s) => s, + Err(e) => { + return ( + Err(( + error_codes::INTERNAL_ERROR, + format!("failed to open storage: {}", e), + )), + vec![], + ); + } + }; + + let images = match storage.list_images() { + Ok(imgs) => imgs, + Err(e) => { + return ( + Err(( + error_codes::INTERNAL_ERROR, + format!("failed to list images: {}", e), + )), + vec![], + ); + } + }; + + let image_infos: Vec = images + .iter() + .map(|img| ImageInfo { + id: img.id().to_string(), + names: img.names(&storage).unwrap_or_default(), + }) + .collect(); + + let result = ListImagesResult { + images: image_infos, + }; + (Ok(serde_json::to_value(result).unwrap()), vec![]) +} + +/// Handle get_image request. 
+fn handle_get_image( + request: &JsonRpcRequest, +) -> ( + std::result::Result, + Vec, +) { + let params: GetImageParams = match request + .params + .as_ref() + .and_then(|p| serde_json::from_value(p.clone()).ok()) + { + Some(p) => p, + None => { + return ( + Err(( + error_codes::INVALID_PARAMS, + "invalid params for getImage".to_string(), + )), + vec![], + ); + } + }; + + let storage = match Storage::open(¶ms.storage_path) { + Ok(s) => s, + Err(e) => { + return ( + Err(( + error_codes::INTERNAL_ERROR, + format!("failed to open storage: {}", e), + )), + vec![], + ); + } + }; + + // Try by ID first, then by name + let image = match crate::image::Image::open(&storage, ¶ms.image_ref) { + Ok(img) => img, + Err(_) => match storage.find_image_by_name(¶ms.image_ref) { + Ok(img) => img, + Err(e) => { + return ( + Err(( + error_codes::RESOURCE_NOT_FOUND, + format!("image not found: {}", e), + )), + vec![], + ); + } + }, + }; + + let config = match image.config() { + Ok(cfg) => cfg, + Err(e) => { + return ( + Err(( + error_codes::INTERNAL_ERROR, + format!("failed to read config: {}", e), + )), + vec![], + ); + } + }; + + let diff_ids: Vec = config + .rootfs() + .diff_ids() + .iter() + .map(|s| s.parse().expect("config diff_id should be valid digest")) + .collect(); + + let storage_layer_ids = match image.storage_layer_ids(std::slice::from_ref(&storage)) { + Ok(ids) => ids, + Err(e) => { + return ( + Err(( + error_codes::INTERNAL_ERROR, + format!("failed to get storage layer IDs: {}", e), + )), + vec![], + ); + } + }; + + let result = GetImageResult { + id: image.id().to_string(), + names: image.names(&storage).unwrap_or_default(), + layer_diff_ids: diff_ids, + storage_layer_ids, + }; + (Ok(serde_json::to_value(result).unwrap()), vec![]) +} + +/// Proxy for accessing files via the userns helper process. +/// +/// This spawns a helper process (via `podman unshare`) that runs inside a +/// user namespace and can read files with restrictive permissions. 
File +/// descriptors are passed back via SCM_RIGHTS. +pub struct StorageProxy { + child: Child, + sender: jsonrpc_fdpass::transport::Sender, + receiver: jsonrpc_fdpass::transport::Receiver, + next_id: u64, +} + +impl std::fmt::Debug for StorageProxy { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("StorageProxy") + .field("child_pid", &self.child.id()) + .finish_non_exhaustive() + } +} + +impl StorageProxy { + /// Spawn a userns helper process. + /// + /// If the current process can already bypass file permissions (running as + /// root or has CAP_DAC_OVERRIDE), this returns `Ok(None)` since no helper + /// is needed. + pub async fn spawn() -> std::result::Result, HelperError> { + // Check if we even need a helper + if can_bypass_file_permissions() { + return Ok(None); + } + + Self::spawn_helper().await.map(Some) + } + + /// Spawn the helper unconditionally. + async fn spawn_helper() -> std::result::Result { + let exe = std::fs::read_link("/proc/self/exe").map_err(HelperError::Io)?; + Self::spawn_helper_with_binary(exe).await + } + + /// Spawn the helper with a specific binary path. + /// + /// This is used when the default /proc/self/exe is not suitable, + /// such as when running from a test harness. + async fn spawn_helper_with_binary( + exe: std::path::PathBuf, + ) -> std::result::Result { + // Create a socket pair - one end for us, one for the child's stdin + let (parent_sock, child_sock) = StdUnixStream::pair().map_err(HelperError::Socket)?; + + // Spawn via podman unshare, with child_sock as the child's stdin. + // We use `env` to set the HELPER_ENV because podman unshare doesn't + // propagate the parent's environment to the inner command. 
+ let child = Command::new("podman") + .arg("unshare") + .arg("env") + .arg(format!("{}=1", HELPER_ENV)) + .arg(&exe) + .stdin(Stdio::from(OwnedFd::from(child_sock))) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()) + .spawn() + .map_err(HelperError::Spawn)?; + + // Convert our socket to async + parent_sock.set_nonblocking(true)?; + let tokio_socket = TokioUnixStream::from_std(parent_sock) + .map_err(|e| HelperError::Ipc(format!("failed to convert socket: {}", e)))?; + + let transport = UnixSocketTransport::new(tokio_socket); + let (sender, receiver) = transport.split(); + + Ok(Self { + child, + sender, + receiver, + next_id: 1, + }) + } + + /// Open a file via the helper, returning its fd. + /// + /// # Arguments + /// + /// * `path` - The path to open (should be absolute) + /// + /// # Returns + /// + /// The opened file descriptor, which can be used for reading. + pub async fn open_file( + &mut self, + path: impl AsRef, + ) -> std::result::Result { + let params = OpenFileParams { + path: path.as_ref().to_string_lossy().to_string(), + }; + + let id = self.next_id; + self.next_id += 1; + + let request = JsonRpcRequest::new( + methods::OPEN_FILE.to_string(), + Some(serde_json::to_value(¶ms).unwrap()), + serde_json::Value::Number(id.into()), + ); + + let message = MessageWithFds::new(JsonRpcMessage::Request(request), vec![]); + self.sender + .send(message) + .await + .map_err(|e| HelperError::Ipc(format!("failed to send request: {}", e)))?; + + // Receive response + let response = self + .receiver + .receive() + .await + .map_err(|e| HelperError::Ipc(format!("failed to receive response: {}", e)))?; + + match response.message { + JsonRpcMessage::Response(resp) => { + if let Some(error) = resp.error { + return Err(HelperError::RpcError { + code: error.code(), + message: error.message().to_string(), + }); + } + + // The fd should be in the response + if response.file_descriptors.is_empty() { + return Err(HelperError::Ipc( + "response missing file 
descriptor".to_string(), + )); + } + + Ok(response.file_descriptors.into_iter().next().unwrap()) + } + other => Err(HelperError::Ipc(format!( + "unexpected message type: {:?}", + other + ))), + } + } + + /// Shutdown the helper process gracefully. + pub async fn shutdown(mut self) -> std::result::Result<(), HelperError> { + let id = self.next_id; + + let request = JsonRpcRequest::new( + methods::SHUTDOWN.to_string(), + None, + serde_json::Value::Number(id.into()), + ); + + let message = MessageWithFds::new(JsonRpcMessage::Request(request), vec![]); + // Ignore send errors - the child may have already exited + let _ = self.sender.send(message).await; + + // Wait for the child to exit + let _ = self.child.wait(); + + Ok(()) + } + + /// List images in storage via the helper. + pub async fn list_images( + &mut self, + storage_path: &str, + ) -> std::result::Result, HelperError> { + let params = ListImagesParams { + storage_path: storage_path.to_string(), + }; + let result: ListImagesResult = self.call(methods::LIST_IMAGES, ¶ms).await?; + Ok(result.images) + } + + /// Get image information via the helper. + pub async fn get_image( + &mut self, + storage_path: &str, + image_ref: &str, + ) -> std::result::Result { + let params = GetImageParams { + storage_path: storage_path.to_string(), + image_ref: image_ref.to_string(), + }; + self.call(methods::GET_IMAGE, ¶ms).await + } + + /// Start streaming a layer's tar-split content. + /// + /// Returns a stream that yields `ProxiedTarSplitItem`s. The helper sends + /// notifications with file descriptors for each file in the layer. 
+ pub async fn stream_layer( + &mut self, + storage_path: &str, + layer_id: &str, + ) -> std::result::Result, HelperError> { + let params = StreamLayerParams { + storage_path: storage_path.to_string(), + layer_id: layer_id.to_string(), + }; + + let id = self.next_id; + self.next_id += 1; + + let request = JsonRpcRequest::new( + methods::STREAM_LAYER.to_string(), + Some(serde_json::to_value(¶ms).unwrap()), + serde_json::Value::Number(id.into()), + ); + + let message = MessageWithFds::new(JsonRpcMessage::Request(request), vec![]); + self.sender + .send(message) + .await + .map_err(|e| HelperError::Ipc(format!("failed to send stream_layer request: {}", e)))?; + + Ok(ProxiedLayerStream { + receiver: &mut self.receiver, + request_id: id, + finished: false, + }) + } + + /// Make an RPC call and parse the response. + async fn call Deserialize<'de>>( + &mut self, + method: &str, + params: &P, + ) -> std::result::Result { + let id = self.next_id; + self.next_id += 1; + + let request = JsonRpcRequest::new( + method.to_string(), + Some(serde_json::to_value(params).unwrap()), + serde_json::Value::Number(id.into()), + ); + + let message = MessageWithFds::new(JsonRpcMessage::Request(request), vec![]); + self.sender + .send(message) + .await + .map_err(|e| HelperError::Ipc(format!("failed to send request: {}", e)))?; + + // Receive response + let response = self + .receiver + .receive() + .await + .map_err(|e| HelperError::Ipc(format!("failed to receive response: {}", e)))?; + + match response.message { + JsonRpcMessage::Response(resp) => { + if let Some(error) = resp.error { + return Err(HelperError::RpcError { + code: error.code(), + message: error.message().to_string(), + }); + } + + let result = resp + .result + .ok_or_else(|| HelperError::Ipc("response missing result".to_string()))?; + + serde_json::from_value(result) + .map_err(|e| HelperError::Ipc(format!("failed to parse result: {}", e))) + } + other => Err(HelperError::Ipc(format!( + "unexpected message type: {:?}", + 
other + ))), + } + } +} + +/// Item received from a proxied layer stream. +#[derive(Debug)] +pub enum ProxiedTarSplitItem { + /// Raw segment bytes (tar header/padding). + Segment(Vec), + /// File content with metadata and fd. + FileContent { + /// File descriptor for the content. + fd: OwnedFd, + /// File size. + size: u64, + /// File name/path. + name: String, + }, +} + +/// Stream of tar-split items received via the helper proxy. +pub struct ProxiedLayerStream<'a> { + receiver: &'a mut jsonrpc_fdpass::transport::Receiver, + request_id: u64, + finished: bool, +} + +impl std::fmt::Debug for ProxiedLayerStream<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ProxiedLayerStream") + .field("request_id", &self.request_id) + .field("finished", &self.finished) + .finish_non_exhaustive() + } +} + +impl<'a> ProxiedLayerStream<'a> { + /// Get the next item from the stream. + /// + /// Returns `None` when the stream is complete. + pub async fn next(&mut self) -> std::result::Result, HelperError> { + if self.finished { + return Ok(None); + } + + let msg_with_fds = match self.receiver.receive().await { + Ok(m) => m, + Err(jsonrpc_fdpass::Error::ConnectionClosed) => { + self.finished = true; + return Ok(None); + } + Err(e) => { + return Err(HelperError::Ipc(format!("failed to receive: {}", e))); + } + }; + + let mut fds = msg_with_fds.file_descriptors; + + match msg_with_fds.message { + JsonRpcMessage::Notification(notif) => { + let params = notif.params.unwrap_or(serde_json::Value::Null); + + match notif.method.as_str() { + "stream.segment" => { + let seg: StreamSegmentNotification = serde_json::from_value(params) + .map_err(|e| { + HelperError::Ipc(format!("invalid segment params: {}", e)) + })?; + + let bytes = BASE64_STANDARD.decode(&seg.data).map_err(|e| { + HelperError::Ipc(format!("failed to decode segment: {}", e)) + })?; + + Ok(Some(ProxiedTarSplitItem::Segment(bytes))) + } + "stream.file" => { + let file: 
StreamFileNotification = serde_json::from_value(params) + .map_err(|e| HelperError::Ipc(format!("invalid file params: {}", e)))?; + + if fds.is_empty() { + return Err(HelperError::Ipc( + "file notification missing fd".to_string(), + )); + } + + let fd = fds.remove(0); + Ok(Some(ProxiedTarSplitItem::FileContent { + fd, + size: file.size, + name: file.name, + })) + } + other => Err(HelperError::Ipc(format!( + "unknown notification method: {}", + other + ))), + } + } + JsonRpcMessage::Response(resp) => { + // Final response - stream is complete + self.finished = true; + + if let Some(error) = resp.error { + return Err(HelperError::RpcError { + code: error.code(), + message: error.message().to_string(), + }); + } + + Ok(None) + } + JsonRpcMessage::Request(_) => Err(HelperError::Ipc( + "unexpected request from helper".to_string(), + )), + } + } +} + +impl Drop for StorageProxy { + fn drop(&mut self) { + // Try to kill the child if it's still running + let _ = self.child.kill(); + } +} diff --git a/crates/integration-tests/Cargo.toml b/crates/integration-tests/Cargo.toml index 41b3063d..718d3b1b 100644 --- a/crates/integration-tests/Cargo.toml +++ b/crates/integration-tests/Cargo.toml @@ -1,6 +1,9 @@ [package] name = "integration-tests" publish = false +description = "Integration tests for composefs-rs (not published)" +autobins = false +autotests = false edition.workspace = true license.workspace = true @@ -8,25 +11,42 @@ repository.workspace = true rust-version.workspace = true version.workspace = true +# The integration test runner is declared as both [[bin]] (for the container +# image build) and [[test]] (so nextest discovers the libtest-mimic tests). +# The integration-tests crate is excluded from workspace default-members so +# that plain `cargo test` does not run these; use `just test-integration`. +# Cargo warns about the shared source file — this is harmless. 
[[bin]] name = "cfsctl-integration-tests" path = "src/main.rs" +[[test]] +name = "cfsctl-integration-tests" +path = "src/main.rs" +harness = false + +[[bin]] +name = "test-cleanup" +path = "src/cleanup.rs" + [dependencies] anyhow = "1" cap-std-ext = "5.0" composefs = { workspace = true } # Only the test_util module is used — for creating test OCI images. # All verification must go through the cfsctl CLI. -composefs-oci = { workspace = true, features = ["test", "boot"] } +composefs-oci = { workspace = true, features = ["test", "boot", "containers-storage"] } +hex = "0.4" libtest-mimic = "0.8" linkme = "0.3" ocidir = "0.7" paste = "1" -rustix = { version = "1.0.0", default-features = false, features = ["process"] } -serde_json = "1.0" +rustix = { version = "1", features = ["fs", "process"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" tar = "0.4" tempfile = "3" +tokio = { version = "1", features = ["rt-multi-thread", "macros"] } xshell = "0.2" [lints] diff --git a/crates/integration-tests/src/cleanup.rs b/crates/integration-tests/src/cleanup.rs new file mode 100644 index 00000000..6a2ef8d5 --- /dev/null +++ b/crates/integration-tests/src/cleanup.rs @@ -0,0 +1,54 @@ +//! Cleanup utility for integration test resources +//! +//! This binary cleans up any leftover resources from integration tests. 
+ +use std::process::Command; + +use integration_tests::INTEGRATION_TEST_LABEL; + +fn main() { + println!("Cleaning up integration test resources..."); + + // Clean up podman containers with our label + let output = Command::new("podman") + .args([ + "ps", + "-a", + "--filter", + &format!("label={}", INTEGRATION_TEST_LABEL), + "-q", + ]) + .output(); + + if let Ok(output) = output { + let container_ids = String::from_utf8_lossy(&output.stdout); + for id in container_ids.lines() { + if !id.is_empty() { + println!("Removing container: {}", id); + let _ = Command::new("podman").args(["rm", "-f", id]).output(); + } + } + } + + // Clean up podman images with our label + let output = Command::new("podman") + .args([ + "images", + "--filter", + &format!("label={}", INTEGRATION_TEST_LABEL), + "-q", + ]) + .output(); + + if let Ok(output) = output { + let image_ids = String::from_utf8_lossy(&output.stdout); + for id in image_ids.lines() { + if !id.is_empty() { + println!("Removing image: {}", id); + let _ = Command::new("podman").args(["rmi", "-f", id]).output(); + } + } + } + + println!("Cleanup complete."); +} diff --git a/crates/integration-tests/src/lib.rs b/crates/integration-tests/src/lib.rs index 84391096..c86ebfb1 100644 --- a/crates/integration-tests/src/lib.rs +++ b/crates/integration-tests/src/lib.rs @@ -7,6 +7,14 @@ // linkme requires unsafe for distributed slices #![allow(unsafe_code)] +use std::process::Command; +use std::sync::Arc; + +use anyhow::Result; +use composefs_oci::composefs::fsverity::{Algorithm, Sha256HashValue}; +use composefs_oci::composefs::repository::Repository; +use tempfile::TempDir; + /// A test function that returns a Result. pub type TestFn = fn() -> anyhow::Result<()>; @@ -50,3 +58,120 @@ macro_rules! 
integration_test { } }; } + +// ============================================================================ +// Utilities for containers-storage tests +// ============================================================================ + +/// Test label for cleanup +pub const INTEGRATION_TEST_LABEL: &str = "composefs-rs.integration-test=1"; + +/// Get the path to cfsctl binary +pub fn get_cfsctl_path() -> Result { + // Check environment first + if let Ok(path) = std::env::var("CFSCTL_PATH") { + return Ok(path); + } + // Look in common locations + for path in [ + "./target/release/cfsctl", + "./target/debug/cfsctl", + "/usr/bin/cfsctl", + ] { + if std::path::Path::new(path).exists() { + return Ok(path.to_string()); + } + } + anyhow::bail!("cfsctl not found; set CFSCTL_PATH or build with `cargo build --release`") +} + +/// Get the primary test image +pub fn get_primary_image() -> String { + std::env::var("COMPOSEFS_RS_PRIMARY_IMAGE") + .unwrap_or_else(|_| "quay.io/centos-bootc/centos-bootc:stream10".to_string()) +} + +/// Get all test images +pub fn get_all_images() -> Vec { + std::env::var("COMPOSEFS_RS_ALL_IMAGES") + .unwrap_or_else(|_| get_primary_image()) + .split_whitespace() + .map(String::from) + .collect() +} + +/// Create a test repository in a temporary directory. +/// +/// Initializes a new insecure SHA-256 repository. 
+pub fn create_test_repository(tempdir: &TempDir) -> Result>> { + let fd = rustix::fs::open( + tempdir.path(), + rustix::fs::OFlags::CLOEXEC | rustix::fs::OFlags::RDONLY, + 0.into(), + )?; + + let (mut repo, _created) = + Repository::::init_path(&fd, ".", Algorithm::SHA256, false)?; + repo.set_insecure(); + Ok(Arc::new(repo)) +} + +fn podman_command() -> Command { + Command::new("podman") +} + +/// Build a minimal test image using podman and return its ID +pub fn build_test_image() -> Result { + let temp_dir = TempDir::new()?; + let containerfile = temp_dir.path().join("Containerfile"); + + // Create a simple Containerfile with various file sizes to test + // both inline and external storage paths. + // Use centos instead of busybox because busybox has UID 65534 which + // breaks in nested container environments due to user namespace issues. + // Include /boot and /sysroot so the image is compatible with --bootable + // (transform_for_boot requires these directories). + std::fs::write( + &containerfile, + r#"FROM quay.io/centos/centos:stream10 +# Small file (should be inlined) +RUN echo "small content" > /small.txt +# Larger file (should be external) +RUN dd if=/dev/zero of=/large.bin bs=1024 count=100 2>/dev/null +# Directory with files +RUN mkdir -p /testdir && echo "file1" > /testdir/a.txt && echo "file2" > /testdir/b.txt +# Symlink +RUN ln -s /small.txt /link.txt +# Directories required by composefs boot transformations +RUN mkdir -p /boot /sysroot +"#, + )?; + + let iid_file = temp_dir.path().join("image.iid"); + + let output = podman_command() + .args([ + "build", + "--pull=newer", + &format!("--iidfile={}", iid_file.display()), + "-f", + &containerfile.to_string_lossy(), + &temp_dir.path().to_string_lossy(), + ]) + .output()?; + + if !output.status.success() { + anyhow::bail!( + "podman build failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + } + + let image_id = std::fs::read_to_string(&iid_file)?.trim().to_string(); + Ok(image_id) +} + +/// 
Remove a test image +pub fn cleanup_test_image(image_id: &str) { + let _ = podman_command().args(["rmi", "-f", image_id]).output(); +} diff --git a/crates/integration-tests/src/main.rs b/crates/integration-tests/src/main.rs index e54dc518..87b70f1c 100644 --- a/crates/integration-tests/src/main.rs +++ b/crates/integration-tests/src/main.rs @@ -3,6 +3,10 @@ //! This binary uses [`libtest_mimic`] as a custom test harness (no `#[test]`). //! Tests are registered via the [`integration_test!`] macro in submodules //! and collected from the [`INTEGRATION_TESTS`] distributed slice at startup. +//! +//! IMPORTANT: This binary may be re-executed via `podman unshare` to act as a +//! userns helper for rootless containers-storage access. The init_if_helper() +//! call at the start of main() handles this. // linkme requires unsafe for distributed slices #![allow(unsafe_code)] @@ -71,6 +75,11 @@ pub(crate) fn create_test_rootfs(parent: &Path) -> Result { } fn main() { + // CRITICAL: Handle userns helper re-execution. + // When running rootless, this binary may be re-executed via `podman unshare` + // to act as a helper process for containers-storage access. 
+ composefs_oci::cstor::init_if_helper(); + let args = Arguments::from_args(); let tests: Vec = INTEGRATION_TESTS diff --git a/crates/integration-tests/src/tests/cli.rs b/crates/integration-tests/src/tests/cli.rs index da276b94..a3dda2e9 100644 --- a/crates/integration-tests/src/tests/cli.rs +++ b/crates/integration-tests/src/tests/cli.rs @@ -215,7 +215,7 @@ integration_test!(test_oci_images_json_empty_repo); fn create_oci_layout(parent: &std::path::Path) -> Result { use cap_std_ext::cap_std; use ocidir::oci_spec::image::{ - ImageConfigurationBuilder, Platform, PlatformBuilder, RootFsBuilder, + ConfigBuilder, ImageConfigurationBuilder, Platform, PlatformBuilder, RootFsBuilder, }; let oci_dir = parent.join("oci-image"); @@ -227,6 +227,9 @@ fn create_oci_layout(parent: &std::path::Path) -> Result { // Create a new empty manifest let mut manifest = ocidir.new_empty_manifest()?.build()?; + // Create runtime config (required for seal operation) + let runtime_config = ConfigBuilder::default().build()?; + // Create config with architecture and OS let rootfs = RootFsBuilder::default() .typ("layers") @@ -236,6 +239,7 @@ fn create_oci_layout(parent: &std::path::Path) -> Result { .architecture("amd64") .os("linux") .rootfs(rootfs) + .config(runtime_config) .build()?; // Create a simple layer with a usr/ directory and one file @@ -1397,3 +1401,290 @@ fn test_init_reset_metadata_changes_algorithm() -> Result<()> { Ok(()) } integration_test!(test_init_reset_metadata_changes_algorithm); + +/// Test tagging and untagging OCI images. 
+/// +/// Verifies that: +/// - An image can be tagged with multiple names +/// - Tags appear in `oci images` output +/// - Tags can be removed with `oci untag` +/// - Untagging one name doesn't affect other tags +fn test_oci_tag_and_untag() -> Result<()> { + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + let repo_dir = init_insecure_repo(&sh, &cfsctl)?; + let repo = repo_dir.path(); + let fixture_dir = tempfile::tempdir()?; + let oci_layout = create_oci_layout(fixture_dir.path())?; + + // Pull and tag with first name + let pull_output = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci pull oci:{oci_layout} myimage:v1" + ) + .read()?; + + // Extract manifest digest from pull output (e.g., "manifest sha256:abc...") + let manifest_digest = pull_output + .lines() + .find(|line| line.contains("manifest sha256:")) + .and_then(|line| line.split_whitespace().find(|s| s.starts_with("sha256:"))) + .expect("expected manifest digest in pull output"); + + // Add a second tag using the manifest digest + cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci tag {manifest_digest} myimage:latest" + ) + .read()?; + + // Both tags should appear in list + let list_output = cmd!(sh, "{cfsctl} --insecure --repo {repo} oci images --json").read()?; + let images: serde_json::Value = serde_json::from_str(&list_output)?; + let names: Vec<&str> = images + .as_array() + .unwrap() + .iter() + .map(|img| img["name"].as_str().unwrap()) + .collect(); + assert!(names.contains(&"myimage:v1"), "expected myimage:v1 in list"); + assert!( + names.contains(&"myimage:latest"), + "expected myimage:latest in list" + ); + + // Remove one tag + cmd!(sh, "{cfsctl} --insecure --repo {repo} oci untag myimage:v1").read()?; + + // Only the remaining tag should appear + let list_output = cmd!(sh, "{cfsctl} --insecure --repo {repo} oci images --json").read()?; + let images: serde_json::Value = serde_json::from_str(&list_output)?; + let names: Vec<&str> = images + .as_array() + .unwrap() + .iter() + 
.map(|img| img["name"].as_str().unwrap()) + .collect(); + assert!( + !names.contains(&"myimage:v1"), + "myimage:v1 should be removed" + ); + assert!( + names.contains(&"myimage:latest"), + "myimage:latest should still exist" + ); + + Ok(()) +} +integration_test!(test_oci_tag_and_untag); + +/// Test that GC removes untagged OCI images. +/// +/// Verifies that: +/// - After untagging all references, GC collects the image +/// - Objects are actually removed from the repository +fn test_oci_gc_removes_untagged() -> Result<()> { + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + let repo_dir = init_insecure_repo(&sh, &cfsctl)?; + let repo = repo_dir.path(); + let fixture_dir = tempfile::tempdir()?; + let oci_layout = create_oci_layout(fixture_dir.path())?; + + // Pull an image + cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci pull oci:{oci_layout} test-image" + ) + .read()?; + + // Verify it exists + let list_before = cmd!(sh, "{cfsctl} --insecure --repo {repo} oci images --json").read()?; + let images_before: Vec = serde_json::from_str(&list_before)?; + assert_eq!(images_before.len(), 1, "expected 1 image before untag"); + + // Untag it + cmd!(sh, "{cfsctl} --insecure --repo {repo} oci untag test-image").read()?; + + // Run GC + let gc_output = cmd!(sh, "{cfsctl} --insecure --repo {repo} gc").read()?; + assert!( + gc_output.contains("removed"), + "expected GC to report removed objects: {gc_output}" + ); + + // Verify image is gone + let list_after = cmd!(sh, "{cfsctl} --insecure --repo {repo} oci images --json").read()?; + let images_after: Vec = serde_json::from_str(&list_after)?; + assert!( + images_after.is_empty(), + "expected no images after GC, got: {:?}", + images_after + ); + + // Verify objects were actually removed (streams dir should be mostly empty) + let streams_dir = repo.join("streams"); + let stream_count = if streams_dir.exists() { + std::fs::read_dir(&streams_dir)? 
+ .filter(|e| e.as_ref().map(|e| e.file_name() != "refs").unwrap_or(false)) + .count() + } else { + 0 + }; + assert_eq!( + stream_count, 0, + "expected no non-ref streams after GC, got {}", + stream_count + ); + + Ok(()) +} +integration_test!(test_oci_gc_removes_untagged); + +/// Test layer tar roundtrip: import a layer, extract as tar, verify integrity. +/// +/// This verifies that the splitstream storage correctly preserves tar content +/// by comparing the original tar with the reconstructed one. +fn test_layer_tar_roundtrip() -> Result<()> { + use std::io::Read; + + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + let repo_dir = init_insecure_repo(&sh, &cfsctl)?; + let repo = repo_dir.path(); + let fixture_dir = tempfile::tempdir()?; + let oci_layout = create_oci_layout(fixture_dir.path())?; + + // Pull the image + cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci pull oci:{oci_layout} test-image" + ) + .read()?; + + // Get the layer diff_id + let config_output = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci inspect test-image --config" + ) + .read()?; + let config: serde_json::Value = serde_json::from_str(&config_output)?; + let layer_id = config["rootfs"]["diff_ids"][0] + .as_str() + .expect("expected layer diff_id"); + + // Extract the layer as tar + let tar_output = cmd!(sh, "{cfsctl} --insecure --repo {repo} oci layer {layer_id}").output()?; + assert!(tar_output.status.success(), "layer extraction failed"); + + // Parse the tar and collect file entries + let mut archive = tar::Archive::new(tar_output.stdout.as_slice()); + let mut entries: Vec<(String, Vec)> = Vec::new(); + + for entry in archive.entries()? 
{ + let mut entry = entry?; + let path = entry.path()?.to_string_lossy().to_string(); + let mut content = Vec::new(); + entry.read_to_end(&mut content)?; + entries.push((path, content)); + } + + // Verify we got the expected files (usr/ directory and hello.txt) + assert_eq!( + entries.len(), + 2, + "expected 2 entries in layer (usr/ and hello.txt)" + ); + + // Find hello.txt and verify content + let hello_entry = entries + .iter() + .find(|(path, _)| path == "hello.txt") + .expect("expected hello.txt in layer"); + assert_eq!( + hello_entry.1, b"hello from test layer\n", + "hello.txt content mismatch" + ); + + // Verify usr/ directory exists + assert!( + entries + .iter() + .any(|(path, _)| path == "usr" || path == "usr/"), + "expected usr/ directory in layer" + ); + + Ok(()) +} +integration_test!(test_layer_tar_roundtrip); + +/// Test computing the composefs image ID for an OCI image. +/// +/// This verifies that we can compute the filesystem verity hash for an image, +/// which is the prerequisite for sealing and mounting. 
+fn test_compute_image_id() -> Result<()> { + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + let repo_dir = init_insecure_repo(&sh, &cfsctl)?; + let repo = repo_dir.path(); + let fixture_dir = tempfile::tempdir()?; + let oci_layout = create_oci_layout(fixture_dir.path())?; + + // Pull an image + cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci pull oci:{oci_layout} test-image" + ) + .read()?; + + // Get the config digest from inspect output + let inspect_output = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci inspect test-image" + ) + .read()?; + let inspect: serde_json::Value = serde_json::from_str(&inspect_output)?; + let config_digest = inspect["manifest"]["config"]["digest"] + .as_str() + .expect("expected config digest"); + // Digests must be prefixed with '@' to distinguish from named refs + let config_ref = format!("@{config_digest}"); + + // Compute the image ID + let compute_output = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci compute-id {config_ref}" + ) + .read()?; + + // The output should be a valid hex digest + // composefs uses SHA-256 fs-verity which produces 64 hex chars + // (but the underlying digest could be longer in some configurations) + let image_id = compute_output.trim(); + assert!( + image_id.len() >= 64, + "image ID should be at least 64 hex chars, got {} chars: {}", + image_id.len(), + image_id + ); + assert!( + image_id.chars().all(|c| c.is_ascii_hexdigit()), + "image ID should be hex, got: {}", + image_id + ); + + // Computing the same image should produce the same ID (deterministic) + let compute_output2 = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci compute-id {config_ref}" + ) + .read()?; + assert_eq!( + image_id, + compute_output2.trim(), + "compute-id should be deterministic" + ); + + Ok(()) +} +integration_test!(test_compute_image_id); diff --git a/crates/integration-tests/src/tests/cstor.rs b/crates/integration-tests/src/tests/cstor.rs new file mode 100644 index 00000000..73987b50 --- /dev/null 
+++ b/crates/integration-tests/src/tests/cstor.rs @@ -0,0 +1,466 @@ +//! Tests for containers-storage import functionality. +//! +//! These tests verify that importing from containers-storage produces identical +//! results to importing via skopeo/tar streaming. +//! +//! These tests require `podman unshare` which needs user namespace support. +//! On environments without proper user namespace support (like GHA runners), +//! they dispatch to a bcvk VM like other privileged tests. + +use anyhow::Result; +use tempfile::TempDir; +use xshell::{Shell, cmd}; + +use integration_tests::{build_test_image, cleanup_test_image, create_test_repository}; + +use crate::integration_test; +use crate::tests::privileged::{require_privileged, require_userns}; + +/// Helper: copy an image into a separate containers-storage root. +/// +/// Uses skopeo to copy from the default store to a standalone overlay +/// storage at `dest/storage`. +fn copy_image_to_separate_store(sh: &Shell, image_id: &str, dest: &std::path::Path) -> Result<()> { + let dest_str = dest.display().to_string(); + let storage_root = format!("{dest_str}/storage"); + let run_root = format!("{dest_str}/run"); + + // skopeo needs the raw hex ID without the sha256: prefix + let raw_id = image_id.strip_prefix("sha256:").unwrap_or(image_id); + let src_ref = format!("containers-storage:{raw_id}"); + // Use [overlay@+] to target the separate storage + let dest_ref = format!("containers-storage:[overlay@{storage_root}+{run_root}]test:latest"); + + // Use podman unshare so that skopeo has UID/GID mappings available + // for unpacking layers with non-root ownership. + if rustix::process::getuid().is_root() { + cmd!(sh, "skopeo copy {src_ref} {dest_ref}").run()?; + } else { + cmd!(sh, "podman unshare skopeo copy {src_ref} {dest_ref}").run()?; + } + + Ok(()) +} + +/// Test that containers-storage import produces identical results to skopeo/tar import. 
+/// +/// This is a critical correctness test: both import paths should produce the +/// exact same splitstream digests because they represent the same content. +/// +/// Requires a VM because skopeo's containers-storage transport also needs user +/// namespaces internally, and that fails on GHA runners even when podman unshare works. +fn privileged_test_cstor_vs_skopeo_equivalence() -> Result<()> { + if require_privileged("privileged_test_cstor_vs_skopeo_equivalence")?.is_some() { + return Ok(()); + } + let sh = Shell::new()?; + let rt = tokio::runtime::Runtime::new()?; + rt.block_on(async { + println!("Building test image..."); + let test_image = build_test_image()?; + println!("Built test image: {}", test_image); + + // Create two separate repositories for comparison + let cstor_repo_dir = TempDir::new()?; + let skopeo_repo_dir = TempDir::new()?; + + let cstor_repo = create_test_repository(&cstor_repo_dir)?; + let skopeo_repo = create_test_repository(&skopeo_repo_dir)?; + + // Import via containers-storage (reflink path) + let cstor_image_ref = format!("containers-storage:{}", test_image); + println!("Importing via containers-storage: {}", cstor_image_ref); + let cstor_opts = composefs_oci::PullOptions { + local_fetch: composefs_oci::LocalFetchOpt::IfPossible, + ..Default::default() + }; + let cstor_result = + composefs_oci::pull(&cstor_repo, &cstor_image_ref, None, cstor_opts).await?; + + // Import via skopeo (tar streaming path) - copy to OCI directory first + let oci_dir = TempDir::new()?; + let oci_path = oci_dir.path().join("image"); + + // Use skopeo to copy from containers-storage to oci directory + // Strip sha256: prefix for skopeo compatibility + let image_id_for_skopeo = test_image.strip_prefix("sha256:").unwrap_or(&test_image); + let cstor_ref = format!("containers-storage:{}", image_id_for_skopeo); + let oci_ref = format!("oci:{}:test", oci_path.display()); + println!("Copying to OCI dir via skopeo..."); + cmd!(sh, "skopeo copy {cstor_ref} 
{oci_ref}").run()?; + + // Import from the OCI directory via skopeo/tar path + let skopeo_image_ref = format!("oci:{}:test", oci_path.display()); + println!("Importing via skopeo/OCI: {}", skopeo_image_ref); + let (skopeo_pull_result, _skopeo_stats) = + composefs_oci::pull_image(&skopeo_repo, &skopeo_image_ref, None, None).await?; + let (skopeo_config_digest, skopeo_config_verity) = skopeo_pull_result.into_config(); + + // Get layer maps from both configs + let cstor_oc = composefs_oci::open_config( + &cstor_repo, + &cstor_result.config_digest, + Some(&cstor_result.config_verity), + )?; + let skopeo_oc = composefs_oci::open_config( + &skopeo_repo, + &skopeo_config_digest, + Some(&skopeo_config_verity), + )?; + + // Compare results + println!("CSTOR config digest: {}", cstor_result.config_digest); + println!("SKOPEO config digest: {}", skopeo_config_digest); + assert_eq!( + cstor_result.config_digest, skopeo_config_digest, + "config digests must match" + ); + + println!("CSTOR layers: {:?}", cstor_oc.layer_refs); + println!("SKOPEO layers: {:?}", skopeo_oc.layer_refs); + assert_eq!( + cstor_oc.layer_refs, skopeo_oc.layer_refs, + "layer verity IDs must match" + ); + + println!("CSTOR config verity: {:?}", cstor_result.config_verity); + println!("SKOPEO config verity: {:?}", skopeo_config_verity); + + // NOTE: Config verity IDs may differ due to layer ref ordering. + // The skopeo path sorts layers by size for parallel fetching, then adds + // named refs in that order. The cstor path adds refs in config order. + // Both produce valid splitstreams with correct content, but different verity. + // TODO: Fix the ordering discrepancy in one of the implementations. + if cstor_result.config_verity != skopeo_config_verity { + println!( + "WARNING: Config verity IDs differ due to layer ref ordering. \ + Content is equivalent but splitstream structure differs." 
+ ); + } + + println!("SUCCESS: Both import paths produced equivalent content"); + println!(" Config digest: {}", cstor_result.config_digest); + println!(" Layers: {}", cstor_oc.layer_refs.len()); + + Ok(()) + }) +} +integration_test!(privileged_test_cstor_vs_skopeo_equivalence); + +/// Test that importing the same image twice produces identical results (idempotency). +/// +/// The second import should return the same verity IDs, and import stats should +/// reflect that layers came from cache. +/// +/// Requires user namespace support (podman unshare), so runs only in privileged/VM tests. +fn privileged_test_cstor_idempotent_import() -> Result<()> { + if require_userns("privileged_test_cstor_idempotent_import")?.is_some() { + return Ok(()); + } + let rt = tokio::runtime::Runtime::new()?; + rt.block_on(async { + println!("Building test image..."); + let test_image = build_test_image()?; + println!("Built test image: {}", test_image); + + let repo_dir = TempDir::new()?; + let repo = create_test_repository(&repo_dir)?; + + let cstor_image_ref = format!("containers-storage:{}", test_image); + + // First import + println!("First import via containers-storage..."); + let cstor_opts = composefs_oci::PullOptions { + local_fetch: composefs_oci::LocalFetchOpt::IfPossible, + ..Default::default() + }; + let first_result = composefs_oci::pull(&repo, &cstor_image_ref, None, cstor_opts).await?; + + // Second import of the same image + println!("Second import via containers-storage (should use cache)..."); + let cstor_opts = composefs_oci::PullOptions { + local_fetch: composefs_oci::LocalFetchOpt::IfPossible, + ..Default::default() + }; + let second_result = composefs_oci::pull(&repo, &cstor_image_ref, None, cstor_opts).await?; + + // Verify idempotency: both imports should produce identical results + assert_eq!( + first_result.config_digest, second_result.config_digest, + "config digests must match between imports" + ); + assert_eq!( + first_result.config_verity, 
second_result.config_verity, + "config verity IDs must match between imports" + ); + + // Verify layer verity IDs match + let first_oc = composefs_oci::open_config( + &repo, + &first_result.config_digest, + Some(&first_result.config_verity), + )?; + let second_oc = composefs_oci::open_config( + &repo, + &second_result.config_digest, + Some(&second_result.config_verity), + )?; + assert_eq!( + first_oc.layer_refs, second_oc.layer_refs, + "layer verity IDs must match between imports" + ); + + // Check import stats: second import should find objects already present + let first_stats = &first_result.stats; + let second_stats = &second_result.stats; + println!("First import stats: {:?}", first_stats); + println!("Second import stats: {:?}", second_stats); + + // The first import should have copied some objects + assert!( + first_stats.objects_copied > 0, + "first import should copy objects" + ); + + // The second import should find everything already present + assert_eq!( + second_stats.objects_copied, 0, + "second import should not copy any new objects" + ); + + println!("SUCCESS: Idempotent import produced identical results"); + println!(" Config digest: {}", first_result.config_digest); + println!(" Layers: {}", first_oc.layer_refs.len()); + println!( + " Second import: {} objects already present", + second_stats.objects_already_present + ); + + Ok(()) + }) +} +integration_test!(privileged_test_cstor_idempotent_import); + +/// Test that importing with a reference parameter creates an OCI manifest tag. +/// +/// The cstor import path now creates full OCI structure (manifest + config + +/// layers), so the reference becomes an OCI-style manifest tag visible via +/// `list_refs()`. +/// +/// Requires user namespace support (podman unshare), so runs only in privileged/VM tests. 
+fn privileged_test_cstor_import_with_reference() -> Result<()> { + if require_userns("privileged_test_cstor_import_with_reference")?.is_some() { + return Ok(()); + } + let rt = tokio::runtime::Runtime::new()?; + rt.block_on(async { + println!("Building test image..."); + let test_image = build_test_image()?; + println!("Built test image: {}", test_image); + + let repo_dir = TempDir::new()?; + let repo = create_test_repository(&repo_dir)?; + + let cstor_image_ref = format!("containers-storage:{}", test_image); + let reference_name = "test-ref"; + + // Import with a reference name + println!("Importing with reference: {}", reference_name); + let cstor_opts = composefs_oci::PullOptions { + local_fetch: composefs_oci::LocalFetchOpt::IfPossible, + ..Default::default() + }; + let result = + composefs_oci::pull(&repo, &cstor_image_ref, Some(reference_name), cstor_opts).await?; + + println!("Import complete. Config digest: {}", result.config_digest); + println!( + "Import complete. Manifest digest: {}", + result.manifest_digest + ); + + // Verify the OCI tag was created — it should be visible via list_refs() + let refs = composefs_oci::oci_image::list_refs(&repo)?; + let found = refs.iter().any(|(name, _)| name == reference_name); + assert!( + found, + "reference '{}' should appear in OCI refs, got: {:?}", + reference_name, + refs.iter().map(|(n, _)| n).collect::>() + ); + + // The OCI ref should resolve to the manifest we imported + let ref_path = repo_dir + .path() + .join("streams/refs/oci") + .join(reference_name); + assert!( + ref_path.is_symlink(), + "OCI reference '{}' should exist as symlink at {:?}", + reference_name, + ref_path + ); + + let target = std::fs::read_link(&ref_path)?; + let target_str = target.to_string_lossy(); + assert!( + target_str.contains("oci-manifest-"), + "reference should point to oci-manifest stream, got: {}", + target_str + ); + + println!("SUCCESS: Import with reference created OCI manifest tag"); + println!(" Reference: {}", 
reference_name); + println!(" Manifest digest: {}", result.manifest_digest); + + Ok(()) + }) +} +integration_test!(privileged_test_cstor_import_with_reference); + +/// Test that importing from an additional image store works. +/// +/// This exercises the `STORAGE_OPTS=additionalimagestore=` mechanism +/// used by bcvk to expose the host's containers-storage inside a VM. +/// +/// The test: +/// 1. Builds an image in the default store +/// 2. Copies it to a separate overlay store in a temp directory +/// 3. Removes it from the default store +/// 4. Runs `cfsctl oci pull` with `STORAGE_OPTS` in the command environment +/// (avoiding process-global env mutation) +fn privileged_test_cstor_additional_image_store() -> Result<()> { + if require_userns("privileged_test_cstor_additional_image_store")?.is_some() { + return Ok(()); + } + let sh = Shell::new()?; + let cfsctl = crate::cfsctl()?; + + println!("Building test image..."); + let test_image = build_test_image()?; + println!("Built test image: {}", test_image); + + // Copy the image to a separate store + let separate_store = TempDir::new()?; + println!( + "Copying image to separate store at {}...", + separate_store.path().display() + ); + copy_image_to_separate_store(&sh, &test_image, separate_store.path())?; + + // Remove from the default store so the only way to find it is via + // the additional image store + cleanup_test_image(&test_image); + + // Set STORAGE_OPTS in the command environment (not process-global) + let additional_store = separate_store.path().join("storage"); + let storage_opts = format!("additionalimagestore={}", additional_store.display()); + println!("Running cfsctl with STORAGE_OPTS={}", storage_opts); + + let repo_dir = TempDir::new()?; + let repo = repo_dir.path(); + let cstor_image_ref = format!("containers-storage:{}", test_image); + + // Initialize the repository first + cmd!(sh, "{cfsctl} --insecure --repo {repo} init").run()?; + + // Run cfsctl as an external process with STORAGE_OPTS in 
its environment + let output = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci pull --local-fetch auto {cstor_image_ref}" + ) + .env("STORAGE_OPTS", &storage_opts) + .read()?; + + println!("SUCCESS: Imported from additional image store"); + println!(" cfsctl output: {}", output); + + // Verify the pull succeeded by checking the output contains a config digest + assert!( + output.contains("sha256:"), + "expected sha256 digest in pull output, got: {output}" + ); + + Ok(()) +} +integration_test!(privileged_test_cstor_additional_image_store); + +/// Test that `--bootable` works for containers-storage imports. +/// +/// This verifies the full end-to-end flow: +/// 1. Build a test image (non-bootable) in containers-storage +/// 2. Pull it via `cfsctl oci pull --bootable` using the cstor path +/// 3. Verify the CLI succeeds and the output includes the boot image verity +/// +/// Even though the test image doesn't contain actual boot content (no UKI, +/// no kernel), `--bootable` should still succeed — `transform_for_boot` +/// gracefully handles images without boot entries by applying SELinux +/// relabeling and emptying /boot and /sysroot. 
+fn privileged_test_cstor_bootable() -> Result<()> { + if require_userns("privileged_test_cstor_bootable")?.is_some() { + return Ok(()); + } + let sh = Shell::new()?; + let cfsctl = crate::cfsctl()?; + + println!("Building test image..."); + let test_image = build_test_image()?; + println!("Built test image: {}", test_image); + + let repo_dir = TempDir::new()?; + let repo = repo_dir.path(); + let cstor_image_ref = format!("containers-storage:{}", test_image); + + // Initialize the repository first + cmd!(sh, "{cfsctl} --insecure --repo {repo} init").run()?; + + // Pull with --bootable via the cstor path + let output = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci pull --local-fetch auto --bootable {cstor_image_ref}" + ) + .read()?; + + println!("cfsctl output:\n{}", output); + + // Verify pull succeeded with expected output + assert!( + output.contains("manifest"), + "expected 'manifest' in output, got: {output}" + ); + assert!( + output.contains("Boot image:"), + "expected 'Boot image:' in output (--bootable should produce a boot EROFS), got: {output}" + ); + + // Verify the image is visible via oci images (OCI manifest tag was created) + let ls_output = cmd!(sh, "{cfsctl} --insecure --repo {repo} oci images --json").read()?; + let images: serde_json::Value = serde_json::from_str(&ls_output)?; + let images_arr = images.as_array().expect("oci ls should return array"); + assert!( + !images_arr.is_empty(), + "oci ls should show at least one image after cstor pull" + ); + + // Verify the boot EROFS ref is visible via inspect + let tag_name = &cstor_image_ref; + let inspect_output = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci inspect {tag_name}" + ) + .read()?; + let inspect: serde_json::Value = serde_json::from_str(&inspect_output)?; + assert!( + inspect.get("composefs_erofs").is_some(), + "inspect should show composefs_erofs field" + ); + assert!( + inspect.get("composefs_boot_erofs").is_some(), + "inspect should show composefs_boot_erofs field after 
--bootable pull" + ); + + println!("SUCCESS: --bootable works for containers-storage imports"); + + Ok(()) +} +integration_test!(privileged_test_cstor_bootable); diff --git a/crates/integration-tests/src/tests/mod.rs b/crates/integration-tests/src/tests/mod.rs index f94b78b8..0e970a9c 100644 --- a/crates/integration-tests/src/tests/mod.rs +++ b/crates/integration-tests/src/tests/mod.rs @@ -1,5 +1,6 @@ //! Integration test modules, organized by execution environment. pub mod cli; +pub mod cstor; pub mod digest_stability; pub mod privileged; diff --git a/crates/integration-tests/src/tests/privileged.rs b/crates/integration-tests/src/tests/privileged.rs index 6e39bff3..de15ba63 100644 --- a/crates/integration-tests/src/tests/privileged.rs +++ b/crates/integration-tests/src/tests/privileged.rs @@ -9,13 +9,17 @@ //! recursion — see [`require_privileged`]. use std::path::{Path, PathBuf}; +use std::sync::Arc; -use anyhow::{Result, bail, ensure}; +use anyhow::{Context, Result, bail, ensure}; use xshell::{Shell, cmd}; +use composefs_oci::composefs::fsverity::{FsVerityHashValue, Sha256HashValue, Sha512HashValue}; +use composefs_oci::composefs::repository::Repository; + use crate::{cfsctl, integration_test}; -/// Ensure we're running as root, or re-exec this test inside a VM. +/// Ensure we're running in a privileged environment, or re-exec this test inside a VM. /// /// If already root (e.g. inside a bcvk VM), returns `Ok(None)` and the /// test proceeds normally. @@ -26,7 +30,10 @@ use crate::{cfsctl, integration_test}; /// the test already ran in the VM. /// /// If not root and no test image is configured, returns an error. -fn require_privileged(test_name: &str) -> Result> { +/// +/// This is also used by cstor tests which need user namespace support +/// (via `podman unshare`) that may not be available on GHA runners. 
+pub fn require_privileged(test_name: &str) -> Result> { if rustix::process::getuid().is_root() { return Ok(None); } @@ -53,6 +60,58 @@ fn require_privileged(test_name: &str) -> Result> { Ok(Some(())) } +/// Check if user namespaces work (needed for podman unshare). +fn userns_works() -> bool { + std::process::Command::new("podman") + .args(["unshare", "true"]) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .map(|s| s.success()) + .unwrap_or(false) +} + +/// Ensure user namespace support is available, or re-exec this test inside a VM. +/// +/// Unlike `require_privileged`, this doesn't require root — it just needs +/// working user namespaces (for `podman unshare`). If user namespaces work, +/// the test proceeds normally. Otherwise, it dispatches to a VM. +/// +/// Returns `Ok(None)` if the test should proceed, `Ok(Some(()))` if it was +/// dispatched to a VM and the caller should return immediately. +pub fn require_userns(test_name: &str) -> Result> { + // If we're root (e.g. in VM), userns works + if rustix::process::getuid().is_root() { + return Ok(None); + } + + // Check if userns works on this host + if userns_works() { + return Ok(None); + } + + // userns doesn't work — delegate to a VM + if std::env::var_os("COMPOSEFS_IN_VM").is_some() { + bail!("COMPOSEFS_IN_VM is set but userns doesn't work — VM setup is broken"); + } + + let image = std::env::var("COMPOSEFS_TEST_IMAGE").map_err(|_| { + anyhow::anyhow!( + "user namespaces not available and COMPOSEFS_TEST_IMAGE not set; \ + run `just build-test-image` or use `just test-integration-vm`" + ) + })?; + + let sh = Shell::new()?; + let bcvk = std::env::var("BCVK_PATH").unwrap_or_else(|_| "bcvk".into()); + cmd!( + sh, + "{bcvk} ephemeral run-ssh {image} -- cfsctl-integration-tests --exact {test_name}" + ) + .run()?; + Ok(Some(())) +} + /// A temporary directory backed by a loopback ext4 filesystem with verity support. 
/// /// tmpfs doesn't support fs-verity, so privileged tests that need verity @@ -273,7 +332,7 @@ fn privileged_oci_pull_mount() -> Result<()> { // Verify file content at the mountpoint let hostname = std::fs::read_to_string(mountpoint.path().join("etc/hostname"))?; - ensure!(hostname == "testhost\n", "hostname mismatch: {hostname:?}"); + ensure!(hostname == "test-host", "hostname mismatch: {hostname:?}"); let os_release = std::fs::read_to_string(mountpoint.path().join("etc/os-release"))?; ensure!( @@ -281,10 +340,12 @@ fn privileged_oci_pull_mount() -> Result<()> { "os-release missing ID: {os_release:?}" ); + // busybox is a 4096-byte external file (random data seeded from size) let busybox = std::fs::read(mountpoint.path().join("usr/bin/busybox"))?; ensure!( - busybox == b"busybox-binary-content", - "busybox content mismatch" + busybox.len() == 4096, + "busybox size mismatch: expected 4096, got {}", + busybox.len() ); let sh_target = std::fs::read_link(mountpoint.path().join("usr/bin/sh"))?; @@ -293,10 +354,12 @@ fn privileged_oci_pull_mount() -> Result<()> { "sh symlink target mismatch: {sh_target:?}" ); - let app_data = std::fs::read_to_string(mountpoint.path().join("usr/share/myapp/data.txt"))?; + // App layer has a 512-byte README (external, random data) + let readme = std::fs::read(mountpoint.path().join("usr/share/doc/README"))?; ensure!( - app_data == "application-data", - "app data mismatch: {app_data:?}" + readme.len() == 512, + "README size mismatch: expected 512, got {}", + readme.len() ); ensure!(mountpoint.path().join("tmp").is_dir(), "/tmp missing"); @@ -427,3 +490,339 @@ fn privileged_pull_readonly_repo() -> Result<()> { Ok(()) } integration_test!(privileged_pull_readonly_repo); + +// ============================================================================ +// Filesystem-specific reflink / hardlink tests +// ============================================================================ + +/// A temporary directory backed by a loop-mounted 
filesystem. +/// +/// Supports ext4 (with verity) and XFS (with reflinks). The backing sparse +/// file is 512 MB — large enough for a synthetic OCI image in +/// containers-storage plus a composefs repo. +struct LoopTempDir { + mountpoint: PathBuf, + _backing: tempfile::TempDir, +} + +impl LoopTempDir { + /// Create a loop-mounted ext4 filesystem with verity support. + fn ext4_verity() -> Result { + Self::create("mkfs.ext4", &["-q", "-O", "verity", "-b", "4096"]) + } + + /// Create a loop-mounted XFS filesystem with reflink support. + fn xfs_reflink() -> Result { + Self::create("mkfs.xfs", &["-q", "-m", "reflink=1"]) + } + + fn create(mkfs: &str, args: &[&str]) -> Result { + let backing = tempfile::tempdir()?; + let img = backing.path().join("fs.img"); + let mountpoint = backing.path().join("mnt"); + std::fs::create_dir(&mountpoint)?; + + let sh = Shell::new()?; + cmd!(sh, "truncate -s 512M {img}").run()?; + cmd!(sh, "{mkfs} {args...} {img}").run()?; + cmd!(sh, "mount -o loop {img} {mountpoint}").run()?; + + Ok(Self { + mountpoint, + _backing: backing, + }) + } + + fn path(&self) -> &Path { + &self.mountpoint + } +} + +impl Drop for LoopTempDir { + fn drop(&mut self) { + let _ = std::process::Command::new("umount") + .arg(&self.mountpoint) + .status(); + } +} + +/// Create a minimal OCI directory image with files large enough to exercise +/// the `ensure_object_from_file` path (> 64 bytes, the inline threshold). +/// +/// Returns the path to the OCI directory. 
+pub(super) fn create_oci_layout_with_large_files(parent: &Path) -> Result { + use cap_std_ext::cap_std; + use ocidir::oci_spec::image::{ + ConfigBuilder, ImageConfigurationBuilder, Platform, PlatformBuilder, RootFsBuilder, + }; + + let oci_dir = parent.join("oci-image"); + std::fs::create_dir_all(&oci_dir)?; + + let dir = cap_std::fs::Dir::open_ambient_dir(&oci_dir, cap_std::ambient_authority())?; + let ocidir = ocidir::OciDir::ensure(dir)?; + + let mut manifest = ocidir.new_empty_manifest()?.build()?; + + let runtime_config = ConfigBuilder::default().build()?; + let rootfs = RootFsBuilder::default() + .typ("layers") + .diff_ids(Vec::::new()) + .build()?; + let mut config = ImageConfigurationBuilder::default() + .architecture("amd64") + .os("linux") + .rootfs(rootfs) + .config(runtime_config) + .build()?; + + // Create a layer with several files > INLINE_CONTENT_MAX_V0 (64 bytes) + // so they go through the ensure_object_from_file path during cstor import. + // The image must have /usr (required by transform_for_oci). 
+ let mut layer_builder = ocidir.create_layer(None)?; + + // Add /usr directory (required by composefs OCI transformations) + let mut usr_hdr = tar::Header::new_gnu(); + usr_hdr.set_entry_type(tar::EntryType::Directory); + usr_hdr.set_size(0); + usr_hdr.set_mode(0o755); + usr_hdr.set_uid(0); + usr_hdr.set_gid(0); + usr_hdr.set_mtime(1234567890); + usr_hdr.set_cksum(); + layer_builder.append_data(&mut usr_hdr, "usr/", &[] as &[u8])?; + + for i in 0..5u8 { + let data = vec![i.wrapping_mul(0x37); 4096]; + let name = format!("usr/file_{i}.bin"); + let mut header = tar::Header::new_gnu(); + header.set_size(data.len() as u64); + header.set_mode(0o644); + header.set_uid(0); + header.set_gid(0); + header.set_mtime(1234567890); + header.set_cksum(); + layer_builder.append_data(&mut header, &name, &data[..])?; + } + let layer = layer_builder.into_inner()?.complete()?; + + ocidir.push_layer(&mut manifest, &mut config, layer, "test layer", None); + + let platform: Platform = PlatformBuilder::default() + .architecture("amd64") + .os("linux") + .build()?; + ocidir.insert_manifest_and_config(manifest, config, None, platform)?; + + Ok(oci_dir) +} + +/// Copy an OCI directory image into containers-storage on a specific filesystem. +/// +/// Uses `skopeo copy` with the `[overlay@root+runroot]` syntax to target +/// a containers-storage instance at the given mount point. +/// +/// Returns the storage root path. +pub(super) fn copy_oci_to_cstor(sh: &Shell, oci_dir: &Path, mount: &Path) -> Result { + let storage_root = mount.join("storage"); + let run_root = mount.join("run"); + std::fs::create_dir_all(&storage_root)?; + std::fs::create_dir_all(&run_root)?; + + let oci_ref = format!("oci:{}", oci_dir.display()); + let cstor_ref = format!( + "containers-storage:[overlay@{}+{}]test:latest", + storage_root.display(), + run_root.display() + ); + + // Run in a private mount namespace so that any bind mounts the overlay + // driver creates (e.g. 
on storage/overlay) don't leak into our namespace. + // This ensures the diff files we later open are on the raw filesystem, + // not behind a bind mount that would cause EXDEV on hardlinks. + cmd!(sh, "unshare -m skopeo copy {oci_ref} {cstor_ref}").run()?; + + Ok(storage_root) +} + +/// Open a repository at `path`, initializing it first. Uses insecure mode +/// so the tests work on filesystems without verity (XFS). +fn init_insecure_repo_at( + path: &Path, + algorithm: composefs_oci::composefs::fsverity::Algorithm, +) -> Result>> { + std::fs::create_dir_all(path)?; + let fd = rustix::fs::open( + path, + rustix::fs::OFlags::CLOEXEC | rustix::fs::OFlags::RDONLY, + 0.into(), + )?; + let (mut repo, _created) = Repository::::init_path(&fd, ".", algorithm, false)?; + repo.set_insecure(); + Ok(Arc::new(repo)) +} + +/// Pull a containers-storage image into a composefs repo with an explicit +/// storage root and local-fetch mode, and return the import stats. +fn cstor_pull( + repo: &Arc>, + storage_root: &Path, + local_fetch: composefs_oci::LocalFetchOpt, +) -> Result { + let opts = composefs_oci::PullOptions { + local_fetch, + storage_root: Some(storage_root), + ..Default::default() + }; + + let rt = tokio::runtime::Runtime::new()?; + let pull_result = rt + .block_on(async { + composefs_oci::pull(repo, "containers-storage:test:latest", None, opts).await + }) + .context("containers-storage pull failed")?; + Ok(pull_result.stats) +} + +/// Dispatch a cstor pull with the given algorithm and local-fetch mode. +/// +/// This creates a fresh repo at `repo_path` with the appropriate hash type +/// and pulls the `test:latest` image from containers-storage. +fn cstor_pull_with_algorithm( + storage_root: &Path, + repo_path: &Path, + algorithm: composefs_oci::composefs::fsverity::Algorithm, + local_fetch: composefs_oci::LocalFetchOpt, +) -> Result { + match algorithm { + composefs_oci::composefs::fsverity::Algorithm::Sha256 { .. 
} => { + let repo = init_insecure_repo_at::(repo_path, algorithm)?; + cstor_pull(&repo, storage_root, local_fetch) + } + composefs_oci::composefs::fsverity::Algorithm::Sha512 { .. } => { + let repo = init_insecure_repo_at::(repo_path, algorithm)?; + cstor_pull(&repo, storage_root, local_fetch) + } + } +} + +/// On ext4 (no reflink support), the import should skip FICLONE after the +/// first probe fails and use hardlinks instead (zero-copy). The `skopeo +/// copy` step runs in `unshare -m` to prevent the overlay driver's bind +/// mount from interfering with hardlinks. +/// +/// Covers all combinations of `{sha256, sha512} × {auto, zerocopy}`. +fn privileged_cstor_import_ext4_hardlink() -> Result<()> { + use composefs_oci::LocalFetchOpt; + use composefs_oci::composefs::fsverity::Algorithm; + + if require_privileged("privileged_cstor_import_ext4_hardlink")?.is_some() { + return Ok(()); + } + + let algorithms = [(Algorithm::SHA256, "sha256"), (Algorithm::SHA512, "sha512")]; + let modes = [ + (LocalFetchOpt::IfPossible, "auto"), + (LocalFetchOpt::ZeroCopy, "zerocopy"), + ]; + + let sh = Shell::new()?; + + for (algorithm, alg_name) in &algorithms { + // One LoopTempDir per algorithm: enabling verity is irreversible and + // algorithm-specific, so we cannot reuse across algorithms. 
+ let fs = LoopTempDir::ext4_verity()?; + + let oci_dir = create_oci_layout_with_large_files(fs.path())?; + let storage_root = copy_oci_to_cstor(&sh, &oci_dir, fs.path())?; + + for (mode, mode_name) in &modes { + let repo_dir = fs.path().join(format!("repo-{alg_name}-{mode_name}")); + let stats = cstor_pull_with_algorithm(&storage_root, &repo_dir, *algorithm, *mode)?; + + println!("ext4 [{alg_name}/{mode_name}] import stats: {stats:?}"); + ensure!( + stats.objects_reflinked == 0, + "ext4 [{alg_name}/{mode_name}] should not reflink any objects, got {} reflinked", + stats.objects_reflinked, + ); + ensure!( + stats.objects_hardlinked > 0, + "ext4 [{alg_name}/{mode_name}] should hardlink objects, got 0 hardlinked (copied={})", + stats.objects_copied, + ); + ensure!( + stats.objects_copied == 0, + "ext4 [{alg_name}/{mode_name}] should not need copies, got {} copied", + stats.objects_copied, + ); + println!( + "ext4 [{alg_name}/{mode_name}]: {} hardlinked, {} already present", + stats.objects_hardlinked, stats.objects_already_present, + ); + } + } + + Ok(()) +} +integration_test!(privileged_cstor_import_ext4_hardlink); + +/// On XFS with reflink support, importing from containers-storage on the same +/// filesystem should use reflinks (zero-copy), and the import stats should +/// show `objects_reflinked > 0`. +/// +/// Covers all combinations of `{sha256, sha512} × {auto, zerocopy}`. +fn privileged_cstor_import_xfs_reflink() -> Result<()> { + use composefs_oci::LocalFetchOpt; + use composefs_oci::composefs::fsverity::Algorithm; + + if require_privileged("privileged_cstor_import_xfs_reflink")?.is_some() { + return Ok(()); + } + + // Skip if mkfs.xfs is not available (e.g. Debian bootc images). 
+ if !Path::new("/usr/sbin/mkfs.xfs").exists() && !Path::new("/sbin/mkfs.xfs").exists() { + println!("SKIP: mkfs.xfs not available"); + return Ok(()); + } + + let algorithms = [(Algorithm::SHA256, "sha256"), (Algorithm::SHA512, "sha512")]; + let modes = [ + (LocalFetchOpt::IfPossible, "auto"), + (LocalFetchOpt::ZeroCopy, "zerocopy"), + ]; + + let sh = Shell::new()?; + + for (algorithm, alg_name) in &algorithms { + let fs = LoopTempDir::xfs_reflink()?; + + let oci_dir = create_oci_layout_with_large_files(fs.path())?; + let storage_root = copy_oci_to_cstor(&sh, &oci_dir, fs.path())?; + + for (mode, mode_name) in &modes { + let repo_dir = fs.path().join(format!("repo-{alg_name}-{mode_name}")); + let stats = cstor_pull_with_algorithm(&storage_root, &repo_dir, *algorithm, *mode)?; + + println!("XFS [{alg_name}/{mode_name}] import stats: {stats:?}"); + ensure!( + stats.objects_reflinked > 0, + "XFS [{alg_name}/{mode_name}] should reflink objects, got 0 reflinked (hardlinked={}, copied={})", + stats.objects_hardlinked, + stats.objects_copied, + ); + ensure!( + stats.objects_copied == 0, + "XFS [{alg_name}/{mode_name}] should not need copies, got {} copied", + stats.objects_copied, + ); + println!( + "XFS [{alg_name}/{mode_name}]: {} reflinked, {} already present", + stats.objects_reflinked, stats.objects_already_present, + ); + } + } + + Ok(()) +} +integration_test!(privileged_cstor_import_xfs_reflink);