From a6930449eba38d69d8f5d8fe8b6ca0a3cc7fc501 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Tue, 7 Apr 2026 11:32:54 -0400 Subject: [PATCH 01/10] tests: Use realistic file sizes and xattrs in test image layers While some of our tests pull large full images, it's handy to have a fully reproducible, self-contained, but also representative image. Extend our synthetic one with some large files - this is prep for testing the reflink/hardlink support for containers-storage imports. Add a security.capability xattr etc. Assisted-by: OpenCode (Claude Opus 4) Signed-off-by: Colin Walters --- crates/composefs-oci/Cargo.toml | 4 +- crates/composefs-oci/src/lib.rs | 23 +-- crates/composefs-oci/src/test_util.rs | 155 +++++++++++++----- .../integration-tests/src/tests/privileged.rs | 16 +- 4 files changed, 139 insertions(+), 59 deletions(-) diff --git a/crates/composefs-oci/Cargo.toml b/crates/composefs-oci/Cargo.toml index 7898a68d..f3b2a56c 100644 --- a/crates/composefs-oci/Cargo.toml +++ b/crates/composefs-oci/Cargo.toml @@ -11,7 +11,7 @@ rust-version.workspace = true version.workspace = true [features] -test = ["tar", "composefs/test"] +test = ["tar", "rand", "composefs/test"] boot = ["composefs-boot"] [dependencies] @@ -29,6 +29,7 @@ serde = { version = "1.0", default-features = false, features = ["derive"] } thiserror = { version = "2.0.0", default-features = false } serde_json = { version = "1.0", default-features = false, features = ["std"] } sha2 = { version = "0.11.0", default-features = false } +rand = { version = "0.10.0", default-features = false, optional = true } tar = { version = "0.4.38", default-features = false, optional = true } tar-core = "0.1.0" tokio = { version = "1.24.2", features = ["macros", "rt-multi-thread"] } @@ -39,6 +40,7 @@ cap-std = { version = "4.0.0", default-features = false } cap-tempfile = { version = "4.0.0", default-features = false } similar-asserts = "1.7.0" tar = { version = "0.4.38", default-features = false } +rand = { version = "0.10.0", 
default-features = false } composefs = { workspace = true, features = ["test"] } composefs-boot = { workspace = true } once_cell = "1.21.3" diff --git a/crates/composefs-oci/src/lib.rs b/crates/composefs-oci/src/lib.rs index 4dfaa98d..54de6ecc 100644 --- a/crates/composefs-oci/src/lib.rs +++ b/crates/composefs-oci/src/lib.rs @@ -868,25 +868,27 @@ mod test { "\ / 0 40755 6 0 0 0 0.0 - - - /etc 0 40755 2 0 0 0 0.0 - - - -/etc/hostname 9 100644 1 0 0 0 0.0 - testhost\\n - +/etc/hostname 9 100644 1 0 0 0 0.0 - test-host - /etc/os-release 23 100644 1 0 0 0 0.0 - ID=test\\nVERSION_ID=1.0\\n - -/etc/passwd 34 100644 1 0 0 0 0.0 - root:x:0:0:root:/root:/usr/bin/sh\\n - +/etc/passwd 100 100644 1 0 0 0 0.0 f2/c4fd5735bd46db3b18d402ae87c5086c97c0e1321901cfd30f320b73ef25aa - f2c4fd5735bd46db3b18d402ae87c5086c97c0e1321901cfd30f320b73ef25aa /tmp 0 40755 2 0 0 0 0.0 - - - /usr 0 40755 5 0 0 0 0.0 - - - /usr/bin 0 40755 2 0 0 0 0.0 - - - -/usr/bin/busybox 22 100755 1 0 0 0 0.0 - busybox-binary-content - +/usr/bin/busybox 4096 100755 1 0 0 0 0.0 f0/f7e1e58fdd31f5792222087377a4a976760c416ecdf5f426193e608681b7a1 - f0f7e1e58fdd31f5792222087377a4a976760c416ecdf5f426193e608681b7a1 /usr/bin/cat 7 120777 1 0 0 0 0.0 busybox - - /usr/bin/cp 7 120777 1 0 0 0 0.0 busybox - - /usr/bin/ls 7 120777 1 0 0 0 0.0 busybox - - /usr/bin/mv 7 120777 1 0 0 0 0.0 busybox - - -/usr/bin/myapp 25 100755 1 0 0 0 0.0 - #!/usr/bin/sh\\necho\\x20hello\\n - +/usr/bin/ping 7 120777 1 0 0 0 0.0 busybox - - security.capability=\\x02\\x00\\x00\\x02\\x00\\x20\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00 /usr/bin/rm 7 120777 1 0 0 0 0.0 busybox - - /usr/bin/sh 7 120777 1 0 0 0 0.0 busybox - - /usr/lib 0 40755 2 0 0 0 0.0 - - - /usr/share 0 40755 3 0 0 0 0.0 - - - -/usr/share/myapp 0 40755 2 0 0 0 0.0 - - - -/usr/share/myapp/data.txt 16 100644 1 0 0 0 0.0 - application-data - -/var 0 40755 2 0 0 0 0.0 - - - +/usr/share/doc 0 40755 2 0 0 0 0.0 - - - +/usr/share/doc/README 512 100644 1 0 0 0 0.0 
51/44b8f80be57c3518f410d930e18c4e405387c82e4993c18265a1ba4a80263b - 5144b8f80be57c3518f410d930e18c4e405387c82e4993c18265a1ba4a80263b +/var 0 40755 3 0 0 0 0.0 - - - +/var/data 0 40755 2 0 0 0 0.0 - - - +/var/data/app.json 256 100644 1 0 0 0 0.0 c9/21965b74ac1780bc437cec640b27186d85317b9afdb3dbb68626aed5ecd2b6 - c921965b74ac1780bc437cec640b27186d85317b9afdb3dbb68626aed5ecd2b6 " ); } @@ -936,9 +938,10 @@ mod test { // Untag and GC — everything gets collected oci_image::untag_image(repo, "gctest:v1").unwrap(); let gc2 = repo.gc(&[]).unwrap(); - // 10 objects: 5 layer splitstreams + config JSON + manifest JSON - // + EROFS image + new config splitstream + new manifest splitstream - assert_eq!(gc2.objects_removed, 10, "all objects collected after untag"); + // 14 objects: 5 layer splitstreams + 4 external file objects + // + config JSON + manifest JSON + EROFS image + // + new config splitstream + new manifest splitstream + assert_eq!(gc2.objects_removed, 14, "all objects collected after untag"); // 7 streams: 5 layers + 1 config + 1 manifest (tag ref removed by untag) assert_eq!(gc2.streams_pruned, 7, "all stream symlinks pruned"); // 1 image: the EROFS symlink under images/ diff --git a/crates/composefs-oci/src/test_util.rs b/crates/composefs-oci/src/test_util.rs index 9953350e..8ff04b13 100644 --- a/crates/composefs-oci/src/test_util.rs +++ b/crates/composefs-oci/src/test_util.rs @@ -18,7 +18,6 @@ /// /usr/bin/sh 0 120777 1 0 0 0 0.0 busybox - - /// ``` use std::collections::HashMap; -use std::io::Read as _; use std::sync::Arc; use crate::oci_image::write_manifest; @@ -41,12 +40,50 @@ fn hash(bytes: &[u8]) -> OciDigest { .unwrap() } +/// Write a PAX extended header entry followed by the real entry to a +/// [`tar::Builder`]. +/// +/// The `xattrs` are encoded as `SCHILY.xattr.=` PAX records, +/// which is the de-facto standard used by GNU tar, BSD tar, and all +/// container runtimes. 
+fn append_with_xattrs( + builder: &mut ::tar::Builder, + header: &mut ::tar::Header, + path: &str, + data: &[u8], + xattrs: &[(String, Vec)], +) { + // Build the PAX extended header payload using tar_core's PaxBuilder. + let mut pax = tar_core::builder::PaxBuilder::new(); + for (key, value) in xattrs { + pax.add(&format!("SCHILY.xattr.{key}"), value); + } + let pax_data = pax.finish(); + + // Write the PAX header entry (type 'x'). + let mut pax_header = ::tar::Header::new_ustar(); + pax_header.set_entry_type(::tar::EntryType::XHeader); + pax_header.set_size(pax_data.len() as u64); + pax_header.set_mode(0o644); + let pax_path = format!("PaxHeader/{path}"); + builder + .append_data(&mut pax_header, &pax_path, &pax_data[..]) + .unwrap(); + + // Write the actual entry immediately after (same archive stream). + builder.append_data(header, path, data).unwrap(); +} + /// Convert composefs dumpfile lines into tar bytes. /// /// Parses each line as a composefs [`Entry`] and builds the corresponding /// tar entry. The root directory (`/`) is skipped since tar archives don't -/// include it. Only regular files (inline), directories, and symlinks are -/// supported — this is sufficient for test images. +/// include it. Regular files (inline and external), directories, and +/// symlinks are supported. Any xattrs present (e.g. `security.capability`) +/// are emitted as PAX extended headers. +/// +/// External files (`Item::Regular` with no inline content) get +/// deterministic pseudo-random data from a seeded RNG (keyed on file size). fn dumpfile_to_tar(dumpfile: &str) -> Vec { let mut builder = ::tar::Builder::new(vec![]); @@ -71,6 +108,20 @@ fn dumpfile_to_tar(dumpfile: &str) -> Vec { .expect("non-UTF8 path") .trim_start_matches('/'); + // Collect xattrs for PAX headers. 
+ let xattrs: Vec<(String, Vec)> = entry + .xattrs + .iter() + .map(|x| { + let key = x + .key + .to_str() + .unwrap_or_else(|| panic!("non-UTF8 xattr key: {:?}", x.key)); + (key.to_owned(), x.value.to_vec()) + }) + .collect(); + let has_xattrs = !xattrs.is_empty(); + let ty = FileType::from_raw_mode(entry.mode); match ty { FileType::Directory => { @@ -80,36 +131,40 @@ fn dumpfile_to_tar(dumpfile: &str) -> Vec { header.set_mode(entry.mode & 0o7777); header.set_entry_type(::tar::EntryType::Directory); header.set_size(0); - builder - .append_data(&mut header, path, std::io::empty()) - .unwrap(); - } - FileType::RegularFile => match &entry.item { - Item::RegularInline { content, .. } => { - let mut header = ::tar::Header::new_ustar(); - header.set_uid(entry.uid.into()); - header.set_gid(entry.gid.into()); - header.set_mode(entry.mode & 0o7777); - header.set_entry_type(::tar::EntryType::Regular); - header.set_size(content.len() as u64); + if has_xattrs { + append_with_xattrs(&mut builder, &mut header, path, &[], &xattrs); + } else { builder - .append_data(&mut header, path, &content[..]) + .append_data(&mut header, path, std::io::empty()) .unwrap(); } - Item::Regular { size, .. } => { - // External file with no inline content — create sized entry - let mut header = ::tar::Header::new_ustar(); - header.set_uid(entry.uid.into()); - header.set_gid(entry.gid.into()); - header.set_mode(entry.mode & 0o7777); - header.set_entry_type(::tar::EntryType::Regular); - header.set_size(*size); + } + FileType::RegularFile => { + let content: Vec = match &entry.item { + Item::RegularInline { content, .. } => content.to_vec(), + Item::Regular { size, .. 
} => { + use rand::{RngExt, SeedableRng, rngs::SmallRng}; + let mut rng = SmallRng::seed_from_u64(*size); + let mut buf = vec![0u8; *size as usize]; + rng.fill(&mut buf[..]); + buf + } + other => panic!("unexpected regular file item variant: {other:?}"), + }; + let mut header = ::tar::Header::new_ustar(); + header.set_uid(entry.uid.into()); + header.set_gid(entry.gid.into()); + header.set_mode(entry.mode & 0o7777); + header.set_entry_type(::tar::EntryType::Regular); + header.set_size(content.len() as u64); + if has_xattrs { + append_with_xattrs(&mut builder, &mut header, path, &content, &xattrs); + } else { builder - .append_data(&mut header, path, std::io::repeat(0u8).take(*size)) + .append_data(&mut header, path, &content[..]) .unwrap(); } - other => panic!("unexpected regular file item variant: {other:?}"), - }, + } FileType::Symlink => { let target = match &entry.item { Item::Symlink { target, .. } => target, @@ -124,9 +179,13 @@ fn dumpfile_to_tar(dumpfile: &str) -> Vec { header .set_link_name(target.as_ref()) .expect("failed to set symlink target"); - builder - .append_data(&mut header, path, std::io::empty()) - .unwrap(); + if has_xattrs { + append_with_xattrs(&mut builder, &mut header, path, &[], &xattrs); + } else { + builder + .append_data(&mut header, path, std::io::empty()) + .unwrap(); + } } other => panic!("unsupported file type in test dumpfile: {other:?}"), } @@ -252,12 +311,16 @@ async fn create_multi_layer_image( // --------------------------------------------------------------------------- // Layer definitions in composefs dumpfile format // -// Format: /path size mode nlink uid gid rdev mtime payload content digest +// Format: /path size mode nlink uid gid rdev mtime payload content digest [xattr=val ...] 
// // Directories: /path 0 40755 2 0 0 0 0.0 - - - // Inline files: /path 100644 1 0 0 0 0.0 - - +// External: /path 100644 1 0 0 0 0.0 / - - (data is auto-generated) // Executables: /path 100755 1 0 0 0 0.0 - - // Symlinks: /path 120777 1 0 0 0 0.0 - - +// +// Xattrs are appended as space-separated key=value pairs after the digest +// (e.g. security.capability for file capabilities). // --------------------------------------------------------------------------- const LAYER_ROOT_STRUCTURE: &str = "\ @@ -271,14 +334,18 @@ const LAYER_ROOT_STRUCTURE: &str = "\ /tmp 0 40755 2 0 0 0 0.0 - - - "; +/// Busybox layer with a 4 KiB binary (external, > INLINE_CONTENT_MAX_V0). const LAYER_BUSYBOX: &str = "\ / 0 40755 2 0 0 0 0.0 - - - /usr 0 40755 2 0 0 0 0.0 - - - /usr/bin 0 40755 2 0 0 0 0.0 - - - -/usr/bin/busybox 22 100755 1 0 0 0 0.0 - busybox-binary-content - +/usr/bin/busybox 4096 100755 1 0 0 0 0.0 / - - /usr/bin/sh 7 120777 1 0 0 0 0.0 busybox - - "; +/// Core utils layer. `/usr/bin/ping` carries a `security.capability` xattr +/// granting CAP_NET_RAW (VFS_CAP_REVISION_2), which is the realistic pattern +/// seen in real container images. const LAYER_CORE_UTILS: &str = "\ / 0 40755 2 0 0 0 0.0 - - - /usr 0 40755 2 0 0 0 0.0 - - - @@ -287,25 +354,29 @@ const LAYER_CORE_UTILS: &str = "\ /usr/bin/cat 7 120777 1 0 0 0 0.0 busybox - - /usr/bin/cp 7 120777 1 0 0 0 0.0 busybox - - /usr/bin/mv 7 120777 1 0 0 0 0.0 busybox - - +/usr/bin/ping 7 120777 1 0 0 0 0.0 busybox - - security.capability=\\x02\\x00\\x00\\x02\\x00\\x20\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00 /usr/bin/rm 7 120777 1 0 0 0 0.0 busybox - - "; +/// Config layer with /etc/passwd as a 100-byte external file. 
const LAYER_CONFIG: &str = "\ / 0 40755 2 0 0 0 0.0 - - - /etc 0 40755 2 0 0 0 0.0 - - - /etc/os-release 26 100644 1 0 0 0 0.0 - ID=test\\nVERSION_ID=1.0\\n - -/etc/hostname 9 100644 1 0 0 0 0.0 - testhost\\n - -/etc/passwd 36 100644 1 0 0 0 0.0 - root:x:0:0:root:/root:/usr/bin/sh\\n - +/etc/hostname 9 100644 1 0 0 0 0.0 - test-host - +/etc/passwd 100 100644 1 0 0 0 0.0 / - - "; +/// App layer with a 512-byte README and 256-byte JSON (both external). const LAYER_APP: &str = "\ / 0 40755 2 0 0 0 0.0 - - - /usr 0 40755 2 0 0 0 0.0 - - - /usr/share 0 40755 2 0 0 0 0.0 - - - -/usr/share/myapp 0 40755 2 0 0 0 0.0 - - - -/usr/share/myapp/data.txt 16 100644 1 0 0 0 0.0 - application-data - -/usr/bin 0 40755 2 0 0 0 0.0 - - - -/usr/bin/myapp 26 100755 1 0 0 0 0.0 - #!/usr/bin/sh\\necho\\x20hello\\n - +/usr/share/doc 0 40755 2 0 0 0 0.0 - - - +/usr/share/doc/README 512 100644 1 0 0 0 0.0 / - - +/var 0 40755 2 0 0 0 0.0 - - - +/var/data 0 40755 2 0 0 0 0.0 - - - +/var/data/app.json 256 100644 1 0 0 0 0.0 / - - "; const LAYER_BOOT_DIRS: &str = "\ @@ -420,16 +491,16 @@ const LAYER_LIBS_1: &str = "\ / 0 40755 2 0 0 0 0.0 - - - /usr 0 40755 2 0 0 0 0.0 - - - /usr/lib 0 40755 2 0 0 0 0.0 - - - -/usr/lib/libc.so.6 16 100644 1 0 0 0 0.0 - fake-libc-content - -/usr/lib/libm.so.6 16 100644 1 0 0 0 0.0 - fake-libm-content - +/usr/lib/libc.so.6 8192 100644 1 0 0 0 0.0 / - - +/usr/lib/libm.so.6 4096 100644 1 0 0 0 0.0 / - - "; const LAYER_LIBS_2: &str = "\ / 0 40755 2 0 0 0 0.0 - - - /usr 0 40755 2 0 0 0 0.0 - - - /usr/lib 0 40755 2 0 0 0 0.0 - - - -/usr/lib/libpthread.so.0 22 100644 1 0 0 0 0.0 - fake-libpthread-content - -/usr/lib/libdl.so.2 16 100644 1 0 0 0 0.0 - fake-libdl-content - +/usr/lib/libpthread.so.0 4096 100644 1 0 0 0 0.0 / - - +/usr/lib/libdl.so.2 2048 100644 1 0 0 0 0.0 / - - "; const LAYER_LOCALE: &str = "\ diff --git a/crates/integration-tests/src/tests/privileged.rs b/crates/integration-tests/src/tests/privileged.rs index 6e39bff3..4c218bf1 100644 --- 
a/crates/integration-tests/src/tests/privileged.rs +++ b/crates/integration-tests/src/tests/privileged.rs @@ -273,7 +273,7 @@ fn privileged_oci_pull_mount() -> Result<()> { // Verify file content at the mountpoint let hostname = std::fs::read_to_string(mountpoint.path().join("etc/hostname"))?; - ensure!(hostname == "testhost\n", "hostname mismatch: {hostname:?}"); + ensure!(hostname == "test-host", "hostname mismatch: {hostname:?}"); let os_release = std::fs::read_to_string(mountpoint.path().join("etc/os-release"))?; ensure!( @@ -281,10 +281,12 @@ fn privileged_oci_pull_mount() -> Result<()> { "os-release missing ID: {os_release:?}" ); + // busybox is a 4096-byte external file (random data seeded from size) let busybox = std::fs::read(mountpoint.path().join("usr/bin/busybox"))?; ensure!( - busybox == b"busybox-binary-content", - "busybox content mismatch" + busybox.len() == 4096, + "busybox size mismatch: expected 4096, got {}", + busybox.len() ); let sh_target = std::fs::read_link(mountpoint.path().join("usr/bin/sh"))?; @@ -293,10 +295,12 @@ fn privileged_oci_pull_mount() -> Result<()> { "sh symlink target mismatch: {sh_target:?}" ); - let app_data = std::fs::read_to_string(mountpoint.path().join("usr/share/myapp/data.txt"))?; + // App layer has a 512-byte README (external, random data) + let readme = std::fs::read(mountpoint.path().join("usr/share/doc/README"))?; ensure!( - app_data == "application-data", - "app data mismatch: {app_data:?}" + readme.len() == 512, + "README size mismatch: expected 512, got {}", + readme.len() ); ensure!(mountpoint.path().join("tmp").is_dir(), "/tmp missing"); From 5687d60d9c24f4d521e8d5521c3f4912458910b4 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Thu, 2 Apr 2026 21:35:01 +0000 Subject: [PATCH 02/10] composefs: Remove unused test imports Remove unused Mode and mkdirat imports from splitstream test module, left behind by the edition 2024 migration. 
Assisted-by: OpenCode (Claude Opus 4) Signed-off-by: Colin Walters --- crates/composefs/src/repository.rs | 2 +- crates/composefs/src/splitstream.rs | 2 +- crates/composefs/tests/mkfs.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/composefs/src/repository.rs b/crates/composefs/src/repository.rs index e402a82c..2930cfcb 100644 --- a/crates/composefs/src/repository.rs +++ b/crates/composefs/src/repository.rs @@ -3454,7 +3454,7 @@ mod tests { Ok(()) } - use crate::tree::{FileSystem, Inode, Leaf, LeafContent, RegularFile, Stat}; + use crate::tree::{FileSystem, Inode, LeafContent, RegularFile, Stat}; /// Create a default root stat for test filesystems fn test_root_stat() -> Stat { diff --git a/crates/composefs/src/splitstream.rs b/crates/composefs/src/splitstream.rs index ea4c6d1a..19d3b57f 100644 --- a/crates/composefs/src/splitstream.rs +++ b/crates/composefs/src/splitstream.rs @@ -969,7 +969,7 @@ mod tests { use super::*; use crate::fsverity::{Sha256HashValue, compute_verity}; use crate::test::tempdir; - use rustix::fs::{CWD, Mode, mkdirat}; + use rustix::fs::CWD; use std::io::Cursor; use std::path::Path; diff --git a/crates/composefs/tests/mkfs.rs b/crates/composefs/tests/mkfs.rs index ae8d83a8..b2896c69 100644 --- a/crates/composefs/tests/mkfs.rs +++ b/crates/composefs/tests/mkfs.rs @@ -14,7 +14,7 @@ use composefs::{ dumpfile::write_dumpfile, erofs::{debug::debug_img, writer::mkfs_erofs}, fsverity::{FsVerityHashValue, Sha256HashValue}, - tree::{FileSystem, Inode, Leaf, LeafContent, RegularFile, Stat}, + tree::{FileSystem, Inode, LeafContent, RegularFile, Stat}, }; fn default_stat() -> Stat { From 1d601533e34c0b1be3fe76757dac728179fc9522 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Wed, 1 Apr 2026 00:45:42 +0000 Subject: [PATCH 03/10] bootc: Improve patch target for reliable rebuilds Three improvements to the bootc revdep testing workflow: - Reject uncommitted changes in composefs-rs so tests always run against a real commit - 
Embed the composefs-rs commit hash in a Cargo.toml comment so podman's build cache is invalidated when the source changes (path deps don't record a revision in Cargo.lock) - Always re-run cargo update cfsctl on each invocation Assisted-by: OpenCode (Claude Opus 4) Signed-off-by: Colin Walters --- bootc/Justfile | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/bootc/Justfile b/bootc/Justfile index f9ad20cc..8ad4a840 100644 --- a/bootc/Justfile +++ b/bootc/Justfile @@ -45,20 +45,22 @@ clone: # `rev` (done when the bootc branch is updated) busts the podman build # cache. This recipe just adds a [patch] override so Cargo resolves from # the bind-mounted local source instead of fetching from git. +# +# Errors if the composefs-rs tree has uncommitted changes. patch: clone #!/bin/bash set -euo pipefail - cd "$COMPOSEFS_BOOTC_PATH" - cfs_path="$_COMPOSEFS_SRC/crates/cfsctl" - - # Check if already patched (idempotent) - if grep -q 'Patched by composefs-rs' Cargo.toml 2>/dev/null; then - echo "bootc already patched for composefs-rs" - exit 0 + # Require a clean composefs-rs working tree so we test a real commit + if ! git -C "$_COMPOSEFS_SRC" diff --quiet HEAD 2>/dev/null; then + echo "error: composefs-rs has uncommitted changes — commit or stash first" >&2 + git -C "$_COMPOSEFS_SRC" status --short >&2 + exit 1 fi - echo "Patching bootc Cargo.toml to use $_COMPOSEFS_SRC" + cfs_path="$_COMPOSEFS_SRC/crates/cfsctl" + + cd "$COMPOSEFS_BOOTC_PATH" # Add or update the [patch] section with a path override patch_value="cfsctl = { path = \"${cfs_path}\" } # Patched by composefs-rs" @@ -85,7 +87,12 @@ patch: clone # We intentionally don't run `cargo update` here because it rewrites # the workspace dependency line in Cargo.toml (replacing git+rev with path). - echo "bootc patched successfully" + # Update the rev comment in the [patch] section so Cargo.toml actually + # changes when composefs-rs moves to a new commit. 
Since the file is + # part of the podman build context this busts the layer cache. + _rev=$(git -C "$_COMPOSEFS_SRC" rev-parse HEAD) + sed -i "s/^# Patched by composefs-rs.*/# Patched by composefs-rs at ${_rev}/" Cargo.toml + echo "bootc patched for composefs-rs at ${_rev}" # Build sealed bootc image using local composefs-rs # The path dependency is auto-detected and bind-mounted by bootc's Justfile @@ -158,6 +165,6 @@ config: Example Usage: just bootc/build # Clone main, patch, and build - COMPOSEFS_BOOTC_REF=v1.2.0 just bootc/build # Use specific tag + COMPOSEFS_BOOTC_REF=v1.0.0 just bootc/build # Use specific tag COMPOSEFS_BOOTC_REF=refs/pull/1791/head just bootc/build # Use PR EOF From 63588eb2b6a1da1219212f26334adcc70aafbe38 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Tue, 7 Apr 2026 11:57:34 -0400 Subject: [PATCH 04/10] Add cstorage crate for containers-storage access Add a new cstorage crate that provides read-only access to containers/storage (the backend used by Podman, Buildah, and CRI-O). This enables composefs to directly read OCI layers from local container stores without re-downloading them. The crate supports: - Overlay storage driver with layer chain resolution - Additional image stores (read-only stores) - Rootless storage via user namespace helpers - tar-split based layer reconstruction - JSON-RPC over Unix socket for user namespace operations - Container store locking (shared/exclusive) Also adds jsonrpc-fdpass as a workspace dependency for the Unix socket IPC used by the user namespace helper. 
Assisted-by: OpenCode (Claude Opus 4) Signed-off-by: Colin Walters --- Cargo.toml | 3 + crates/cstorage/Cargo.toml | 41 + crates/cstorage/src/config.rs | 119 +++ crates/cstorage/src/error.rs | 64 ++ crates/cstorage/src/image.rs | 273 +++++++ crates/cstorage/src/layer.rs | 489 ++++++++++++ crates/cstorage/src/lib.rs | 76 ++ crates/cstorage/src/storage.rs | 623 +++++++++++++++ crates/cstorage/src/tar_split.rs | 711 +++++++++++++++++ crates/cstorage/src/userns.rs | 67 ++ crates/cstorage/src/userns_helper.rs | 1086 ++++++++++++++++++++++++++ 11 files changed, 3552 insertions(+) create mode 100644 crates/cstorage/Cargo.toml create mode 100644 crates/cstorage/src/config.rs create mode 100644 crates/cstorage/src/error.rs create mode 100644 crates/cstorage/src/image.rs create mode 100644 crates/cstorage/src/layer.rs create mode 100644 crates/cstorage/src/lib.rs create mode 100644 crates/cstorage/src/storage.rs create mode 100644 crates/cstorage/src/tar_split.rs create mode 100644 crates/cstorage/src/userns.rs create mode 100644 crates/cstorage/src/userns_helper.rs diff --git a/Cargo.toml b/Cargo.toml index 06b9a600..2c2d7ef6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,9 @@ composefs-oci = { version = "0.3.0", path = "crates/composefs-oci", default-feat composefs-boot = { version = "0.3.0", path = "crates/composefs-boot", default-features = false } composefs-http = { version = "0.3.0", path = "crates/composefs-http", default-features = false } +# JSON-RPC with FD passing for userns helper +jsonrpc-fdpass = { version = "0.1.0", default-features = false } + [profile.dev.package.sha2] # this is *really* slow otherwise opt-level = 3 diff --git a/crates/cstorage/Cargo.toml b/crates/cstorage/Cargo.toml new file mode 100644 index 00000000..e4e524d8 --- /dev/null +++ b/crates/cstorage/Cargo.toml @@ -0,0 +1,41 @@ +[package] +name = "cstorage" +description = "Read-only access to containers-storage (overlay driver)" +keywords = ["containers", "storage", "overlay", "podman", 
"buildah"] + +edition.workspace = true +license.workspace = true +readme.workspace = true +repository.workspace = true +rust-version.workspace = true +version.workspace = true + +[dependencies] +anyhow = { version = "1.0", default-features = false, features = ["std"] } +base64 = { version = "0.22", default-features = false, features = ["std"] } +cap-std = { version = "4.0", default-features = false } +cap-std-ext = { version = "4.0", default-features = false } +crc = { version = "3.0", default-features = false } +flate2 = { version = "1.0", default-features = false, features = ["rust_backend"] } +jsonrpc-fdpass = { workspace = true, optional = true } +oci-spec = { version = "0.8", default-features = false, features = ["image"] } +rustix = { version = "1.0", default-features = false, features = ["fs", "std", "process", "thread"] } +serde = { version = "1.0", default-features = false, features = ["derive"] } +serde_json = { version = "1.0", default-features = false, features = ["std"] } +sha2 = { version = "0.10", default-features = false, features = ["std"] } +tar-core = "0.1.0" +thiserror = { version = "2.0", default-features = false } +tokio = { version = "1.40", default-features = false, features = ["rt", "net", "sync"], optional = true } +toml = { version = "0.8", default-features = false, features = ["parse"] } +tracing = { version = "0.1", default-features = false, optional = true } +zstd = { version = "0.13", default-features = false } + +[features] +default = [] +userns-helper = ["dep:jsonrpc-fdpass", "dep:tokio", "dep:tracing"] + +[dev-dependencies] +tempfile = { version = "3.8", default-features = false } + +[lints] +workspace = true diff --git a/crates/cstorage/src/config.rs b/crates/cstorage/src/config.rs new file mode 100644 index 00000000..8d8d14a2 --- /dev/null +++ b/crates/cstorage/src/config.rs @@ -0,0 +1,119 @@ +//! Configuration parsing for container storage. +//! +//! This module provides structures for parsing storage.conf files used by +//! 
containers-storage. Configuration files define storage locations, drivers, +//! and additional read-only image stores. +//! +//! # Overview +//! +//! Container storage configuration is typically found in: +//! - System-wide: `/etc/containers/storage.conf` +//! - User-specific: `~/.config/containers/storage.conf` +//! +//! The configuration uses TOML format and specifies the storage driver +//! (overlay, btrfs, etc.), root paths, and additional layer/image stores. +//! +//! # Configuration Structure +//! +//! A typical storage.conf file looks like: +//! ```toml +//! [storage] +//! driver = "overlay" +//! root = "/var/lib/containers/storage" +//! run_root = "/run/containers/storage" +//! +//! # Additional read-only image stores +//! image_stores = [ +//! "/usr/share/containers/storage" +//! ] +//! +//! # Additional layer stores configuration +//! [[storage.layer_stores]] +//! path = "/mnt/layers" +//! with_reference = true +//! ``` + +use serde::Deserialize; +use std::path::PathBuf; + +/// Storage configuration, typically parsed from storage.conf files. +/// +/// Configuration files are searched in: +/// - `/etc/containers/storage.conf` +/// - `$HOME/.config/containers/storage.conf` +#[derive(Debug, Clone, Deserialize)] +pub struct StorageConfig { + /// Storage driver name (should be "overlay" for this library). + #[serde(default)] + pub driver: String, + + /// Primary storage root path. + #[serde(default)] + pub root: PathBuf, + + /// Runtime root for transient data. + #[serde(default)] + pub run_root: PathBuf, + + /// Additional read-only image stores. + #[serde(default)] + pub image_stores: Vec, + + /// Additional layer stores configuration. + #[serde(default)] + pub layer_stores: Vec, +} + +/// Configuration for an additional layer store. +#[derive(Debug, Clone, Deserialize)] +pub struct AdditionalLayerStore { + /// Path to the additional layer store. + pub path: PathBuf, + + /// Whether to use base64-encoded references in paths. 
+ #[serde(default)] + pub with_reference: bool, +} + +impl StorageConfig { + /// Parse storage configuration from TOML content. + /// + /// # Errors + /// + /// Returns an error if the TOML content is invalid. + pub fn from_toml(content: &str) -> Result { + toml::from_str(content) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_basic_config() { + let config_str = r#" +driver = "overlay" +root = "/var/lib/containers/storage" +"#; + let config = StorageConfig::from_toml(config_str).unwrap(); + assert_eq!(config.driver, "overlay"); + assert_eq!(config.root, PathBuf::from("/var/lib/containers/storage")); + } + + #[test] + fn test_parse_with_layer_stores() { + let config_str = r#" +driver = "overlay" +root = "/var/lib/containers/storage" + +[[layer_stores]] +path = "/mnt/layers" +with_reference = true +"#; + let config = StorageConfig::from_toml(config_str).unwrap(); + assert_eq!(config.layer_stores.len(), 1); + assert_eq!(config.layer_stores[0].path, PathBuf::from("/mnt/layers")); + assert!(config.layer_stores[0].with_reference); + } +} diff --git a/crates/cstorage/src/error.rs b/crates/cstorage/src/error.rs new file mode 100644 index 00000000..959042f3 --- /dev/null +++ b/crates/cstorage/src/error.rs @@ -0,0 +1,64 @@ +//! Error types for the cstorage library. +//! +//! This module defines the error types used throughout the library. All operations +//! that can fail return a [`Result`] which is an alias for `Result`. +//! +//! # Error Categories +//! +//! Errors are organized into several categories: +//! +//! - **Storage errors**: [`RootNotFound`], [`InvalidStorage`] +//! - **Entity errors**: [`LayerNotFound`], [`ImageNotFound`] +//! - **Link resolution**: [`LinkReadError`] +//! - **Tar-split processing**: [`TarSplitError`] +//! - **System errors**: [`Io`], [`JsonParse`] +//! +//! [`RootNotFound`]: StorageError::RootNotFound +//! [`InvalidStorage`]: StorageError::InvalidStorage +//! [`LayerNotFound`]: StorageError::LayerNotFound +//! 
[`ImageNotFound`]: StorageError::ImageNotFound +//! [`LinkReadError`]: StorageError::LinkReadError +//! [`TarSplitError`]: StorageError::TarSplitError +//! [`Io`]: StorageError::Io +//! [`JsonParse`]: StorageError::JsonParse + +use std::path::PathBuf; + +/// Result type alias for operations that may return a StorageError. +pub type Result = std::result::Result; + +/// Error types for storage operations. +#[derive(Debug, thiserror::Error)] +pub enum StorageError { + /// Storage root directory was not found at the specified path. + #[error("storage root not found at {0}")] + RootNotFound(PathBuf), + + /// Storage validation failed with the provided reason. + #[error("invalid storage: {0}")] + InvalidStorage(String), + + /// The requested layer was not found. + #[error("layer not found: {0}")] + LayerNotFound(String), + + /// The requested image was not found. + #[error("image not found: {0}")] + ImageNotFound(String), + + /// Failed to read a link file. + #[error("failed to read link file: {0}")] + LinkReadError(String), + + /// Error related to tar-split processing. + #[error("tar-split error: {0}")] + TarSplitError(String), + + /// I/O error occurred during file operations. + #[error("I/O error: {0}")] + Io(#[from] std::io::Error), + + /// JSON parsing error occurred. + #[error("JSON parse error: {0}")] + JsonParse(#[from] serde_json::Error), +} diff --git a/crates/cstorage/src/image.rs b/crates/cstorage/src/image.rs new file mode 100644 index 00000000..0b1d4814 --- /dev/null +++ b/crates/cstorage/src/image.rs @@ -0,0 +1,273 @@ +//! Image reading and manifest parsing. +//! +//! This module provides access to OCI image manifests and metadata stored in +//! the `overlay-images/` directory. All operations use fd-relative access via +//! cap-std Dir handles. +//! +//! # Overview +//! +//! The [`Image`] struct represents a container image stored in the overlay driver. +//! It provides access to: +//! - OCI image manifests ([`oci_spec::image::ImageManifest`]) +//! 
- OCI image configurations ([`oci_spec::image::ImageConfiguration`]) +//! - Layer information (diff_ids that map to storage layer IDs) +//! - Additional metadata stored in base64-encoded files +//! +//! # Image Directory Structure +//! +//! Each image is stored in `overlay-images//`: +//! ```text +//! overlay-images// +//! +-- manifest # OCI image manifest (JSON) +//! +-- = # Additional metadata files +//! ``` + +use base64::{Engine, engine::general_purpose::STANDARD}; +use cap_std::fs::Dir; +use oci_spec::image::{ImageConfiguration, ImageManifest}; +use std::io::Read; + +use crate::error::{Result, StorageError}; +use crate::storage::Storage; + +/// Filename for OCI image manifest in the image directory. +const MANIFEST_FILENAME: &str = "manifest"; + +/// Represents an OCI image with its metadata and manifest. +#[derive(Debug)] +pub struct Image { + /// Image ID (typically a 64-character hex digest). + id: String, + + /// Directory handle for overlay-images/\/. + image_dir: Dir, +} + +impl Image { + /// Open an image by ID using fd-relative operations. + /// + /// The ID can be provided with or without a `sha256:` prefix - the prefix + /// will be stripped if present, since containers-storage directories use + /// just the hex digest. + /// + /// # Errors + /// + /// Returns an error if the image directory doesn't exist or cannot be opened. + pub fn open(storage: &Storage, id: &str) -> Result { + // Strip the sha256: prefix if present - containers-storage directories + // use just the hex digest, but image IDs from podman (e.g. via --iidfile) + // include the prefix. 
See https://github.com/containers/skopeo/issues/2750 + let id = id.strip_prefix("sha256:").unwrap_or(id); + + // Open overlay-images directory from storage root + let images_dir = storage.root_dir().open_dir("overlay-images")?; + + // Open specific image directory + let image_dir = images_dir + .open_dir(id) + .map_err(|_| StorageError::ImageNotFound(id.to_string()))?; + + Ok(Self { + id: id.to_string(), + image_dir, + }) + } + + /// Get the image ID. + pub fn id(&self) -> &str { + &self.id + } + + /// Read and parse the image manifest. + /// + /// The manifest is stored as a JSON file named "manifest" in the image directory. + /// + /// # Errors + /// + /// Returns an error if the manifest file cannot be read or parsed. + pub fn manifest(&self) -> Result { + let file = self.image_dir.open(MANIFEST_FILENAME)?; + serde_json::from_reader(file) + .map_err(|e| StorageError::InvalidStorage(format!("Invalid manifest JSON: {}", e))) + } + + /// Read and parse the image configuration. + /// + /// The image config is stored with a base64-encoded key based on the image digest. + /// + /// # Errors + /// + /// Returns an error if the config file cannot be read or parsed. + pub fn config(&self) -> Result { + // The config is stored with key: sha256: + // Base64 encode: "sha256:" + let key = format!("sha256:{}", self.id); + let encoded_key = STANDARD.encode(key.as_bytes()); + + let config_data = self.read_metadata(&encoded_key).map_err(|e| { + StorageError::Io(std::io::Error::other(format!( + "reading config metadata ={} for image {}: {}", + encoded_key, self.id, e + ))) + })?; + serde_json::from_slice(&config_data) + .map_err(|e| StorageError::InvalidStorage(format!("Invalid config JSON: {}", e))) + } + + /// Get the OCI diff_ids for this image in order (base to top). + /// + /// This returns the diff_ids from the image config, which are the uncompressed + /// tar digests. Note that these are **not** the same as the storage layer IDs! 
+ /// To get the actual storage layer IDs, use [`storage_layer_ids()`](Self::storage_layer_ids). + /// + /// # Errors + /// + /// Returns an error if the config cannot be read or parsed. + pub fn layers(&self) -> Result> { + let config = self.config()?; + + // Extract diff_ids from config - these are NOT the storage layer IDs + let diff_ids: Vec = config + .rootfs() + .diff_ids() + .iter() + .map(|digest| { + // Remove the "sha256:" prefix if present + let diff_id = digest.to_string(); + diff_id + .strip_prefix("sha256:") + .unwrap_or(&diff_id) + .to_string() + }) + .collect(); + + Ok(diff_ids) + } + + /// Get the storage layer IDs for this image in order (base to top). + /// + /// Unlike [`layers()`](Self::layers) which returns OCI diff_ids, this method + /// returns the actual storage layer directory names by resolving diff_ids + /// through the `layers.json` mapping file. + /// + /// # Errors + /// + /// Returns an error if the config cannot be read, parsed, or if any layer + /// cannot be resolved. + pub fn storage_layer_ids(&self, stores: &[Storage]) -> Result> { + let diff_ids = self.layers()?; + + // Try to resolve all diff_ids from each store (batch parse of layers.json). + // Layers may span stores (e.g. base layers in an additional image store, + // new layers in the primary), so we merge results across stores. + let mut resolved: Vec> = vec![None; diff_ids.len()]; + for store in stores { + if resolved.iter().all(|r| r.is_some()) { + break; + } + // resolve_diff_ids parses layers.json once for all diff_ids + if let Ok(found) = store.resolve_diff_ids(&diff_ids) { + for (i, id) in found.into_iter().enumerate() { + if resolved[i].is_none() { + resolved[i] = id; + } + } + } + } + + resolved + .into_iter() + .enumerate() + .map(|(i, opt)| opt.ok_or_else(|| StorageError::LayerNotFound(diff_ids[i].clone()))) + .collect() + } + + /// Read additional metadata files. 
+ /// + /// Metadata files are stored with base64-encoded keys as filenames, + /// prefixed with '='. + /// + /// # Errors + /// + /// Returns an error if the metadata file doesn't exist or cannot be read. + pub fn read_metadata(&self, key: &str) -> Result> { + let filename = format!("={}", key); + let mut file = self.image_dir.open(&filename)?; + let mut data = Vec::new(); + file.read_to_end(&mut data)?; + Ok(data) + } + + /// Get a reference to the image directory handle. + pub fn image_dir(&self) -> &Dir { + &self.image_dir + } + + /// Get the repository names/tags for this image. + /// + /// Reads from the `overlay-images/images.json` index file to find the + /// names associated with this image. + /// + /// # Errors + /// + /// Returns an error if the images.json file cannot be read or parsed. + pub fn names(&self, storage: &Storage) -> Result> { + let images_dir = storage.root_dir().open_dir("overlay-images")?; + let mut file = images_dir.open("images.json")?; + let mut contents = String::new(); + file.read_to_string(&mut contents)?; + + let entries: Vec = serde_json::from_str(&contents) + .map_err(|e| StorageError::InvalidStorage(format!("Invalid images.json: {}", e)))?; + + for entry in entries { + if entry.id == self.id { + return Ok(entry.names.unwrap_or_default()); + } + } + + // Image not found in images.json - return empty names + Ok(Vec::new()) + } +} + +/// Entry in images.json for image name lookups. 
/// Entry in images.json for image name lookups.
///
/// Only the fields needed to map an image ID to its names are declared here.
#[derive(Debug, serde::Deserialize)]
pub(crate) struct ImageJsonEntry {
    /// Image ID; compared against [`Image::id`] when looking up names.
    pub(crate) id: String,
    /// Names recorded for the image; `None` when the entry has no `names`
    /// key (callers treat that as an empty list).
    pub(crate) names: Option<Vec<String>>,
}
overlay// +//! +-- diff/ # Layer file contents +//! | +-- etc/ +//! | | +-- hosts +//! | +-- usr/ +//! | +-- bin/ +//! +-- link # Short link ID (26 chars) +//! +-- lower # Parent references: "l/:l/:..." +//! ``` +//! +//! # Whiteouts and Opaque Directories +//! +//! The overlay driver uses special markers to indicate file deletions: +//! - `.wh.` - Whiteout file (marks `` as deleted) +//! - `.wh..wh..opq` - Opaque directory marker (hides lower layer contents) + +use crate::error::{Result, StorageError}; +use crate::storage::Storage; +use cap_std::fs::Dir; + +/// Represents an overlay layer with its metadata and content. +#[derive(Debug)] +pub struct Layer { + /// Layer ID (typically a 64-character hex digest). + id: String, + + /// Directory handle for the layer directory (overlay/\/). + layer_dir: Dir, + + /// Directory handle for the diff/ subdirectory containing layer content. + diff_dir: Dir, + + /// Short link identifier from the link file (26 characters). + link_id: String, + + /// Parent layer link IDs from the lower file. + parent_links: Vec, +} + +impl Layer { + /// Open a layer by ID using fd-relative operations. + /// + /// # Errors + /// + /// Returns an error if the layer directory doesn't exist or cannot be opened. + pub fn open(storage: &Storage, id: &str) -> Result { + // Open overlay directory from storage root + let overlay_dir = storage.root_dir().open_dir("overlay")?; + + // Open layer directory relative to overlay + let layer_dir = overlay_dir + .open_dir(id) + .map_err(|_| StorageError::LayerNotFound(id.to_string()))?; + + // Open diff directory for content access + let diff_dir = layer_dir.open_dir("diff")?; + + // Read metadata files using fd-relative operations + let link_id = Self::read_link(&layer_dir)?; + let parent_links = Self::read_lower(&layer_dir)?; + + Ok(Self { + id: id.to_string(), + layer_dir, + diff_dir, + link_id, + parent_links, + }) + } + + /// Get the layer ID. 
+ pub fn id(&self) -> &str { + &self.id + } + + /// Read the link file (26-char identifier) via Dir handle. + fn read_link(layer_dir: &Dir) -> Result { + let content = layer_dir.read_to_string("link")?; + Ok(content.trim().to_string()) + } + + /// Read the lower file (colon-separated parent links) via Dir handle. + fn read_lower(layer_dir: &Dir) -> Result> { + match layer_dir.read_to_string("lower") { + Ok(content) => { + // Format is "l/:l/:..." + let links: Vec = content + .trim() + .split(':') + .filter_map(|s| s.strip_prefix("l/")) + .map(|s| s.to_string()) + .collect(); + Ok(links) + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Vec::new()), // Base layer has no lower file + Err(e) => Err(StorageError::Io(e)), + } + } + + /// Get the short link ID for this layer. + pub fn link_id(&self) -> &str { + &self.link_id + } + + /// Get the parent link IDs for this layer. + pub fn parent_links(&self) -> &[String] { + &self.parent_links + } + + /// Get parent layer IDs (resolved from link IDs). + /// + /// This resolves the short link IDs from the `lower` file to full layer IDs + /// by reading the symlinks in the `overlay/l/` directory. + /// + /// # Errors + /// + /// Returns an error if any link cannot be resolved. + pub fn parents(&self, storage: &Storage) -> Result> { + self.parent_links + .iter() + .map(|link_id| storage.resolve_link(link_id)) + .collect() + } + + /// Get a reference to the layer directory handle. + pub fn layer_dir(&self) -> &Dir { + &self.layer_dir + } + + /// Get a reference to the diff directory handle. + pub fn diff_dir(&self) -> &Dir { + &self.diff_dir + } + + /// Get the complete chain of layers from this layer to the base. + /// + /// Returns layers in order: [self, parent, grandparent, ..., base] + /// + /// # Errors + /// + /// Returns an error if the layer chain exceeds the maximum depth of 500 layers. 
+ pub fn layer_chain(self, storage: &Storage) -> Result> { + let mut chain = vec![self]; + let mut current_idx = 0; + + // Maximum depth to prevent infinite loops + const MAX_DEPTH: usize = 500; + + while current_idx < chain.len() && chain.len() < MAX_DEPTH { + let parent_ids = chain[current_idx].parents(storage)?; + + // Add all parents to the chain + for parent_id in parent_ids { + chain.push(Layer::open(storage, &parent_id)?); + } + + current_idx += 1; + } + + if chain.len() >= MAX_DEPTH { + return Err(StorageError::InvalidStorage( + "Layer chain exceeds maximum depth of 500".to_string(), + )); + } + + Ok(chain) + } + + /// Open a file in the layer's diff directory using fd-relative operations. + /// + /// # Errors + /// + /// Returns an error if the file doesn't exist or cannot be opened. + pub fn open_file(&self, path: impl AsRef) -> Result { + self.diff_dir.open(path).map_err(StorageError::Io) + } + + /// Open a file and return a standard library File. + /// + /// # Errors + /// + /// Returns an error if the file doesn't exist or cannot be opened. + pub fn open_file_std(&self, path: impl AsRef) -> Result { + let file = self.diff_dir.open(path).map_err(StorageError::Io)?; + Ok(file.into_std()) + } + + /// Get metadata for a file in the layer's diff directory. + /// + /// # Errors + /// + /// Returns an error if the file doesn't exist. + pub fn metadata(&self, path: impl AsRef) -> Result { + self.diff_dir.metadata(path).map_err(StorageError::Io) + } + + /// Read directory entries using Dir handle. + /// + /// # Errors + /// + /// Returns an error if the directory doesn't exist. + pub fn read_dir(&self, path: impl AsRef) -> Result { + self.diff_dir.read_dir(path).map_err(StorageError::Io) + } + + /// Check if a whiteout file exists for the given filename. + /// + /// Whiteout format: `.wh.` + /// + /// # Arguments + /// + /// * `parent_path` - The directory path containing the file (empty string or "." 
for root) + /// * `filename` - The name of the file to check for whiteout + /// + /// # Errors + /// + /// Returns an error if the directory cannot be accessed. + pub fn has_whiteout(&self, parent_path: &str, filename: &str) -> Result { + let whiteout_name = format!(".wh.{}", filename); + + // Handle root directory case + if parent_path.is_empty() || parent_path == "." { + Ok(self.diff_dir.try_exists(&whiteout_name)?) + } else { + match self.diff_dir.open_dir(parent_path) { + Ok(parent_dir) => Ok(parent_dir.try_exists(&whiteout_name)?), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), + Err(e) => Err(StorageError::Io(e)), + } + } + } + + /// Check if a directory is marked as opaque (hides lower layers). + /// + /// Opaque marker: `.wh..wh..opq` + /// + /// # Errors + /// + /// Returns an error if the directory cannot be accessed. + pub fn is_opaque_dir(&self, path: &str) -> Result { + const OPAQUE_MARKER: &str = ".wh..wh..opq"; + + if path.is_empty() || path == "." { + Ok(self.diff_dir.try_exists(OPAQUE_MARKER)?) + } else { + match self.diff_dir.open_dir(path) { + Ok(dir) => Ok(dir.try_exists(OPAQUE_MARKER)?), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), + Err(e) => Err(StorageError::Io(e)), + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::path::Path; + + #[test] + fn test_parse_lower_format() { + // Test that we correctly parse the lower file format + let content = "l/ABCDEFGHIJKLMNOPQRSTUVWXY:l/BCDEFGHIJKLMNOPQRSTUVWXYZ"; + let links: Vec = content + .trim() + .split(':') + .filter_map(|s| s.strip_prefix("l/")) + .map(|s| s.to_string()) + .collect(); + + assert_eq!(links.len(), 2); + assert_eq!(links[0], "ABCDEFGHIJKLMNOPQRSTUVWXY"); + assert_eq!(links[1], "BCDEFGHIJKLMNOPQRSTUVWXYZ"); + } + + /// Create a minimal mock storage + layer on disk so that `Layer::open()` succeeds. + /// Returns the opened `Layer`. 
+ fn create_mock_layer(root: &Path) -> Layer { + // Storage validation requires these three directories + for d in ["overlay", "overlay-layers", "overlay-images"] { + std::fs::create_dir_all(root.join(d)).unwrap(); + } + + let layer_id = "test-layer-001"; + let layer_dir = root.join("overlay").join(layer_id); + std::fs::create_dir_all(layer_dir.join("diff")).unwrap(); + std::fs::write(layer_dir.join("link"), "ABCDEFGHIJKLMNOPQRSTUVWXYZ").unwrap(); + + let storage = Storage::open(root).unwrap(); + Layer::open(&storage, layer_id).unwrap() + } + + // --- has_whiteout tests --- + + #[test] + fn test_has_whiteout_in_root() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + + // No whiteout yet + assert!(!layer.has_whiteout("", "somefile").unwrap()); + assert!(!layer.has_whiteout(".", "somefile").unwrap()); + + // Create whiteout marker (regular file — has_whiteout uses try_exists) + std::fs::write( + dir.path().join("overlay/test-layer-001/diff/.wh.somefile"), + "", + ) + .unwrap(); + + assert!(layer.has_whiteout("", "somefile").unwrap()); + assert!(layer.has_whiteout(".", "somefile").unwrap()); + // Different name still returns false + assert!(!layer.has_whiteout("", "otherfile").unwrap()); + } + + #[test] + fn test_has_whiteout_in_subdirectory() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + let diff = dir.path().join("overlay/test-layer-001/diff"); + + std::fs::create_dir_all(diff.join("etc")).unwrap(); + std::fs::write(diff.join("etc/.wh.hosts"), "").unwrap(); + + assert!(layer.has_whiteout("etc", "hosts").unwrap()); + // Root doesn't have this whiteout + assert!(!layer.has_whiteout("", "hosts").unwrap()); + } + + #[test] + fn test_has_whiteout_in_nested_subdirectory() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + let diff = dir.path().join("overlay/test-layer-001/diff"); + + std::fs::create_dir_all(diff.join("usr/local/bin")).unwrap(); + 
std::fs::write(diff.join("usr/local/bin/.wh.myapp"), "").unwrap(); + + assert!(layer.has_whiteout("usr/local/bin", "myapp").unwrap()); + assert!(!layer.has_whiteout("usr/local", "myapp").unwrap()); + assert!(!layer.has_whiteout("usr", "myapp").unwrap()); + } + + #[test] + fn test_has_whiteout_nonexistent_parent() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + + // Parent directory doesn't exist — should return false, not error + assert!(!layer.has_whiteout("no/such/dir", "file").unwrap()); + } + + #[test] + fn test_has_whiteout_multiple() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + let diff = dir.path().join("overlay/test-layer-001/diff"); + + std::fs::write(diff.join(".wh.file_a"), "").unwrap(); + std::fs::write(diff.join(".wh.file_b"), "").unwrap(); + + assert!(layer.has_whiteout("", "file_a").unwrap()); + assert!(layer.has_whiteout("", "file_b").unwrap()); + assert!(!layer.has_whiteout("", "file_c").unwrap()); + } + + // --- is_opaque_dir tests --- + + #[test] + fn test_is_opaque_dir_in_root() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + let diff = dir.path().join("overlay/test-layer-001/diff"); + + assert!(!layer.is_opaque_dir("").unwrap()); + assert!(!layer.is_opaque_dir(".").unwrap()); + + std::fs::write(diff.join(".wh..wh..opq"), "").unwrap(); + + assert!(layer.is_opaque_dir("").unwrap()); + assert!(layer.is_opaque_dir(".").unwrap()); + } + + #[test] + fn test_is_opaque_dir_in_subdirectory() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + let diff = dir.path().join("overlay/test-layer-001/diff"); + + std::fs::create_dir_all(diff.join("etc")).unwrap(); + std::fs::write(diff.join("etc/.wh..wh..opq"), "").unwrap(); + + assert!(layer.is_opaque_dir("etc").unwrap()); + // Root is not opaque + assert!(!layer.is_opaque_dir("").unwrap()); + } + + #[test] + fn 
test_is_opaque_dir_false_for_normal_dir() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + let diff = dir.path().join("overlay/test-layer-001/diff"); + + // Create a subdirectory with a regular file, but no opaque marker + std::fs::create_dir_all(diff.join("var/log")).unwrap(); + std::fs::write(diff.join("var/log/syslog"), "log data").unwrap(); + + assert!(!layer.is_opaque_dir("var").unwrap()); + assert!(!layer.is_opaque_dir("var/log").unwrap()); + } + + #[test] + fn test_is_opaque_dir_nonexistent_path() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + + // Non-existent directory should return false, not error + assert!(!layer.is_opaque_dir("no/such/path").unwrap()); + } + + #[test] + fn test_is_opaque_dir_nested() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + let diff = dir.path().join("overlay/test-layer-001/diff"); + + // Only the nested dir is opaque, not its parents + std::fs::create_dir_all(diff.join("a/b/c")).unwrap(); + std::fs::write(diff.join("a/b/c/.wh..wh..opq"), "").unwrap(); + + assert!(!layer.is_opaque_dir("a").unwrap()); + assert!(!layer.is_opaque_dir("a/b").unwrap()); + assert!(layer.is_opaque_dir("a/b/c").unwrap()); + } + + // --- Interaction between whiteout and opaque --- + + #[test] + fn test_whiteout_and_opaque_coexist() { + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + let diff = dir.path().join("overlay/test-layer-001/diff"); + + std::fs::create_dir_all(diff.join("mydir")).unwrap(); + // Opaque marker in mydir + std::fs::write(diff.join("mydir/.wh..wh..opq"), "").unwrap(); + // Also a file whiteout in mydir + std::fs::write(diff.join("mydir/.wh.oldfile"), "").unwrap(); + + assert!(layer.is_opaque_dir("mydir").unwrap()); + assert!(layer.has_whiteout("mydir", "oldfile").unwrap()); + } + + #[test] + fn test_whiteout_of_dotdot_prefix_name() { + // .wh..wh. 
is NOT an opaque whiteout — it's a whiteout for a file + // literally named ".wh." (the has_whiteout logic just prepends ".wh.") + let dir = tempfile::tempdir().unwrap(); + let layer = create_mock_layer(dir.path()); + let diff = dir.path().join("overlay/test-layer-001/diff"); + + // Create .wh..wh. (whiteout for file named ".wh.") + std::fs::write(diff.join(".wh..wh."), "").unwrap(); + + assert!(layer.has_whiteout("", ".wh.").unwrap()); + // This is NOT an opaque marker — the opaque marker is ".wh..wh..opq" + assert!(!layer.is_opaque_dir("").unwrap()); + } +} diff --git a/crates/cstorage/src/lib.rs b/crates/cstorage/src/lib.rs new file mode 100644 index 00000000..1863004c --- /dev/null +++ b/crates/cstorage/src/lib.rs @@ -0,0 +1,76 @@ +//! Read-only access to containers-storage overlay driver. +//! +//! This library provides efficient, capability-based access to container image +//! storage using the overlay driver. All file operations are performed using +//! file descriptor-relative operations via cap-std, providing security against +//! path traversal attacks and TOCTOU race conditions. +//! +//! # Overview +//! +//! The library is designed to access containers-storage (overlay driver) without +//! requiring tar serialization. Instead, it provides direct file descriptor access +//! to layer content, enabling zero-copy operations. +//! +//! # Key Features +//! +//! - **Capability-based security**: All file access via `cap_std::fs::Dir` handles +//! - **Zero-copy access**: File descriptors instead of data copies +//! - **Safe by design**: No path traversal vulnerabilities +//! - **Tar-split integration**: Bit-for-bit identical TAR reconstruction +//! - **OCI compatibility**: Uses oci-spec for standard image formats +//! +//! # Example +//! +//! ```no_run +//! use cstorage::Storage; +//! +//! // Discover storage from default locations +//! let storage = Storage::discover()?; +//! +//! // Or open storage at a specific path +//! 
let storage = Storage::open("/var/lib/containers/storage")?; +//! +//! // List images +//! for image in storage.list_images()? { +//! println!("Image: {}", image.id()); +//! } +//! # Ok::<(), cstorage::StorageError>(()) +//! ``` +//! +//! # Architecture +//! +//! The library uses cap-std for all file operations: +//! - `Storage` holds a `Dir` handle to the storage root +//! - All file access is relative to `Dir` handles +//! - No absolute paths are constructed during operations +//! - SQLite database accessed via fd-relative path + +// Core storage access +pub mod config; +pub mod error; +pub mod image; +pub mod layer; +pub mod storage; +pub mod tar_split; + +// User namespace support for rootless access +pub mod userns; +#[cfg(feature = "userns-helper")] +pub mod userns_helper; + +// Re-export commonly used types +pub use config::{AdditionalLayerStore, StorageConfig}; +pub use error::{Result, StorageError}; +pub use image::Image; +pub use layer::Layer; +pub use storage::{LayerMetadata, Storage}; +pub use tar_split::{TarHeader, TarSplitFdStream, TarSplitItem}; +pub use userns::can_bypass_file_permissions; +#[cfg(feature = "userns-helper")] +pub use userns_helper::{ + GetImageResult, HelperError, ImageInfo, ProxiedLayerStream, ProxiedTarSplitItem, StorageProxy, + init_if_helper, +}; + +// Re-export OCI spec types for convenience +pub use oci_spec::image::{Descriptor, ImageConfiguration, ImageManifest}; diff --git a/crates/cstorage/src/storage.rs b/crates/cstorage/src/storage.rs new file mode 100644 index 00000000..f00b5a75 --- /dev/null +++ b/crates/cstorage/src/storage.rs @@ -0,0 +1,623 @@ +//! Storage access for container overlay filesystem. +//! +//! This module provides the main [`Storage`] struct for accessing containers-storage +//! overlay driver data. All file access uses cap-std for fd-relative operations, +//! providing security against path traversal attacks and TOCTOU race conditions. +//! +//! # Overview +//! +//! 
The `Storage` struct is the primary entry point for interacting with container +//! storage. It holds a capability-based directory handle to the storage root. +//! +//! # Storage Structure +//! +//! Container storage on disk follows this layout: +//! ```text +//! /var/lib/containers/storage/ +//! +-- overlay/ # Layer data +//! | +-- / # Individual layer directories +//! | | +-- diff/ # Layer file contents +//! | | +-- link # Short link ID (26 chars) +//! | | +-- lower # Parent layer references +//! | +-- l/ # Short link directory (symlinks) +//! +-- overlay-layers/ # Tar-split metadata +//! | +-- .tar-split.gz +//! +-- overlay-images/ # Image metadata +//! +-- / +//! +-- manifest # OCI image manifest +//! +-- = # Base64-encoded metadata files +//! ``` +//! +//! # Security Model +//! +//! All file operations are performed via [`cap_std::fs::Dir`] handles, which provide: +//! - Protection against path traversal attacks +//! - Prevention of TOCTOU race conditions +//! - Guarantee that all access stays within the storage directory tree + +use crate::error::{Result, StorageError}; +use cap_std::ambient_authority; +use cap_std::fs::Dir; +use std::env; +use std::io::Read; +use std::path::{Path, PathBuf}; + +/// Main storage handle providing read-only access to container storage. +/// +/// The Storage struct holds a `Dir` handle to the storage root for fd-relative +/// file operations. +#[derive(Debug)] +pub struct Storage { + /// Directory handle for the storage root, used for all fd-relative operations. + root_dir: Dir, +} + +impl Storage { + /// Open storage at the given root path. + /// + /// This validates that the path points to a valid container storage directory + /// by checking for required subdirectories and the database file. 
+ /// + /// # Errors + /// + /// Returns an error if: + /// - The path does not exist or is not a directory + /// - Required subdirectories are missing + /// - The database file is missing or invalid + pub fn open>(root: P) -> Result { + let root_path = root.as_ref(); + + // Open the directory handle for fd-relative operations + let root_dir = Dir::open_ambient_dir(root_path, ambient_authority()).map_err(|e| { + if e.kind() == std::io::ErrorKind::NotFound { + StorageError::RootNotFound(root_path.to_path_buf()) + } else { + StorageError::Io(e) + } + })?; + + // Validate storage structure + Self::validate_storage(&root_dir)?; + + Ok(Self { root_dir }) + } + + /// Discover storage root from default locations. + /// + /// Searches for container storage in the following order: + /// 1. `$CONTAINERS_STORAGE_ROOT` environment variable + /// 2. Rootless storage: `$XDG_DATA_HOME/containers/storage` or `~/.local/share/containers/storage` + /// 3. Root storage: `/var/lib/containers/storage` + /// + /// # Errors + /// + /// Returns an error if no valid storage location is found. + pub fn discover() -> Result { + let search_paths = Self::default_search_paths(); + + for path in search_paths { + if path.exists() { + match Self::open(&path) { + Ok(storage) => return Ok(storage), + Err(_) => continue, + } + } + } + + Err(StorageError::InvalidStorage( + "No valid storage location found. Searched default locations.".to_string(), + )) + } + + /// Discover all storage locations: the primary store plus any additional + /// image stores from `$STORAGE_OPTS`. + /// + /// The `containers/storage` library supports + /// `STORAGE_OPTS=additionalimagestore=/path` to add read-only image stores + /// (used by e.g. `bcvk` to expose the host's containers-storage inside a VM). + /// + /// Returns a non-empty vec with the primary store first (if it exists), + /// followed by any additional stores. Returns an error only if no stores + /// are found at all. 
+ pub fn discover_all() -> Result> { + let mut stores = Vec::new(); + if let Ok(primary) = Self::discover() { + stores.push(primary); + } + stores.extend(Self::additional_image_stores_from_env()); + if stores.is_empty() { + return Err(StorageError::InvalidStorage( + "No valid storage location found. Searched default locations and $STORAGE_OPTS." + .to_string(), + )); + } + Ok(stores) + } + + /// Parse `$STORAGE_OPTS` for `additionalimagestore=` entries and + /// open any that point to valid overlay storage. + /// + /// Invalid or inaccessible paths are silently skipped. + fn additional_image_stores_from_env() -> Vec { + let opts = match env::var("STORAGE_OPTS") { + Ok(v) => v, + Err(_) => return Vec::new(), + }; + + Self::parse_additional_image_stores(&opts) + } + + /// Parse a `STORAGE_OPTS` value for `additionalimagestore=` entries + /// and open any that point to valid overlay storage. + /// + /// This is separated from [`additional_image_stores_from_env()`] so the + /// parsing logic can be tested without mutating process-global environment + /// variables. + fn parse_additional_image_stores(opts: &str) -> Vec { + let mut stores = Vec::new(); + // STORAGE_OPTS is comma-separated, e.g. + // "additionalimagestore=/run/host-container-storage,additionalimagestore=/other" + for item in opts.split(',') { + let item = item.trim(); + if let Some(path) = item.strip_prefix("additionalimagestore=") + && let Ok(s) = Self::open(path) + { + stores.push(s); + } + } + stores + } + + /// Get the default search paths for storage discovery. + fn default_search_paths() -> Vec { + let mut paths = Vec::new(); + + // 1. Check CONTAINERS_STORAGE_ROOT environment variable + if let Ok(root) = env::var("CONTAINERS_STORAGE_ROOT") { + paths.push(PathBuf::from(root)); + } + + // 2. 
Check rootless locations + if let Ok(home) = env::var("HOME") { + let home_path = PathBuf::from(home); + + // Try XDG_DATA_HOME first + if let Ok(xdg_data) = env::var("XDG_DATA_HOME") { + paths.push(PathBuf::from(xdg_data).join("containers/storage")); + } + + // Fallback to ~/.local/share/containers/storage + paths.push(home_path.join(".local/share/containers/storage")); + } + + // 3. Check root location + paths.push(PathBuf::from("/var/lib/containers/storage")); + + paths + } + + /// Validate that the directory structure is a valid overlay storage. + fn validate_storage(root_dir: &Dir) -> Result<()> { + // Check for required subdirectories + let required_dirs = ["overlay", "overlay-layers", "overlay-images"]; + + for dir_name in &required_dirs { + match root_dir.try_exists(dir_name) { + Ok(exists) if !exists => { + return Err(StorageError::InvalidStorage(format!( + "Missing required directory: {}", + dir_name + ))); + } + Err(e) => return Err(StorageError::Io(e)), + _ => {} + } + } + + Ok(()) + } + + /// Create storage from an existing root directory handle. + /// + /// # Errors + /// + /// Returns an error if the directory is not a valid container storage. + pub fn from_root_dir(root_dir: Dir) -> Result { + Self::validate_storage(&root_dir)?; + Ok(Self { root_dir }) + } + + /// Get a reference to the root directory handle. + pub fn root_dir(&self) -> &Dir { + &self.root_dir + } + + /// Resolve a link ID to a layer ID using fd-relative symlink reading. + /// + /// # Errors + /// + /// Returns an error if the link doesn't exist or has an invalid format. 
+ pub fn resolve_link(&self, link_id: &str) -> Result { + // Open overlay directory from storage root + let overlay_dir = self.root_dir.open_dir("overlay")?; + + // Open link directory + let link_dir = overlay_dir.open_dir("l")?; + + // Read symlink target using fd-relative operation + let target = link_dir.read_link(link_id).map_err(|e| { + StorageError::LinkReadError(format!("Failed to read link {}: {}", link_id, e)) + })?; + + // Extract layer ID from symlink target + Self::extract_layer_id_from_link(&target) + } + + /// Extract layer ID from symlink target path. + /// + /// Target format: ..//diff + fn extract_layer_id_from_link(target: &Path) -> Result { + // Convert to string for processing + let target_str = target.to_str().ok_or_else(|| { + StorageError::LinkReadError("Invalid UTF-8 in link target".to_string()) + })?; + + // Split by '/' and find the layer ID component + let components: Vec<&str> = target_str.split('/').collect(); + + // Expected format: ..//diff + // So we need the second-to-last component + if components.len() >= 2 { + let layer_id = components[components.len() - 2]; + if !layer_id.is_empty() && layer_id != ".." { + return Ok(layer_id.to_string()); + } + } + + Err(StorageError::LinkReadError(format!( + "Invalid link target format: {}", + target_str + ))) + } + + /// List all images in storage. + /// + /// # Errors + /// + /// Returns an error if the images directory cannot be read. + pub fn list_images(&self) -> Result> { + use crate::image::Image; + + let images_dir = self.root_dir.open_dir("overlay-images")?; + let mut images = Vec::new(); + + for entry in images_dir.entries()? { + let entry = entry?; + if entry.file_type()?.is_dir() { + let id = entry + .file_name() + .to_str() + .ok_or_else(|| { + StorageError::InvalidStorage( + "Invalid UTF-8 in image directory name".to_string(), + ) + })? + .to_string(); + images.push(Image::open(self, &id)?); + } + } + Ok(images) + } + + /// Get an image by ID. 
+ /// + /// # Errors + /// + /// Returns [`StorageError::ImageNotFound`] if the image doesn't exist. + pub fn get_image(&self, id: &str) -> Result { + crate::image::Image::open(self, id) + } + + /// Get layers for an image (in order from base to top). + /// + /// # Errors + /// + /// Returns an error if any layer cannot be opened. + pub fn get_image_layers( + &self, + image: &crate::image::Image, + ) -> Result> { + use crate::layer::Layer; + // image.layers() returns diff_ids, which need to be mapped to storage layer IDs. + // Use the batch method to parse layers.json only once. + let diff_ids = image.layers()?; + let layer_ids: Vec = self + .resolve_diff_ids(&diff_ids)? + .into_iter() + .enumerate() + .map(|(i, opt)| opt.ok_or_else(|| StorageError::LayerNotFound(diff_ids[i].clone()))) + .collect::>()?; + layer_ids + .iter() + .map(|layer_id| Layer::open(self, layer_id)) + .collect() + } + + /// Find an image by name. + /// + /// # Errors + /// + /// Returns [`StorageError::ImageNotFound`] if no image with the given name is found. 
+ pub fn find_image_by_name(&self, name: &str) -> Result { + // Read images.json from overlay-images/ + let images_dir = self.root_dir.open_dir("overlay-images")?; + let mut file = images_dir.open("images.json")?; + let mut contents = String::new(); + file.read_to_string(&mut contents)?; + + // Parse the JSON array + let entries: Vec = serde_json::from_str(&contents) + .map_err(|e| StorageError::InvalidStorage(format!("Invalid images.json: {}", e)))?; + + // Search for matching name + for entry in &entries { + if let Some(names) = &entry.names { + for image_name in names { + if image_name == name { + return self.get_image(&entry.id); + } + } + } + } + + // Try partial matching (e.g., "alpine:latest" matches "docker.io/library/alpine:latest") + for entry in &entries { + if let Some(names) = &entry.names { + for image_name in names { + // Check if name is a suffix (after removing registry/namespace prefix) + if let Some(prefix) = image_name.strip_suffix(name) { + // Verify it's a proper boundary (preceded by '/') + if prefix.is_empty() || prefix.ends_with('/') { + return self.get_image(&entry.id); + } + } + } + } + } + + // Try matching short name without tag (e.g., "busybox" matches "docker.io/library/busybox:latest") + // This handles the common case of just specifying the image name + let name_with_tag = if name.contains(':') { + name.to_string() + } else { + format!("{}:latest", name) + }; + + for entry in &entries { + if let Some(names) = &entry.names { + for image_name in names { + // Check if image_name ends with /name:tag pattern + if let Some(prefix) = image_name.strip_suffix(&name_with_tag) + && (prefix.is_empty() || prefix.ends_with('/')) + { + return self.get_image(&entry.id); + } + } + } + } + + Err(StorageError::ImageNotFound(name.to_string())) + } + + /// Parse layers.json and return all entries. + /// + /// This is used internally to avoid re-parsing on every lookup. 
+ fn read_layer_entries(&self) -> Result> { + let layers_dir = self.root_dir.open_dir("overlay-layers").map_err(|e| { + StorageError::Io(std::io::Error::new( + e.kind(), + format!("opening overlay-layers/: {e}"), + )) + })?; + let mut file = layers_dir.open("layers.json").map_err(|e| { + StorageError::Io(std::io::Error::new( + e.kind(), + format!("opening overlay-layers/layers.json: {e}"), + )) + })?; + let mut contents = String::new(); + file.read_to_string(&mut contents)?; + + serde_json::from_str(&contents) + .map_err(|e| StorageError::InvalidStorage(format!("Invalid layers.json: {}", e))) + } + + /// Resolve multiple diff-digests to storage layer IDs in a single pass. + /// + /// Parses `layers.json` once and looks up all diff_ids, avoiding the O(N×M) + /// overhead of calling [`resolve_diff_id()`] in a loop. + /// + /// Returns a `Vec>` with the same length as `diff_digests`, + /// where `Some(id)` means the diff-digest was found and `None` means it was not. + /// This allows callers to merge results across multiple stores without + /// short-circuiting on the first miss. + /// + /// # Errors + /// + /// Returns an error only if `layers.json` cannot be read or parsed. + pub fn resolve_diff_ids(&self, diff_digests: &[String]) -> Result>> { + let entries = self.read_layer_entries()?; + + // Build a map from normalized diff-digest -> layer ID + let mut digest_to_id = std::collections::HashMap::with_capacity(entries.len()); + for entry in &entries { + if let Some(digest) = &entry.diff_digest { + digest_to_id.insert(digest.as_str(), entry.id.as_str()); + } + } + + Ok(diff_digests + .iter() + .map(|diff_digest| { + let normalized = if diff_digest.starts_with("sha256:") { + diff_digest.clone() + } else { + format!("sha256:{}", diff_digest) + }; + digest_to_id + .get(normalized.as_str()) + .map(|id| id.to_string()) + }) + .collect()) + } + + /// Resolve a diff-digest to a storage layer ID. 
+ /// + /// # Errors + /// + /// Returns [`StorageError::LayerNotFound`] if no layer with the given diff-digest exists. + pub fn resolve_diff_id(&self, diff_digest: &str) -> Result { + self.resolve_diff_ids(&[diff_digest.to_string()])? + .into_iter() + .next() + .flatten() + .ok_or_else(|| StorageError::LayerNotFound(diff_digest.to_string())) + } + + /// Get layer metadata including size information. + /// + /// # Errors + /// + /// Returns an error if the layer is not found. + pub fn get_layer_metadata(&self, layer_id: &str) -> Result { + let entries = self.read_layer_entries()?; + + for entry in entries { + if entry.id == layer_id { + return Ok(LayerMetadata { + id: entry.id, + parent: entry.parent, + diff_size: entry.diff_size, + compressed_size: entry.compressed_size, + }); + } + } + + Err(StorageError::LayerNotFound(layer_id.to_string())) + } + + /// Calculate the total uncompressed size of an image. + /// + /// # Errors + /// + /// Returns an error if any layer metadata cannot be read. + pub fn calculate_image_size(&self, image: &crate::image::Image) -> Result { + let layers = self.get_image_layers(image)?; + let mut total_size: u64 = 0; + + for layer in &layers { + let metadata = self.get_layer_metadata(layer.id())?; + if let Some(size) = metadata.diff_size { + total_size = total_size.saturating_add(size); + } + } + + Ok(total_size) + } +} + +use crate::image::ImageJsonEntry; + +/// Entry in layers.json for layer ID lookups. +#[derive(Debug, serde::Deserialize)] +#[serde(rename_all = "kebab-case")] +struct LayerEntry { + id: String, + parent: Option, + diff_digest: Option, + diff_size: Option, + compressed_size: Option, +} + +/// Metadata about a layer from layers.json. +#[derive(Debug, Clone)] +pub struct LayerMetadata { + /// Layer storage ID. + pub id: String, + /// Parent layer ID (if not base layer). + pub parent: Option, + /// Uncompressed diff size in bytes. + pub diff_size: Option, + /// Compressed size in bytes. 
+ pub compressed_size: Option, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_search_paths() { + let paths = Storage::default_search_paths(); + assert!(!paths.is_empty(), "Should have at least one search path"); + } + + #[test] + fn test_storage_validation() { + // Create a mock storage directory structure for testing + let dir = tempfile::tempdir().unwrap(); + let storage_path = dir.path(); + + // Create required directories + std::fs::create_dir_all(storage_path.join("overlay")).unwrap(); + std::fs::create_dir_all(storage_path.join("overlay-layers")).unwrap(); + std::fs::create_dir_all(storage_path.join("overlay-images")).unwrap(); + + let storage = Storage::open(storage_path).unwrap(); + assert!(storage.root_dir().try_exists("overlay").unwrap()); + } + + /// Helper: create a mock overlay storage directory. + fn create_mock_storage(path: &Path) { + for d in ["overlay", "overlay-layers", "overlay-images"] { + std::fs::create_dir_all(path.join(d)).unwrap(); + } + } + + #[test] + fn test_parse_additional_image_stores() { + let dir = tempfile::tempdir().unwrap(); + let store_a = dir.path().join("a"); + let store_b = dir.path().join("b"); + create_mock_storage(&store_a); + create_mock_storage(&store_b); + + // Empty string returns empty + assert!(Storage::parse_additional_image_stores("").is_empty()); + + // Single store + let opts = format!("additionalimagestore={}", store_a.display()); + let stores = Storage::parse_additional_image_stores(&opts); + assert_eq!(stores.len(), 1); + + // Multiple stores (comma-separated) + let opts = format!( + "additionalimagestore={},additionalimagestore={}", + store_a.display(), + store_b.display() + ); + let stores = Storage::parse_additional_image_stores(&opts); + assert_eq!(stores.len(), 2); + + // Non-existent path is silently skipped + assert!( + Storage::parse_additional_image_stores("additionalimagestore=/no/such/path").is_empty() + ); + + // Unrelated options are ignored + assert!( + 
Storage::parse_additional_image_stores("overlay.mount_program=/usr/bin/fuse-overlayfs") + .is_empty() + ); + } +} diff --git a/crates/cstorage/src/tar_split.rs b/crates/cstorage/src/tar_split.rs new file mode 100644 index 00000000..c3eeed57 --- /dev/null +++ b/crates/cstorage/src/tar_split.rs @@ -0,0 +1,711 @@ +//! Tar-split integration for reading container layers without full tar serialization. +//! +//! This module provides the `TarSplitFdStream` which reads tar-split metadata files +//! and returns file descriptors for the actual file content, enabling zero-copy +//! access to layer data. +//! +//! # Overview +//! +//! The tar-split format stores tar header metadata separately from file content, +//! allowing reconstruction of tar archives without duplicating the actual file data. +//! This implementation uses that metadata to provide file descriptors directly to +//! the files in the overlay diff directory. +//! +//! # Architecture +//! +//! The tar-split format is NDJSON (newline-delimited JSON), gzip-compressed: +//! - Type 1 (FileType): File/directory references with name, optional size, optional CRC64 +//! - Type 2 (SegmentType): Raw TAR header bytes and padding (base64-encoded) +//! - CRC64-ISO algorithm for checksums + +use std::io::{BufRead, BufReader, Read}; +use std::os::fd::OwnedFd; + +use base64::prelude::*; +use cap_std::fs::{Dir, File}; +use crc::{CRC_64_GO_ISO, Crc}; +use flate2::read::GzDecoder; +use serde::Deserialize; + +use crate::error::{Result, StorageError}; +use crate::layer::Layer; +use crate::storage::Storage; + +/// CRC64-ISO implementation for verifying file checksums. +const CRC64_ISO: Crc = Crc::::new(&CRC_64_GO_ISO); + +/// Item returned from tar-split stream iteration. +#[derive(Debug)] +pub enum TarSplitItem { + /// Raw segment bytes (TAR header + padding) to write directly. + Segment(Vec), + + /// File content to write. + FileContent { + /// File descriptor for reading the content. 
+ /// + /// The caller takes ownership of this file descriptor and is responsible + /// for reading the content and closing it when done. + fd: OwnedFd, + /// Expected file size in bytes. + /// + /// Used for tar padding calculation: TAR files are padded to 512-byte + /// boundaries, so the consumer needs to know the size to write the + /// correct amount of padding after the file content. + size: u64, + /// File path from the tar-split entry. + /// + /// This is the path as recorded in the original tar archive + /// (e.g., "./etc/hosts"). + name: String, + }, +} + +/// Raw tar-split entry from NDJSON format before validation. +#[derive(Debug, Deserialize)] +struct TarSplitEntryRaw { + /// Entry type discriminant: 1 for File, 2 for Segment. + #[serde(rename = "type")] + type_id: u8, + /// File name from TAR header (type 1 only). + #[serde(default)] + name: Option, + /// File size in bytes (type 1 only). + #[serde(default)] + size: Option, + /// CRC64-ISO checksum, base64-encoded (type 1 only). + #[serde(default)] + crc64: Option, + /// Base64-encoded TAR header bytes or padding (type 2 only). + #[serde(default)] + payload: Option, +} + +/// Tar-split entry from NDJSON format. +#[derive(Debug)] +enum TarSplitEntry { + /// File type entry: references a file/directory with metadata. + File { + /// File name from TAR header. + name: Option, + /// File size in bytes. + size: Option, + /// CRC64-ISO checksum (base64-encoded). + crc64: Option, + }, + /// Segment type entry: raw TAR header bytes and padding. + Segment { + /// Base64-encoded TAR header bytes (512 bytes) or padding. + payload: Option, + }, +} + +impl TarSplitEntry { + /// Parse a tar-split entry from raw format with validation. 
+ fn from_raw(raw: TarSplitEntryRaw) -> Result { + match raw.type_id { + 1 => Ok(TarSplitEntry::File { + name: raw.name, + size: raw.size, + crc64: raw.crc64, + }), + 2 => Ok(TarSplitEntry::Segment { + payload: raw.payload, + }), + _ => Err(StorageError::TarSplitError(format!( + "Invalid tar-split entry type: {}", + raw.type_id + ))), + } + } +} + +/// Tar header information extracted from tar-split metadata. +#[derive(Debug, Clone)] +pub struct TarHeader { + /// File path in the tar archive (e.g., "./etc/hosts") + pub name: String, + + /// File mode (permissions and type information) + pub mode: u32, + + /// User ID of the file owner + pub uid: u32, + + /// Group ID of the file owner + pub gid: u32, + + /// File size in bytes + pub size: u64, + + /// Modification time (Unix timestamp) + pub mtime: i64, + + /// Tar entry type flag + pub typeflag: u8, + + /// Link target for symbolic links and hard links + pub linkname: String, + + /// User name of the file owner + pub uname: String, + + /// Group name of the file owner + pub gname: String, + + /// Major device number (for device files) + pub devmajor: u32, + + /// Minor device number (for device files) + pub devminor: u32, +} + +impl TarHeader { + /// Parse a TarHeader from a 512-byte TAR header block. + /// + /// # Errors + /// + /// Returns an error if the header is too short or has an invalid checksum. 
+ pub fn from_bytes(header_bytes: &[u8]) -> Result { + let header_array: &[u8; tar_core::HEADER_SIZE] = header_bytes.try_into().map_err(|_| { + StorageError::TarSplitError(format!( + "TAR header wrong size: {} bytes (expected {})", + header_bytes.len(), + tar_core::HEADER_SIZE + )) + })?; + let header = tar_core::Header::from_bytes(header_array); + + let name = String::from_utf8(header.path_bytes().to_vec()).map_err(|e| { + StorageError::TarSplitError(format!("Non-UTF-8 path in TAR header: {}", e)) + })?; + let mode = header + .mode() + .map_err(|e| StorageError::TarSplitError(format!("Invalid mode: {}", e)))?; + let uid = header + .uid() + .map_err(|e| StorageError::TarSplitError(format!("Invalid uid: {}", e)))? + as u32; + let gid = header + .gid() + .map_err(|e| StorageError::TarSplitError(format!("Invalid gid: {}", e)))? + as u32; + let size = header + .entry_size() + .map_err(|e| StorageError::TarSplitError(format!("Invalid size: {}", e)))?; + let mtime = header + .mtime() + .map_err(|e| StorageError::TarSplitError(format!("Invalid mtime: {}", e)))? + as i64; + let typeflag = header.entry_type().as_byte(); + let link_bytes = header.link_name_bytes(); + let linkname = if link_bytes.is_empty() { + String::new() + } else { + String::from_utf8(link_bytes.to_vec()).map_err(|e| { + StorageError::TarSplitError(format!("Non-UTF-8 link name in TAR header: {}", e)) + })? + }; + let uname = header + .username() + .map(|b| { + String::from_utf8(b.to_vec()).map_err(|e| { + StorageError::TarSplitError(format!("Non-UTF-8 username in TAR header: {}", e)) + }) + }) + .transpose()? + .unwrap_or_default(); + let gname = header + .groupname() + .map(|b| { + String::from_utf8(b.to_vec()).map_err(|e| { + StorageError::TarSplitError(format!( + "Non-UTF-8 group name in TAR header: {}", + e + )) + }) + }) + .transpose()? + .unwrap_or_default(); + let devmajor = header + .device_major() + .map_err(|e| StorageError::TarSplitError(format!("Invalid devmajor: {}", e)))? 
+ .unwrap_or(0); + let devminor = header + .device_minor() + .map_err(|e| StorageError::TarSplitError(format!("Invalid devminor: {}", e)))? + .unwrap_or(0); + + Ok(TarHeader { + name, + mode, + uid, + gid, + size, + mtime, + typeflag, + linkname, + uname, + gname, + devmajor, + devminor, + }) + } + + /// Check if this header represents a regular file. + pub fn is_regular_file(&self) -> bool { + self.typeflag == b'0' || self.typeflag == b'\0' + } + + /// Check if this header represents a directory. + pub fn is_directory(&self) -> bool { + self.typeflag == b'5' + } + + /// Check if this header represents a symbolic link. + pub fn is_symlink(&self) -> bool { + self.typeflag == b'2' + } + + /// Check if this header represents a hard link. + pub fn is_hardlink(&self) -> bool { + self.typeflag == b'1' + } + + /// Normalize the path by stripping leading "./" + pub fn normalized_name(&self) -> &str { + self.name.strip_prefix("./").unwrap_or(&self.name) + } +} + +/// Stream that reads tar-split metadata and provides file descriptors for file content. +#[derive(Debug)] +pub struct TarSplitFdStream { + /// The current layer for file lookups. + layer: Layer, + + /// Storage root directory for accessing parent layers on-demand. + storage_root: Dir, + + /// Gzip decompressor reading from the tar-split file. + reader: BufReader>, + + /// Entry counter for debugging and error messages. + entry_count: usize, +} + +impl TarSplitFdStream { + /// Create a new tar-split stream for a layer. + /// + /// # Errors + /// + /// Returns an error if the tar-split file doesn't exist or cannot be opened. 
+ pub fn new(storage: &Storage, layer: &Layer) -> Result { + // Open overlay-layers directory via Dir handle + let layers_dir = storage.root_dir().open_dir("overlay-layers").map_err(|e| { + StorageError::TarSplitError(format!("Failed to open overlay-layers directory: {}", e)) + })?; + + // Open tar-split file relative to layers directory + let filename = format!("{}.tar-split.gz", layer.id()); + let file = layers_dir.open(&filename).map_err(|e| { + StorageError::TarSplitError(format!( + "Failed to open tar-split file {}: {}", + filename, e + )) + })?; + + // Wrap in gzip decompressor + let gz_decoder = GzDecoder::new(file); + let reader = BufReader::new(gz_decoder); + + // Open the layer for on-demand file lookups + let layer = Layer::open(storage, layer.id())?; + + // Clone storage root dir for on-demand parent layer access + let storage_root = storage.root_dir().try_clone()?; + + Ok(Self { + layer, + storage_root, + reader, + entry_count: 0, + }) + } + + /// Open a file in the layer chain, trying current layer first then parents. + fn open_file_in_chain(&self, path: &str) -> Result { + // Normalize path (remove leading ./) + let normalized_path = path.strip_prefix("./").unwrap_or(path); + + // Try to open in current layer first + match self.layer.diff_dir().open(normalized_path) { + Ok(file) => return Ok(file), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + // Continue to search parent layers + } + Err(e) => return Err(StorageError::Io(e)), + } + + // Search parent layers on-demand + self.search_parent_layers(&self.layer, normalized_path, 0) + } + + /// Recursively search parent layers for a file. 
+ fn search_parent_layers( + &self, + current_layer: &Layer, + path: &str, + depth: usize, + ) -> Result { + const MAX_DEPTH: usize = 500; + + if depth >= MAX_DEPTH { + return Err(StorageError::TarSplitError(format!( + "Layer chain exceeds maximum depth of {} while searching for file: {}", + MAX_DEPTH, path + ))); + } + + // Get parent link IDs + let parent_links = current_layer.parent_links(); + + // Try each parent + for link_id in parent_links { + // Resolve link ID to layer ID by reading the symlink directly + let parent_id = self.resolve_link_direct(link_id)?; + + // Try to open file directly in parent's diff directory + match self.open_file_in_layer(&parent_id, path) { + Ok(file) => return Ok(file), + Err(StorageError::Io(e)) if e.kind() == std::io::ErrorKind::NotFound => { + // File not in this parent, recursively search its parents + match self.search_by_layer_id(&parent_id, path, depth + 1) { + Ok(file) => return Ok(file), + Err(StorageError::TarSplitError(_)) => continue, // File not found in this branch, try next parent + Err(e) => return Err(e), + } + } + Err(e) => return Err(e), + } + } + + Err(StorageError::TarSplitError(format!( + "File not found in layer chain: {}", + path + ))) + } + + /// Search for a file starting from a layer ID. 
+ fn search_by_layer_id( + &self, + layer_id: &str, + path: &str, + depth: usize, + ) -> Result { + const MAX_DEPTH: usize = 500; + + if depth >= MAX_DEPTH { + return Err(StorageError::TarSplitError(format!( + "Layer chain exceeds maximum depth of {} while searching for file: {}", + MAX_DEPTH, path + ))); + } + + // Try to open file in this layer + match self.open_file_in_layer(layer_id, path) { + Ok(file) => return Ok(file), + Err(StorageError::Io(e)) if e.kind() == std::io::ErrorKind::NotFound => { + // File not found, check parents + } + Err(e) => return Err(e), + } + + // Read parent links for this layer + let parent_links = self.read_layer_parent_links(layer_id)?; + + // Try each parent + for link_id in parent_links { + let parent_id = self.resolve_link_direct(&link_id)?; + match self.search_by_layer_id(&parent_id, path, depth + 1) { + Ok(file) => return Ok(file), + Err(StorageError::TarSplitError(_)) => continue, // File not found in this branch, try next parent + Err(e) => return Err(e), + } + } + + Err(StorageError::TarSplitError(format!( + "File not found in layer chain: {}", + path + ))) + } + + /// Resolve a link ID to layer ID by directly reading the symlink. + fn resolve_link_direct(&self, link_id: &str) -> Result { + let overlay_dir = self.storage_root.open_dir("overlay")?; + let link_dir = overlay_dir.open_dir("l")?; + let target = link_dir.read_link(link_id).map_err(|e| { + StorageError::LinkReadError(format!("Failed to read link {}: {}", link_id, e)) + })?; + + // Extract layer ID from symlink target (format: ..//diff) + let target_str = target.to_str().ok_or_else(|| { + StorageError::LinkReadError("Invalid UTF-8 in link target".to_string()) + })?; + let components: Vec<&str> = target_str.split('/').collect(); + if components.len() >= 2 { + let layer_id = components[components.len() - 2]; + if !layer_id.is_empty() && layer_id != ".." 
{ + return Ok(layer_id.to_string()); + } + } + Err(StorageError::LinkReadError(format!( + "Invalid link target format: {}", + target_str + ))) + } + + /// Open a file in a specific layer's diff directory. + fn open_file_in_layer(&self, layer_id: &str, path: &str) -> Result { + let overlay_dir = self.storage_root.open_dir("overlay")?; + let layer_dir = overlay_dir.open_dir(layer_id)?; + let diff_dir = layer_dir.open_dir("diff")?; + diff_dir.open(path).map_err(StorageError::Io) + } + + /// Read parent link IDs from a layer's lower file. + fn read_layer_parent_links(&self, layer_id: &str) -> Result> { + let overlay_dir = self.storage_root.open_dir("overlay")?; + let layer_dir = overlay_dir.open_dir(layer_id)?; + + match layer_dir.read_to_string("lower") { + Ok(content) => Ok(content + .trim() + .split(':') + .filter_map(|s| s.strip_prefix("l/")) + .map(|s| s.to_string()) + .collect()), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Vec::new()), // Base layer has no lower file + Err(e) => Err(StorageError::Io(e)), + } + } + + /// Verify CRC64-ISO checksum of a file. 
+ fn verify_crc64( + &self, + file: &mut cap_std::fs::File, + expected_b64: &str, + size: u64, + ) -> Result<()> { + // Decode base64 checksum + let expected_bytes = BASE64_STANDARD.decode(expected_b64).map_err(|e| { + StorageError::TarSplitError(format!("Failed to decode base64 CRC64: {}", e)) + })?; + + if expected_bytes.len() != 8 { + return Err(StorageError::TarSplitError(format!( + "Invalid CRC64 length: {} bytes", + expected_bytes.len() + ))); + } + + // Convert to u64 (big-endian) + let expected = u64::from_be_bytes(expected_bytes.try_into().unwrap()); + + // Compute CRC64 of file content + let mut digest = CRC64_ISO.digest(); + let mut buffer = vec![0u8; 8192]; + let mut bytes_read = 0u64; + + loop { + let n = file.read(&mut buffer).map_err(|e| { + StorageError::TarSplitError(format!( + "Failed to read file for CRC64 verification: {}", + e + )) + })?; + if n == 0 { + break; + } + digest.update(&buffer[..n]); + bytes_read += n as u64; + } + + // Verify size matches + if bytes_read != size { + return Err(StorageError::TarSplitError(format!( + "File size mismatch: expected {}, got {}", + size, bytes_read + ))); + } + + let computed = digest.finalize(); + if computed != expected { + return Err(StorageError::TarSplitError(format!( + "CRC64 mismatch: expected {:016x}, got {:016x}", + expected, computed + ))); + } + + Ok(()) + } + + /// Read the next item from the tar-split stream. 
+ /// + /// Returns: + /// - `Ok(Some(item))` - Next item was read successfully + /// - `Ok(None)` - End of stream reached + /// - `Err(...)` - Error occurred during reading + #[allow(clippy::should_implement_trait)] + pub fn next(&mut self) -> Result> { + loop { + // Read next line from NDJSON stream + let mut line = String::new(); + match self.reader.read_line(&mut line) { + Ok(0) => { + return Ok(None); + } + Ok(_) => { + // Parse NDJSON entry + let raw: TarSplitEntryRaw = serde_json::from_str(&line).map_err(|e| { + StorageError::TarSplitError(format!( + "Failed to parse tar-split entry: {}", + e + )) + })?; + let entry = TarSplitEntry::from_raw(raw)?; + + match entry { + TarSplitEntry::Segment { payload } => { + if let Some(payload_b64) = payload { + let payload_bytes = + BASE64_STANDARD.decode(&payload_b64).map_err(|e| { + StorageError::TarSplitError(format!( + "Failed to decode base64 payload: {}", + e + )) + })?; + + return Ok(Some(TarSplitItem::Segment(payload_bytes))); + } + // Empty segment, continue + } + + TarSplitEntry::File { name, size, crc64 } => { + self.entry_count += 1; + + // Check if this file has content to write + let file_size = size.unwrap_or(0); + if file_size > 0 { + // Regular file with content - open it + let path = name.as_ref().ok_or_else(|| { + StorageError::TarSplitError( + "FileType entry missing name".to_string(), + ) + })?; + + let mut file = self.open_file_in_chain(path)?; + + // Verify CRC64 if provided + if let Some(ref crc64_b64) = crc64 { + self.verify_crc64(&mut file, crc64_b64, file_size)?; + + // Reopen file since we consumed it for CRC check + file = self.open_file_in_chain(path)?; + } + + // Convert to OwnedFd and return + let std_file = file.into_std(); + let owned_fd: OwnedFd = std_file.into(); + return Ok(Some(TarSplitItem::FileContent { + fd: owned_fd, + size: file_size, + name: path.clone(), + })); + } + // Empty file or directory - header already in preceding Segment + } + } + } + Err(e) => { + return 
Err(StorageError::TarSplitError(format!( + "Failed to read tar-split line: {}", + e + ))); + } + } + } + } + + /// Get the number of entries processed so far. + pub fn entry_count(&self) -> usize { + self.entry_count + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_tar_header_type_checks() { + let mut header = TarHeader { + name: "test.txt".to_string(), + mode: 0o644, + uid: 1000, + gid: 1000, + size: 100, + mtime: 0, + typeflag: b'0', + linkname: String::new(), + uname: "user".to_string(), + gname: "group".to_string(), + devmajor: 0, + devminor: 0, + }; + + assert!(header.is_regular_file()); + assert!(!header.is_directory()); + assert!(!header.is_symlink()); + + header.typeflag = b'5'; + assert!(!header.is_regular_file()); + assert!(header.is_directory()); + + header.typeflag = b'2'; + assert!(header.is_symlink()); + } + + #[test] + fn test_tar_split_entry_deserialization() { + // Test type 2 (Segment) with integer discriminant + let json_segment = r#"{"type":2,"payload":"dXN0YXIAMDA="}"#; + let raw: TarSplitEntryRaw = serde_json::from_str(json_segment).unwrap(); + let entry = TarSplitEntry::from_raw(raw).unwrap(); + match entry { + TarSplitEntry::Segment { payload } => { + assert_eq!(payload, Some("dXN0YXIAMDA=".to_string())); + } + _ => panic!("Expected Segment variant"), + } + + // Test type 1 (File) with integer discriminant + let json_file = r#"{"type":1,"name":"./etc/hosts","size":123,"crc64":"AAAAAAAAAA=="}"#; + let raw: TarSplitEntryRaw = serde_json::from_str(json_file).unwrap(); + let entry = TarSplitEntry::from_raw(raw).unwrap(); + match entry { + TarSplitEntry::File { name, size, crc64 } => { + assert_eq!(name, Some("./etc/hosts".to_string())); + assert_eq!(size, Some(123)); + assert_eq!(crc64, Some("AAAAAAAAAA==".to_string())); + } + _ => panic!("Expected File variant"), + } + + // Test invalid type + let json_invalid = r#"{"type":99}"#; + let raw: TarSplitEntryRaw = serde_json::from_str(json_invalid).unwrap(); + let result = 
TarSplitEntry::from_raw(raw); + assert!(result.is_err()); + } +} diff --git a/crates/cstorage/src/userns.rs b/crates/cstorage/src/userns.rs new file mode 100644 index 00000000..ab44d03c --- /dev/null +++ b/crates/cstorage/src/userns.rs @@ -0,0 +1,67 @@ +//! User namespace utilities for rootless containers-storage access. +//! +//! This module provides utilities for determining when user namespace entry is +//! needed to access overlay storage files that are owned by remapped UIDs/GIDs. +//! +//! # Background +//! +//! When podman runs rootless, it uses user namespaces to remap UIDs. Files in +//! the overlay storage are owned by these remapped UIDs (e.g., UID 100000+N on +//! the host corresponds to UID N inside the container). These files also retain +//! their original permission bits from the container image. +//! +//! Files with restrictive permissions (e.g., `/etc/shadow` with mode 0600) are +//! only readable by their owner - a remapped UID we cannot access as an +//! unprivileged user. +//! +//! # Solution +//! +//! Rather than manually setting up user namespaces (parsing `/etc/subuid`, +//! calling `newuidmap`/`newgidmap`, etc.), we delegate to `podman unshare` +//! which handles all the edge cases. See [`crate::userns_helper`] for the +//! helper process that runs inside the user namespace. + +use rustix::process::getuid; +use rustix::thread::{CapabilitySet, capabilities}; + +/// Check if the current process can read arbitrary files regardless of permissions. +/// +/// This returns `true` if: +/// - The process is running as real root (UID 0), or +/// - The process has `CAP_DAC_OVERRIDE` in its effective capability set +/// +/// When this returns `true`, there's no need to spawn a userns helper for +/// file access - the process can already read any file in the storage. 
+pub fn can_bypass_file_permissions() -> bool { + // Real root can read anything + if getuid().is_root() { + return true; + } + + // Check for CAP_DAC_OVERRIDE capability + if let Ok(caps) = capabilities(None) + && caps.effective.contains(CapabilitySet::DAC_OVERRIDE) + { + return true; + } + + false +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_can_bypass_file_permissions() { + // This function should not panic and should return a consistent result + let result1 = can_bypass_file_permissions(); + let result2 = can_bypass_file_permissions(); + assert_eq!(result1, result2); + + // If we're root, it should return true + if getuid().is_root() { + assert!(result1, "root should be able to bypass permissions"); + } + } +} diff --git a/crates/cstorage/src/userns_helper.rs b/crates/cstorage/src/userns_helper.rs new file mode 100644 index 00000000..f28ef6dc --- /dev/null +++ b/crates/cstorage/src/userns_helper.rs @@ -0,0 +1,1086 @@ +//! User namespace helper process for privileged storage access. +//! +//! This module provides a mechanism for unprivileged processes to access +//! containers-storage content that has restrictive permissions. It works by +//! spawning a helper process inside a user namespace (via `podman unshare`) +//! that can read any file, and communicating with it via JSON-RPC over a +//! Unix socket with fd-passing. +//! +//! # Why This Is Needed +//! +//! Container images contain files with various permission bits (e.g., `/etc/shadow` +//! with mode 0600). When stored in rootless containers-storage, these files are +//! owned by remapped UIDs that the unprivileged user cannot access. Even though +//! we have tar-split metadata telling us the file structure, we still need to +//! read the actual file content. +//! +//! # Architecture +//! +//! The helper uses stdin (fd 0) for IPC, avoiding the need for unsafe code: +//! +//! ```text +//! ┌─────────────────────────────────────┐ +//! │ Parent Process │ +//! 
│ (unprivileged, library user) │ +//! │ │ +//! │ StorageProxy::spawn() │ +//! │ │ │ +//! │ ├─► Create socketpair │ +//! │ ├─► Spawn: podman unshare │ +//! │ │ /proc/self/exe │ +//! │ │ (child's stdin=socket) │ +//! │ │ │ +//! │ proxy.stream_layer() ───────────► │ +//! │ │ │ +//! │ ◄─── receives OwnedFd via SCM_RIGHTS│ +//! └─────────────────────────────────────┘ +//! ``` +//! +//! # Usage +//! +//! Library users must call [`init_if_helper`] early in their `main()` function: +//! +//! ```no_run +//! // This must be called before any other cstorage operations. +//! // If this process was spawned as a userns helper, it will +//! // serve requests and exit, never returning. +//! cstorage::userns_helper::init_if_helper(); +//! +//! // Normal application code continues here... +//! ``` + +use std::os::fd::AsFd; +use std::os::unix::io::OwnedFd; +use std::os::unix::net::UnixStream as StdUnixStream; +use std::path::Path; +use std::process::{Child, Command, Stdio}; + +use base64::prelude::*; +use jsonrpc_fdpass::transport::UnixSocketTransport; +use jsonrpc_fdpass::{JsonRpcMessage, JsonRpcRequest, JsonRpcResponse, MessageWithFds}; +use rustix::io::dup; +use rustix::process::{Signal, set_parent_process_death_signal}; +use serde::{Deserialize, Serialize}; +use tokio::net::UnixStream as TokioUnixStream; + +use crate::layer::Layer; +use crate::storage::Storage; +use crate::tar_split::{TarSplitFdStream, TarSplitItem}; +use crate::userns::can_bypass_file_permissions; + +/// Environment variable that indicates this process is a userns helper. +const HELPER_ENV: &str = "__CSTORAGE_USERNS_HELPER"; + +/// JSON-RPC 2.0 error codes. +/// +/// These codes follow the JSON-RPC 2.0 specification: +/// - Standard errors: -32700 to -32600 +/// - Server errors: -32099 to -32000 (implementation-defined) +mod error_codes { + /// Invalid params - the params passed to a method are invalid. + pub const INVALID_PARAMS: i32 = -32602; + + /// Method not found - the requested method does not exist. 
+ pub const METHOD_NOT_FOUND: i32 = -32601; + + /// Resource not found - the requested resource (image, layer, etc.) was not found. + pub const RESOURCE_NOT_FOUND: i32 = -32000; + + /// Internal error - a server-side error occurred (I/O, storage access, etc.). + pub const INTERNAL_ERROR: i32 = -32003; +} + +/// JSON-RPC method names. +mod methods { + /// Open a file and return its fd. + pub const OPEN_FILE: &str = "userns.openFile"; + /// Shutdown the helper process. + pub const SHUTDOWN: &str = "userns.shutdown"; + /// List images in storage. + pub const LIST_IMAGES: &str = "userns.listImages"; + /// Get image metadata. + pub const GET_IMAGE: &str = "userns.getImage"; + /// Stream layer as tar-split entries with fds. + pub const STREAM_LAYER: &str = "userns.streamLayer"; +} + +/// Parameters for the open_file method. +#[derive(Debug, Serialize, Deserialize)] +pub struct OpenFileParams { + /// Path to open. + pub path: String, +} + +/// Result for the open_file method. +#[derive(Debug, Serialize, Deserialize)] +pub struct OpenFileResult { + /// True if successful (fd is passed out-of-band). + pub success: bool, +} + +/// Parameters for list_images method. +#[derive(Debug, Serialize, Deserialize)] +pub struct ListImagesParams { + /// Storage root path. + pub storage_path: String, +} + +/// Image info returned by list_images. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ImageInfo { + /// Image ID. + pub id: String, + /// Image names/tags. + pub names: Vec, +} + +/// Result for list_images method. +#[derive(Debug, Serialize, Deserialize)] +pub struct ListImagesResult { + /// List of images. + pub images: Vec, +} + +/// Parameters for get_image method. +#[derive(Debug, Serialize, Deserialize)] +pub struct GetImageParams { + /// Storage root path. + pub storage_path: String, + /// Image ID or name. + pub image_ref: String, +} + +/// Result for get_image method. +#[derive(Debug, Serialize, Deserialize)] +pub struct GetImageResult { + /// Image ID. 
+ pub id: String, + /// Image names. + pub names: Vec, + /// Layer diff IDs (sha256:...). + pub layer_diff_ids: Vec, + /// Storage layer IDs (internal IDs used by containers-storage). + pub storage_layer_ids: Vec, +} + +/// Parameters for stream_layer method. +#[derive(Debug, Serialize, Deserialize)] +pub struct StreamLayerParams { + /// Storage root path. + pub storage_path: String, + /// Layer ID (storage layer ID, not diff ID). + pub layer_id: String, +} + +/// Streaming notification for a segment. +#[derive(Debug, Serialize, Deserialize)] +pub struct StreamSegmentNotification { + /// Base64-encoded segment data. + pub data: String, +} + +/// Streaming notification for a file (fd is passed out-of-band). +#[derive(Debug, Serialize, Deserialize)] +pub struct StreamFileNotification { + /// File path in the tar. + pub name: String, + /// File size. + pub size: u64, +} + +/// Result for stream_layer method (sent after all notifications). +#[derive(Debug, Serialize, Deserialize)] +pub struct StreamLayerResult { + /// Number of items streamed. + pub items_sent: usize, +} + +/// Error type for userns helper operations. +#[derive(Debug, thiserror::Error)] +pub enum HelperError { + /// Failed to create socket. + #[error("failed to create socket: {0}")] + Socket(#[source] std::io::Error), + + /// Failed to spawn helper process. + #[error("failed to spawn helper process: {0}")] + Spawn(#[source] std::io::Error), + + /// IPC error. + #[error("IPC error: {0}")] + Ipc(String), + + /// Helper returned an error. + #[error("helper error: {0}")] + HelperError(String), + + /// I/O error. + #[error("I/O error: {0}")] + Io(#[from] std::io::Error), + + /// JSON-RPC error from the helper. + #[error("RPC error: code={code}, message={message}")] + RpcError { + /// JSON-RPC error code. + code: i32, + /// Error message. + message: String, + }, +} + +/// Check if this process was spawned as a userns helper and run the helper loop if so. 
+/// +/// This function **must** be called early in `main()`, before any other cstorage +/// operations. If this process was spawned as a helper, this function will: +/// +/// 1. Read from stdin (which is a Unix socket from the parent) +/// 2. Serve JSON-RPC requests for file operations +/// 3. Exit when the parent closes the connection +/// +/// If this is not a helper process, this function returns immediately. +pub fn init_if_helper() { + // Check if we're a helper via environment variable + if std::env::var(HELPER_ENV).is_err() { + return; // Not a helper, continue normal execution + } + + // Ensure we exit if parent dies (avoids orphan helper processes) + if let Err(e) = set_parent_process_death_signal(Some(Signal::TERM)) { + eprintln!("cstorage helper: failed to set parent death signal: {}", e); + // Continue anyway - this is a nice-to-have, not critical + } + + // We're a helper - stdin is our IPC socket. + // Use dup() to get a new owned fd from stdin (fd 0). + // This is safe because: + // 1. We were spawned with stdin set to a socket + // 2. dup() gives us a new fd that we own + // 3. We use std::io::stdin().as_fd() which is the safe way to get the fd + let stdin_fd = match dup(std::io::stdin().as_fd()) { + Ok(fd) => fd, + Err(e) => { + eprintln!("cstorage helper: failed to dup stdin: {}", e); + std::process::exit(1); + } + }; + let std_socket = StdUnixStream::from(stdin_fd); + + // Run the helper loop (never returns on success) + if let Err(e) = run_helper_loop_blocking(std_socket) { + eprintln!("cstorage helper: error in helper loop: {}", e); + std::process::exit(1); + } + std::process::exit(0); +} + +/// Run the helper loop synchronously by creating a tokio runtime. 
+fn run_helper_loop_blocking(std_socket: StdUnixStream) -> std::result::Result<(), HelperError> { + // Set non-blocking for tokio + std_socket.set_nonblocking(true)?; + + // Create a tokio runtime for the helper + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .map_err(|e| HelperError::Ipc(format!("failed to create tokio runtime: {}", e)))?; + + rt.block_on(run_helper_loop_async(std_socket)) +} + +/// Run the helper loop, serving requests from the parent. +async fn run_helper_loop_async(std_socket: StdUnixStream) -> std::result::Result<(), HelperError> { + // Convert std socket to tokio socket + let tokio_socket = TokioUnixStream::from_std(std_socket) + .map_err(|e| HelperError::Ipc(format!("failed to convert socket: {}", e)))?; + + let transport = UnixSocketTransport::new(tokio_socket); + let (mut sender, mut receiver) = transport.split(); + + tracing::debug!("userns helper: starting request loop"); + + loop { + let msg_with_fds = match receiver.receive().await { + Ok(m) => m, + Err(jsonrpc_fdpass::Error::ConnectionClosed) => { + tracing::debug!("userns helper: connection closed"); + return Ok(()); + } + Err(e) => { + return Err(HelperError::Ipc(format!( + "failed to receive message: {}", + e + ))); + } + }; + + match msg_with_fds.message { + JsonRpcMessage::Request(request) => { + let id = request.id.clone(); + + // Handle stream_layer specially since it needs to send multiple messages + if request.method == methods::STREAM_LAYER { + if let Err((code, msg)) = handle_stream_layer(&request, &mut sender).await { + let error = jsonrpc_fdpass::JsonRpcError::owned(code, msg, None::<()>); + let response = JsonRpcResponse::error(error, id); + let message = + MessageWithFds::new(JsonRpcMessage::Response(response), vec![]); + sender.send(message).await.map_err(|e| { + HelperError::Ipc(format!("failed to send error response: {}", e)) + })?; + } + // Success response is sent by handle_stream_layer + continue; + } + + let (result, fds) = 
handle_request(&request); + + match result { + Ok(response_value) => { + let response = JsonRpcResponse::success(response_value, id); + let message = MessageWithFds::new(JsonRpcMessage::Response(response), fds); + sender.send(message).await.map_err(|e| { + HelperError::Ipc(format!("failed to send response: {}", e)) + })?; + } + Err((code, message_str)) => { + let error = + jsonrpc_fdpass::JsonRpcError::owned(code, message_str, None::<()>); + let response = JsonRpcResponse::error(error, id); + let message = + MessageWithFds::new(JsonRpcMessage::Response(response), vec![]); + sender.send(message).await.map_err(|e| { + HelperError::Ipc(format!("failed to send error response: {}", e)) + })?; + } + } + + // Check for shutdown request (handle after sending response) + if request.method == methods::SHUTDOWN { + tracing::debug!("userns helper: received shutdown request"); + return Ok(()); + } + } + JsonRpcMessage::Notification(notif) => { + if notif.method == methods::SHUTDOWN { + tracing::debug!("userns helper: received shutdown notification"); + return Ok(()); + } + // Ignore other notifications + } + JsonRpcMessage::Response(_) => { + // Unexpected response - ignore + } + } + } +} + +/// Handle stream_layer request - sends multiple notifications with fds. 
+async fn handle_stream_layer( + request: &JsonRpcRequest, + sender: &mut jsonrpc_fdpass::transport::Sender, +) -> std::result::Result<(), (i32, String)> { + let params: StreamLayerParams = request + .params + .as_ref() + .and_then(|p| serde_json::from_value(p.clone()).ok()) + .ok_or(( + error_codes::INVALID_PARAMS, + "invalid params for streamLayer".to_string(), + ))?; + + let storage = Storage::open(¶ms.storage_path).map_err(|e| { + ( + error_codes::INTERNAL_ERROR, + format!("failed to open storage: {}", e), + ) + })?; + + let layer = Layer::open(&storage, ¶ms.layer_id).map_err(|e| { + ( + error_codes::RESOURCE_NOT_FOUND, + format!("layer not found: {}", e), + ) + })?; + + let mut stream = TarSplitFdStream::new(&storage, &layer).map_err(|e| { + ( + error_codes::INTERNAL_ERROR, + format!("failed to create tar-split stream: {}", e), + ) + })?; + + let mut items_sent = 0usize; + + // Stream all items as notifications + while let Some(item) = stream + .next() + .map_err(|e| (error_codes::INTERNAL_ERROR, format!("stream error: {}", e)))? 
+ { + match item { + TarSplitItem::Segment(bytes) => { + // Send segment as base64-encoded notification + let params = StreamSegmentNotification { + data: BASE64_STANDARD.encode(&bytes), + }; + let notif = jsonrpc_fdpass::JsonRpcNotification::new( + "stream.segment".to_string(), + Some(serde_json::to_value(¶ms).unwrap()), + ); + let message = MessageWithFds::new(JsonRpcMessage::Notification(notif), vec![]); + sender.send(message).await.map_err(|e| { + ( + error_codes::INTERNAL_ERROR, + format!("failed to send segment: {}", e), + ) + })?; + items_sent += 1; + } + TarSplitItem::FileContent { fd, size, name } => { + // Send file notification with fd + let params = StreamFileNotification { name, size }; + let notif = jsonrpc_fdpass::JsonRpcNotification::new( + "stream.file".to_string(), + Some(serde_json::to_value(¶ms).unwrap()), + ); + let message = MessageWithFds::new(JsonRpcMessage::Notification(notif), vec![fd]); + sender.send(message).await.map_err(|e| { + ( + error_codes::INTERNAL_ERROR, + format!("failed to send file: {}", e), + ) + })?; + items_sent += 1; + } + } + } + + // Send success response + let result = StreamLayerResult { items_sent }; + let response = + JsonRpcResponse::success(serde_json::to_value(result).unwrap(), request.id.clone()); + let message = MessageWithFds::new(JsonRpcMessage::Response(response), vec![]); + sender.send(message).await.map_err(|e| { + ( + error_codes::INTERNAL_ERROR, + format!("failed to send response: {}", e), + ) + })?; + + Ok(()) +} + +/// Handle a JSON-RPC request. 
+fn handle_request( + request: &JsonRpcRequest, +) -> ( + std::result::Result, + Vec, +) { + match request.method.as_str() { + methods::OPEN_FILE => { + let params: OpenFileParams = match request + .params + .as_ref() + .and_then(|p| serde_json::from_value(p.clone()).ok()) + { + Some(p) => p, + None => { + return ( + Err(( + error_codes::INVALID_PARAMS, + "invalid params: missing 'path' field".to_string(), + )), + vec![], + ); + } + }; + + match std::fs::File::open(¶ms.path) { + Ok(file) => { + let fd: OwnedFd = file.into(); + let result = OpenFileResult { success: true }; + (Ok(serde_json::to_value(result).unwrap()), vec![fd]) + } + Err(e) => ( + Err(( + error_codes::INTERNAL_ERROR, + format!("failed to open file: {}", e), + )), + vec![], + ), + } + } + methods::LIST_IMAGES => handle_list_images(request), + methods::GET_IMAGE => handle_get_image(request), + methods::SHUTDOWN => { + // Just return success - the loop will exit after sending the response + (Ok(serde_json::json!({"success": true})), vec![]) + } + _ => ( + Err(( + error_codes::METHOD_NOT_FOUND, + format!("method not found: {}", request.method), + )), + vec![], + ), + } +} + +/// Handle list_images request. 
+fn handle_list_images( + request: &JsonRpcRequest, +) -> ( + std::result::Result, + Vec, +) { + let params: ListImagesParams = match request + .params + .as_ref() + .and_then(|p| serde_json::from_value(p.clone()).ok()) + { + Some(p) => p, + None => { + return ( + Err(( + error_codes::INVALID_PARAMS, + "invalid params for listImages".to_string(), + )), + vec![], + ); + } + }; + + let storage = match Storage::open(¶ms.storage_path) { + Ok(s) => s, + Err(e) => { + return ( + Err(( + error_codes::INTERNAL_ERROR, + format!("failed to open storage: {}", e), + )), + vec![], + ); + } + }; + + let images = match storage.list_images() { + Ok(imgs) => imgs, + Err(e) => { + return ( + Err(( + error_codes::INTERNAL_ERROR, + format!("failed to list images: {}", e), + )), + vec![], + ); + } + }; + + let image_infos: Vec = images + .iter() + .map(|img| ImageInfo { + id: img.id().to_string(), + names: img.names(&storage).unwrap_or_default(), + }) + .collect(); + + let result = ListImagesResult { + images: image_infos, + }; + (Ok(serde_json::to_value(result).unwrap()), vec![]) +} + +/// Handle get_image request. 
+fn handle_get_image( + request: &JsonRpcRequest, +) -> ( + std::result::Result, + Vec, +) { + let params: GetImageParams = match request + .params + .as_ref() + .and_then(|p| serde_json::from_value(p.clone()).ok()) + { + Some(p) => p, + None => { + return ( + Err(( + error_codes::INVALID_PARAMS, + "invalid params for getImage".to_string(), + )), + vec![], + ); + } + }; + + let storage = match Storage::open(¶ms.storage_path) { + Ok(s) => s, + Err(e) => { + return ( + Err(( + error_codes::INTERNAL_ERROR, + format!("failed to open storage: {}", e), + )), + vec![], + ); + } + }; + + // Try by ID first, then by name + let image = match crate::image::Image::open(&storage, ¶ms.image_ref) { + Ok(img) => img, + Err(_) => match storage.find_image_by_name(¶ms.image_ref) { + Ok(img) => img, + Err(e) => { + return ( + Err(( + error_codes::RESOURCE_NOT_FOUND, + format!("image not found: {}", e), + )), + vec![], + ); + } + }, + }; + + let config = match image.config() { + Ok(cfg) => cfg, + Err(e) => { + return ( + Err(( + error_codes::INTERNAL_ERROR, + format!("failed to read config: {}", e), + )), + vec![], + ); + } + }; + + let diff_ids: Vec = config + .rootfs() + .diff_ids() + .iter() + .map(|s| s.to_string()) + .collect(); + + let storage_layer_ids = match image.storage_layer_ids(std::slice::from_ref(&storage)) { + Ok(ids) => ids, + Err(e) => { + return ( + Err(( + error_codes::INTERNAL_ERROR, + format!("failed to get storage layer IDs: {}", e), + )), + vec![], + ); + } + }; + + let result = GetImageResult { + id: image.id().to_string(), + names: image.names(&storage).unwrap_or_default(), + layer_diff_ids: diff_ids, + storage_layer_ids, + }; + (Ok(serde_json::to_value(result).unwrap()), vec![]) +} + +/// Proxy for accessing files via the userns helper process. +/// +/// This spawns a helper process (via `podman unshare`) that runs inside a +/// user namespace and can read files with restrictive permissions. File +/// descriptors are passed back via SCM_RIGHTS. 
+pub struct StorageProxy { + child: Child, + sender: jsonrpc_fdpass::transport::Sender, + receiver: jsonrpc_fdpass::transport::Receiver, + next_id: u64, +} + +impl std::fmt::Debug for StorageProxy { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("StorageProxy") + .field("child_pid", &self.child.id()) + .finish_non_exhaustive() + } +} + +impl StorageProxy { + /// Spawn a userns helper process. + /// + /// If the current process can already bypass file permissions (running as + /// root or has CAP_DAC_OVERRIDE), this returns `Ok(None)` since no helper + /// is needed. + pub async fn spawn() -> std::result::Result, HelperError> { + // Check if we even need a helper + if can_bypass_file_permissions() { + return Ok(None); + } + + Self::spawn_helper().await.map(Some) + } + + /// Spawn the helper unconditionally. + async fn spawn_helper() -> std::result::Result { + let exe = std::fs::read_link("/proc/self/exe").map_err(HelperError::Io)?; + Self::spawn_helper_with_binary(exe).await + } + + /// Spawn the helper with a specific binary path. + /// + /// This is used when the default /proc/self/exe is not suitable, + /// such as when running from a test harness. + async fn spawn_helper_with_binary( + exe: std::path::PathBuf, + ) -> std::result::Result { + // Create a socket pair - one end for us, one for the child's stdin + let (parent_sock, child_sock) = StdUnixStream::pair().map_err(HelperError::Socket)?; + + // Spawn via podman unshare, with child_sock as the child's stdin. + // We use `env` to set the HELPER_ENV because podman unshare doesn't + // propagate the parent's environment to the inner command. 
+ let child = Command::new("podman") + .arg("unshare") + .arg("env") + .arg(format!("{}=1", HELPER_ENV)) + .arg(&exe) + .stdin(Stdio::from(OwnedFd::from(child_sock))) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()) + .spawn() + .map_err(HelperError::Spawn)?; + + // Convert our socket to async + parent_sock.set_nonblocking(true)?; + let tokio_socket = TokioUnixStream::from_std(parent_sock) + .map_err(|e| HelperError::Ipc(format!("failed to convert socket: {}", e)))?; + + let transport = UnixSocketTransport::new(tokio_socket); + let (sender, receiver) = transport.split(); + + Ok(Self { + child, + sender, + receiver, + next_id: 1, + }) + } + + /// Open a file via the helper, returning its fd. + /// + /// # Arguments + /// + /// * `path` - The path to open (should be absolute) + /// + /// # Returns + /// + /// The opened file descriptor, which can be used for reading. + pub async fn open_file( + &mut self, + path: impl AsRef, + ) -> std::result::Result { + let params = OpenFileParams { + path: path.as_ref().to_string_lossy().to_string(), + }; + + let id = self.next_id; + self.next_id += 1; + + let request = JsonRpcRequest::new( + methods::OPEN_FILE.to_string(), + Some(serde_json::to_value(¶ms).unwrap()), + serde_json::Value::Number(id.into()), + ); + + let message = MessageWithFds::new(JsonRpcMessage::Request(request), vec![]); + self.sender + .send(message) + .await + .map_err(|e| HelperError::Ipc(format!("failed to send request: {}", e)))?; + + // Receive response + let response = self + .receiver + .receive() + .await + .map_err(|e| HelperError::Ipc(format!("failed to receive response: {}", e)))?; + + match response.message { + JsonRpcMessage::Response(resp) => { + if let Some(error) = resp.error { + return Err(HelperError::RpcError { + code: error.code(), + message: error.message().to_string(), + }); + } + + // The fd should be in the response + if response.file_descriptors.is_empty() { + return Err(HelperError::Ipc( + "response missing file 
descriptor".to_string(), + )); + } + + Ok(response.file_descriptors.into_iter().next().unwrap()) + } + other => Err(HelperError::Ipc(format!( + "unexpected message type: {:?}", + other + ))), + } + } + + /// Shutdown the helper process gracefully. + pub async fn shutdown(mut self) -> std::result::Result<(), HelperError> { + let id = self.next_id; + + let request = JsonRpcRequest::new( + methods::SHUTDOWN.to_string(), + None, + serde_json::Value::Number(id.into()), + ); + + let message = MessageWithFds::new(JsonRpcMessage::Request(request), vec![]); + // Ignore send errors - the child may have already exited + let _ = self.sender.send(message).await; + + // Wait for the child to exit + let _ = self.child.wait(); + + Ok(()) + } + + /// List images in storage via the helper. + pub async fn list_images( + &mut self, + storage_path: &str, + ) -> std::result::Result, HelperError> { + let params = ListImagesParams { + storage_path: storage_path.to_string(), + }; + let result: ListImagesResult = self.call(methods::LIST_IMAGES, ¶ms).await?; + Ok(result.images) + } + + /// Get image information via the helper. + pub async fn get_image( + &mut self, + storage_path: &str, + image_ref: &str, + ) -> std::result::Result { + let params = GetImageParams { + storage_path: storage_path.to_string(), + image_ref: image_ref.to_string(), + }; + self.call(methods::GET_IMAGE, ¶ms).await + } + + /// Start streaming a layer's tar-split content. + /// + /// Returns a stream that yields `ProxiedTarSplitItem`s. The helper sends + /// notifications with file descriptors for each file in the layer. 
+ pub async fn stream_layer( + &mut self, + storage_path: &str, + layer_id: &str, + ) -> std::result::Result, HelperError> { + let params = StreamLayerParams { + storage_path: storage_path.to_string(), + layer_id: layer_id.to_string(), + }; + + let id = self.next_id; + self.next_id += 1; + + let request = JsonRpcRequest::new( + methods::STREAM_LAYER.to_string(), + Some(serde_json::to_value(¶ms).unwrap()), + serde_json::Value::Number(id.into()), + ); + + let message = MessageWithFds::new(JsonRpcMessage::Request(request), vec![]); + self.sender + .send(message) + .await + .map_err(|e| HelperError::Ipc(format!("failed to send stream_layer request: {}", e)))?; + + Ok(ProxiedLayerStream { + receiver: &mut self.receiver, + request_id: id, + finished: false, + }) + } + + /// Make an RPC call and parse the response. + async fn call Deserialize<'de>>( + &mut self, + method: &str, + params: &P, + ) -> std::result::Result { + let id = self.next_id; + self.next_id += 1; + + let request = JsonRpcRequest::new( + method.to_string(), + Some(serde_json::to_value(params).unwrap()), + serde_json::Value::Number(id.into()), + ); + + let message = MessageWithFds::new(JsonRpcMessage::Request(request), vec![]); + self.sender + .send(message) + .await + .map_err(|e| HelperError::Ipc(format!("failed to send request: {}", e)))?; + + // Receive response + let response = self + .receiver + .receive() + .await + .map_err(|e| HelperError::Ipc(format!("failed to receive response: {}", e)))?; + + match response.message { + JsonRpcMessage::Response(resp) => { + if let Some(error) = resp.error { + return Err(HelperError::RpcError { + code: error.code(), + message: error.message().to_string(), + }); + } + + let result = resp + .result + .ok_or_else(|| HelperError::Ipc("response missing result".to_string()))?; + + serde_json::from_value(result) + .map_err(|e| HelperError::Ipc(format!("failed to parse result: {}", e))) + } + other => Err(HelperError::Ipc(format!( + "unexpected message type: {:?}", + 
other + ))), + } + } +} + +/// Item received from a proxied layer stream. +#[derive(Debug)] +pub enum ProxiedTarSplitItem { + /// Raw segment bytes (tar header/padding). + Segment(Vec), + /// File content with metadata and fd. + FileContent { + /// File descriptor for the content. + fd: OwnedFd, + /// File size. + size: u64, + /// File name/path. + name: String, + }, +} + +/// Stream of tar-split items received via the helper proxy. +pub struct ProxiedLayerStream<'a> { + receiver: &'a mut jsonrpc_fdpass::transport::Receiver, + request_id: u64, + finished: bool, +} + +impl std::fmt::Debug for ProxiedLayerStream<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ProxiedLayerStream") + .field("request_id", &self.request_id) + .field("finished", &self.finished) + .finish_non_exhaustive() + } +} + +impl<'a> ProxiedLayerStream<'a> { + /// Get the next item from the stream. + /// + /// Returns `None` when the stream is complete. + pub async fn next(&mut self) -> std::result::Result, HelperError> { + if self.finished { + return Ok(None); + } + + let msg_with_fds = match self.receiver.receive().await { + Ok(m) => m, + Err(jsonrpc_fdpass::Error::ConnectionClosed) => { + self.finished = true; + return Ok(None); + } + Err(e) => { + return Err(HelperError::Ipc(format!("failed to receive: {}", e))); + } + }; + + let mut fds = msg_with_fds.file_descriptors; + + match msg_with_fds.message { + JsonRpcMessage::Notification(notif) => { + let params = notif.params.unwrap_or(serde_json::Value::Null); + + match notif.method.as_str() { + "stream.segment" => { + let seg: StreamSegmentNotification = serde_json::from_value(params) + .map_err(|e| { + HelperError::Ipc(format!("invalid segment params: {}", e)) + })?; + + let bytes = BASE64_STANDARD.decode(&seg.data).map_err(|e| { + HelperError::Ipc(format!("failed to decode segment: {}", e)) + })?; + + Ok(Some(ProxiedTarSplitItem::Segment(bytes))) + } + "stream.file" => { + let file: 
StreamFileNotification = serde_json::from_value(params) + .map_err(|e| HelperError::Ipc(format!("invalid file params: {}", e)))?; + + if fds.is_empty() { + return Err(HelperError::Ipc( + "file notification missing fd".to_string(), + )); + } + + let fd = fds.remove(0); + Ok(Some(ProxiedTarSplitItem::FileContent { + fd, + size: file.size, + name: file.name, + })) + } + other => Err(HelperError::Ipc(format!( + "unknown notification method: {}", + other + ))), + } + } + JsonRpcMessage::Response(resp) => { + // Final response - stream is complete + self.finished = true; + + if let Some(error) = resp.error { + return Err(HelperError::RpcError { + code: error.code(), + message: error.message().to_string(), + }); + } + + Ok(None) + } + JsonRpcMessage::Request(_) => Err(HelperError::Ipc( + "unexpected request from helper".to_string(), + )), + } + } +} + +impl Drop for StorageProxy { + fn drop(&mut self) { + // Try to kill the child if it's still running + let _ = self.child.kill(); + } +} From c49aa6106dc4806a9fa282a4726f707371a24e7a Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Tue, 7 Apr 2026 12:05:29 -0400 Subject: [PATCH 05/10] repo: Add ensure_object_from_file with reflink/hardlink/copy fallback Add ObjectStoreMethod::Reflinked/Hardlinked variants and a new ImportContext that caches per-(src_dev, dst_dev) reflink support across bulk import operations. ensure_object_from_file() tries FICLONE first, falls back to hardlink (linking the source file directly into the objects directory after enabling fs-verity on it), and finally falls back to a regular data copy. This avoids data copying when importing from containers-storage on filesystems that support reflinks (btrfs, XFS) or even on ext4 via hardlinks. 
Assisted-by: OpenCode (Claude Opus 4) Signed-off-by: Colin Walters --- crates/composefs/src/repository.rs | 299 ++++++++++++++++++++++++++++- 1 file changed, 296 insertions(+), 3 deletions(-) diff --git a/crates/composefs/src/repository.rs b/crates/composefs/src/repository.rs index 2930cfcb..0787ad36 100644 --- a/crates/composefs/src/repository.rs +++ b/crates/composefs/src/repository.rs @@ -523,16 +523,57 @@ pub(crate) fn write_repo_metadata( /// How an object was stored in the repository. /// -/// Returned by [`Repository::ensure_object_from_file_with_stats`] to indicate -/// whether the operation used a regular copy or found an existing object. +/// Returned by [`Repository::ensure_object_from_file`] to indicate +/// whether the operation used zero-copy reflinks, a regular copy, or found +/// an existing object. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ObjectStoreMethod { - /// Object was stored via regular file copy. + /// Object was stored via reflink (zero-copy, FICLONE ioctl). + Reflinked, + /// Object was stored via hardlink (zero-copy, source file linked directly). + Hardlinked, + /// Object was stored via regular file copy (reflink not supported). Copied, /// Object already existed in the repository (deduplicated). AlreadyPresent, } +/// Per-operation context for [`Repository::ensure_object_from_file`]. +/// +/// Create one of these at the start of a bulk import operation (e.g. importing +/// all layers of a container image) and pass it to every +/// `ensure_object_from_file` call. The context caches which +/// `(source_device, dest_device)` pairs do not support reflinks, so that +/// after the first `EOPNOTSUPP` / `EXDEV` on a given pair, subsequent +/// calls skip the FICLONE probe entirely. +/// +/// This correctly handles multi-store imports where layers may come from +/// different filesystems: a reflink failure on ext4→xfs does not suppress +/// reflink attempts for a later xfs→xfs pair. 
+#[derive(Debug, Default)] +pub struct ImportContext { + /// Device-ID pairs where FICLONE has already failed. Stored as + /// `(source_dev, dest_dev)` from `fstat().st_dev`. + reflink_unsupported_devs: Vec<(u64, u64)>, +} + +impl ImportContext { + /// Check whether reflinks are known to be unsupported for this + /// source→destination device pair. + pub(crate) fn is_reflink_unsupported(&self, src_dev: u64, dst_dev: u64) -> bool { + self.reflink_unsupported_devs + .iter() + .any(|&(s, d)| s == src_dev && d == dst_dev) + } + + /// Record that reflinks are unsupported for this device pair. + pub(crate) fn mark_reflink_unsupported(&mut self, src_dev: u64, dst_dev: u64) { + if !self.is_reflink_unsupported(src_dev, dst_dev) { + self.reflink_unsupported_devs.push((src_dev, dst_dev)); + } + } +} + /// Call openat() on the named subdirectory of "dirfd", possibly creating it first. /// /// We assume that the directory will probably exist (ie: we try the open first), and on ENOENT, we @@ -1133,6 +1174,201 @@ impl Repository { Ok(fd) } + /// Ensure an object exists by reflinking or hardlinking from a source file. + /// + /// The fallback chain is: reflink -> hardlink -> copy. + /// + /// - **Reflink** (FICLONE): zero-copy clone on btrfs/XFS. Uses a tmpfile. + /// - **Hardlink**: enables fs-verity on the source file in-place, then + /// hardlinks it directly into the objects directory. This avoids all data + /// copying on filesystems like ext4 that don't support reflinks. + /// - **Copy**: regular data copy into a tmpfile as last resort. + /// + /// The `ctx` argument accumulates knowledge across calls in the same + /// import operation. After the first reflink attempt fails with + /// `EOPNOTSUPP` / `EXDEV`, the context records this so that subsequent + /// calls skip straight to the hardlink path. + /// + /// This is particularly useful for importing from containers-storage where + /// we already have the file on disk and want to avoid copying data. 
+ pub fn ensure_object_from_file( + &self, + src: &std::fs::File, + size: u64, + ctx: &mut ImportContext, + ) -> Result<(ObjectID, ObjectStoreMethod)> { + self.ensure_object_from_file_inner(src, size, true, ctx) + } + + /// Like [`ensure_object_from_file`](Self::ensure_object_from_file) but + /// errors if neither reflink nor hardlink succeeds, instead of falling back + /// to a regular copy. + /// + /// Intended for bootc's unified storage path where the composefs repo and + /// containers-storage are always on the same filesystem, so zero-copy + /// should always be possible. + pub fn ensure_object_from_file_zerocopy( + &self, + src: &std::fs::File, + size: u64, + ctx: &mut ImportContext, + ) -> Result<(ObjectID, ObjectStoreMethod)> { + self.ensure_object_from_file_inner(src, size, false, ctx) + } + + /// Inner implementation for [`ensure_object_from_file`](Self::ensure_object_from_file) and + /// [`ensure_object_from_file_zerocopy`](Self::ensure_object_from_file_zerocopy). + /// + /// When `allow_copy` is false, the copy fallback returns an error instead. + fn ensure_object_from_file_inner( + &self, + src: &std::fs::File, + size: u64, + allow_copy: bool, + ctx: &mut ImportContext, + ) -> Result<(ObjectID, ObjectStoreMethod)> { + use rustix::fs::{fstat, ioctl_ficlone}; + + let writable = self.ensure_writable_token()?; + + // Determine the source and destination device IDs so we can look up + // whether this particular filesystem pair supports reflinks. + let src_dev = fstat(src)?.st_dev; + let dst_dev = fstat(self.objects_dir()?)?.st_dev; + + // Try reflink first, unless a previous call on this device pair + // already discovered that FICLONE is unsupported. 
+ if !ctx.is_reflink_unsupported(src_dev, dst_dev) { + let tmpfile_fd = self.create_object_tmpfile_impl(&writable)?; + let tmpfile = File::from(tmpfile_fd); + + match ioctl_ficlone(&tmpfile, src) { + Ok(()) => { + // Reflink succeeded — verify size matches + let stat = fstat(&tmpfile)?; + anyhow::ensure!( + stat.st_size as u64 == size, + "Reflink size mismatch: expected {}, got {}", + size, + stat.st_size + ); + + let (object_id, method) = self.finalize_object_tmpfile(tmpfile, size)?; + let method = match method { + ObjectStoreMethod::Copied => ObjectStoreMethod::Reflinked, + other => other, + }; + return Ok((object_id, method)); + } + Err(Errno::OPNOTSUPP | Errno::XDEV) => { + // Record for this device pair so subsequent calls skip. + ctx.mark_reflink_unsupported(src_dev, dst_dev); + drop(tmpfile); + } + Err(e) => { + return Err(e).context("Reflinking source file to objects directory")?; + } + } + } + + // Try hardlink: enable verity on the source in-place, then link it + // directly into objects/. This avoids all data copying. + match self.try_hardlink_object(src, size) { + Ok(result) => return Ok(result), + Err(_) if allow_copy => { + // Hardlink failed, fall through to copy. + // Common causes: cross-mount (overlay bind mount), EPERM, + // or verity enablement failure. + } + Err(e) => { + return Err(e).context( + "reflink and hardlink both failed; copy fallback is disabled (zerocopy mode)", + ); + } + } + + // Final fallback: copy data into a new tmpfile. + let tmpfile_fd = self.create_object_tmpfile_impl(&writable)?; + let mut tmpfile = File::from(tmpfile_fd); + { + use std::io::{Seek, SeekFrom}; + let mut src_clone = src.try_clone()?; + src_clone.seek(SeekFrom::Start(0))?; + std::io::copy(&mut src_clone, &mut tmpfile)?; + } + + let (object_id, method) = self.finalize_object_tmpfile(tmpfile, size)?; + Ok((object_id, method)) + } + + /// Try to hardlink a source file directly into the objects directory. 
+ /// + /// Enables fs-verity on the source file in-place, measures the digest to + /// determine the object ID, then hardlinks the source into `objects/`. + /// + /// Returns an error if verity cannot be enabled, the digest cannot be + /// measured, or the hardlink fails (e.g. cross-device). + fn try_hardlink_object( + &self, + src: &std::fs::File, + size: u64, + ) -> Result<(ObjectID, ObjectStoreMethod)> { + use crate::fsverity::enable_verity_with_retry; + + let objects_dir = self.objects_dir()?; + + // Enable fs-verity on the source file in-place. + // This is safe because the caller (bootc/containers-storage) owns the + // source files and they are immutable image data. + // AlreadyEnabled is fine — the file was already verity-protected. + let verity_enabled = match enable_verity_with_retry::(src) { + Ok(()) => true, + Err(EnableVerityError::AlreadyEnabled) => true, + Err(EnableVerityError::FilesystemNotSupported) if self.insecure => false, + Err(e) => { + return Err(e).context("enabling verity on source file for hardlink")?; + } + }; + + // Get the object ID from the verity digest (kernel-measured or userspace-computed) + let id: ObjectID = if verity_enabled { + measure_verity(src).context("measuring verity digest on source file")? + } else { + // Insecure mode on a filesystem without verity: compute digest in userspace + let mut reader = std::io::BufReader::new( + src.try_clone() + .context("cloning fd for digest computation")?, + ); + Self::compute_verity_digest(&mut reader) + .context("computing verity digest in insecure mode")? + }; + + // Check if object already exists (dedup) + let path = id.to_object_pathname(); + match statat(objects_dir, &path, AtFlags::empty()) { + Ok(stat) if stat.st_size as u64 == size => { + return Ok((id, ObjectStoreMethod::AlreadyPresent)); + } + _ => {} + } + + // Ensure parent directory exists (e.g. 
objects/4e/) + let parent_dir = id.to_object_dir(); + let _ = mkdirat(objects_dir, &parent_dir, Mode::from_raw_mode(0o755)); + + // Hardlink the source file directly into objects/. + // Use AT_EMPTY_PATH to link by fd, which avoids the kernel's + // may_linkat() restriction that rejects AT_SYMLINK_FOLLOW on + // /proc/self/fd/ magic symlinks for non-root mounts. + // AT_EMPTY_PATH requires CAP_DAC_READ_SEARCH, which is available + // when running as root (the expected case for containers-storage). + match linkat(src, "", objects_dir, &path, AtFlags::EMPTY_PATH) { + Ok(()) => Ok((id, ObjectStoreMethod::Hardlinked)), + Err(Errno::EXIST) => Ok((id, ObjectStoreMethod::AlreadyPresent)), + Err(e) => Err(e).context("hardlinking source file into objects directory")?, + } + } + /// Finalize a tmpfile as an object. /// /// This method should be called from a blocking context (e.g., `spawn_blocking`) @@ -3717,6 +3953,39 @@ mod tests { Ok(()) } + #[test] + fn test_ensure_object_from_file() -> Result<()> { + use std::io::{Seek, SeekFrom, Write}; + + let tmp = tempdir(); + let repo = create_test_repo(&tmp.path().join("repo"))?; + let mut ctx = ImportContext::default(); + + let test_data = generate_test_data(64 * 1024, 0xBE); + let mut temp_file = crate::test::tempfile(); + temp_file.write_all(&test_data)?; + temp_file.seek(SeekFrom::Start(0))?; + + // First store should return Copied or Reflinked (depending on fs) + let (object_id, method) = + repo.ensure_object_from_file(&temp_file, test_data.len() as u64, &mut ctx)?; + assert_ne!(method, ObjectStoreMethod::AlreadyPresent); + assert!(test_object_exists(&tmp, &object_id)?); + + // Read back and verify contents match + let stored_data = repo.read_object(&object_id)?; + assert_eq!(stored_data, test_data); + + // Second store of same data should return AlreadyPresent + temp_file.seek(SeekFrom::Start(0))?; + let (object_id_2, method_2) = + repo.ensure_object_from_file(&temp_file, test_data.len() as u64, &mut ctx)?; + 
assert_eq!(object_id, object_id_2); + assert_eq!(method_2, ObjectStoreMethod::AlreadyPresent); + + Ok(()) + } + // ==================== Fsck Tests ==================== #[tokio::test] @@ -4451,4 +4720,28 @@ mod tests { FeatureCheck::ReadWrite ); } + + #[test] + fn test_object_store_method_variants() { + // Verify all variants exist and are distinct + let methods = [ + ObjectStoreMethod::Reflinked, + ObjectStoreMethod::Hardlinked, + ObjectStoreMethod::Copied, + ObjectStoreMethod::AlreadyPresent, + ]; + + for (i, a) in methods.iter().enumerate() { + for (j, b) in methods.iter().enumerate() { + if i == j { + assert_eq!(a, b); + } else { + assert_ne!(a, b); + } + } + } + + // Verify Debug impl works + assert_eq!(format!("{:?}", ObjectStoreMethod::Hardlinked), "Hardlinked"); + } } From 4ae70921bc28eec82333a228b852ce0dc8d16a7c Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Mon, 13 Apr 2026 14:08:43 -0400 Subject: [PATCH 06/10] repo: Stop silently ignoring mkdirat errors in object storage The two mkdirat calls in try_hardlink_object and link_tmpfile_as_object were discarding all errors with `let _ =`. If mkdirat failed for a reason other than EEXIST (e.g. permission denied, read-only filesystem), the subsequent linkat would fail with a confusing ENOENT. Add an ensure_dir_at helper that propagates all errors except EEXIST, matching the pattern already used in ensure_dir_and_openat. Assisted-by: OpenCode (Claude Opus 4) Signed-off-by: Colin Walters --- crates/composefs/src/repository.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/crates/composefs/src/repository.rs b/crates/composefs/src/repository.rs index 0787ad36..ff3bdff7 100644 --- a/crates/composefs/src/repository.rs +++ b/crates/composefs/src/repository.rs @@ -599,6 +599,17 @@ fn ensure_dir_and_openat(dirfd: impl AsFd, filename: &str, flags: OFlags) -> Err } } +/// Create a directory under `dirfd` if it doesn't already exist. 
+/// +/// Returns `Ok(())` on success or if the directory already exists. +/// Propagates all other errors from `mkdirat`. +fn ensure_dir_at(dirfd: impl AsFd, path: &str, mode: Mode) -> ErrnoResult<()> { + match mkdirat(dirfd, path, mode) { + Ok(()) | Err(Errno::EXIST) => Ok(()), + Err(e) => Err(e), + } +} + /// A zero-sized proof token confirming that a [`Repository`] is writable. /// /// Obtained by calling [`Repository::ensure_writable`], which performs a fast @@ -1354,7 +1365,8 @@ impl Repository { // Ensure parent directory exists (e.g. objects/4e/) let parent_dir = id.to_object_dir(); - let _ = mkdirat(objects_dir, &parent_dir, Mode::from_raw_mode(0o755)); + ensure_dir_at(objects_dir, &parent_dir, Mode::from_raw_mode(0o755)) + .context("creating object parent directory")?; // Hardlink the source file directly into objects/. // Use AT_EMPTY_PATH to link by fd, which avoids the kernel's @@ -1458,7 +1470,8 @@ impl Repository { } let parent_dir = id.to_object_dir(); - let _ = mkdirat(objects_dir, &parent_dir, Mode::from_raw_mode(0o755)); + ensure_dir_at(objects_dir, &parent_dir, Mode::from_raw_mode(0o755)) + .context("creating object parent directory")?; match linkat( CWD, From dddf0f0e1337d38a6ad97cdb6bd0cf901b103b53 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Tue, 7 Apr 2026 12:07:58 -0400 Subject: [PATCH 07/10] oci: Expand ImportStats for zero-copy tracking and add PullOptions Extend ImportStats with reflink/hardlink counters and byte totals, plus layer-level tracking (layers, layers_already_present). The Display impl now shows a detailed breakdown when zero-copy methods were used while preserving the existing compact format for copy-only imports. Add PullOptions struct to the pull() signature, preparing for the containers-storage import path which needs extra knobs (zerocopy mode, explicit storage root, additional image stores). Visibility changes (pub(crate) on helpers, pub on ContentAndVerity) prepare for the cstor module to reuse these internals. 
Assisted-by: OpenCode (Claude Opus 4) Signed-off-by: Colin Walters --- crates/composefs-oci/src/lib.rs | 206 ++++++++++++++++++++++++++--- crates/composefs-oci/src/skopeo.rs | 8 ++ 2 files changed, 197 insertions(+), 17 deletions(-) diff --git a/crates/composefs-oci/src/lib.rs b/crates/composefs-oci/src/lib.rs index 54de6ecc..9e5a9568 100644 --- a/crates/composefs-oci/src/lib.rs +++ b/crates/composefs-oci/src/lib.rs @@ -8,6 +8,8 @@ //! - Pulling container images from registries using skopeo //! - Converting OCI image layers from tar format to composefs split streams //! - Creating mountable filesystems from OCI image configurations +//! - Importing from containers-storage with zero-copy reflinks (optional feature) + #![forbid(unsafe_code)] pub mod boot; @@ -67,27 +69,55 @@ pub use skopeo::pull_image; /// Statistics from an image import operation. #[derive(Debug, Clone, Default)] pub struct ImportStats { - /// Number of objects stored via copy. + /// Number of layers in the image. + pub layers: u64, + /// Number of layers that were already present (skipped). + pub layers_already_present: u64, + /// Number of objects stored via regular copy. pub objects_copied: u64, + /// Number of objects stored via reflink (zero-copy). + pub objects_reflinked: u64, + /// Number of objects stored via hardlink (zero-copy). + pub objects_hardlinked: u64, /// Number of objects that already existed (deduplicated). pub objects_already_present: u64, - /// Total bytes stored as new objects. + /// Total bytes stored via regular copy. pub bytes_copied: u64, + /// Total bytes stored via reflink. + pub bytes_reflinked: u64, + /// Total bytes stored via hardlink. + pub bytes_hardlinked: u64, /// Total bytes inlined in splitstreams (small files + headers). pub bytes_inlined: u64, } impl ImportStats { + /// Total number of new objects stored (copied + reflinked + hardlinked). 
+ pub fn new_objects(&self) -> u64 { + self.objects_copied + self.objects_reflinked + self.objects_hardlinked + } + /// Total number of objects processed (new + already present). pub fn total_objects(&self) -> u64 { - self.objects_copied + self.objects_already_present + self.new_objects() + self.objects_already_present + } + + /// Total bytes stored as new objects (copied + reflinked + hardlinked). + pub fn new_bytes(&self) -> u64 { + self.bytes_copied + self.bytes_reflinked + self.bytes_hardlinked } /// Merge another `ImportStats` into this one. pub fn merge(&mut self, other: &ImportStats) { + self.layers += other.layers; + self.layers_already_present += other.layers_already_present; self.objects_copied += other.objects_copied; + self.objects_reflinked += other.objects_reflinked; + self.objects_hardlinked += other.objects_hardlinked; self.objects_already_present += other.objects_already_present; self.bytes_copied += other.bytes_copied; + self.bytes_reflinked += other.bytes_reflinked; + self.bytes_hardlinked += other.bytes_hardlinked; self.bytes_inlined += other.bytes_inlined; } @@ -103,6 +133,14 @@ impl ImportStats { stats.objects_copied += 1; stats.bytes_copied += size; } + ObjectStoreMethod::Reflinked => { + stats.objects_reflinked += 1; + stats.bytes_reflinked += size; + } + ObjectStoreMethod::Hardlinked => { + stats.objects_hardlinked += 1; + stats.bytes_hardlinked += size; + } ObjectStoreMethod::AlreadyPresent => { stats.objects_already_present += 1; } @@ -114,17 +152,83 @@ impl ImportStats { impl std::fmt::Display for ImportStats { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{} new + {} already present objects; {} stored, {} inlined", - self.objects_copied, - self.objects_already_present, - indicatif::HumanBytes(self.bytes_copied), - indicatif::HumanBytes(self.bytes_inlined), - ) + let has_zerocopy = self.objects_reflinked > 0 || self.objects_hardlinked > 0; + if has_zerocopy { + // Show detailed breakdown when 
zero-copy methods were used + let mut parts = Vec::new(); + if self.objects_reflinked > 0 { + parts.push(format!("{} reflinked", self.objects_reflinked)); + } + if self.objects_hardlinked > 0 { + parts.push(format!("{} hardlinked", self.objects_hardlinked)); + } + parts.push(format!("{} copied", self.objects_copied)); + parts.push(format!("{} already present", self.objects_already_present)); + write!(f, "{} objects; ", parts.join(" + "))?; + + let mut byte_parts = Vec::new(); + if self.objects_reflinked > 0 { + byte_parts.push(format!( + "{} reflinked", + indicatif::HumanBytes(self.bytes_reflinked) + )); + } + if self.objects_hardlinked > 0 { + byte_parts.push(format!( + "{} hardlinked", + indicatif::HumanBytes(self.bytes_hardlinked) + )); + } + byte_parts.push(format!( + "{} copied", + indicatif::HumanBytes(self.bytes_copied) + )); + byte_parts.push(format!( + "{} inlined", + indicatif::HumanBytes(self.bytes_inlined) + )); + write!(f, "{}", byte_parts.join(", ")) + } else { + write!( + f, + "{} new + {} already present objects; {} stored, {} inlined", + self.objects_copied, + self.objects_already_present, + indicatif::HumanBytes(self.bytes_copied), + indicatif::HumanBytes(self.bytes_inlined), + ) + } } } +/// Options for a [`pull`] operation. +/// +/// Use `Default::default()` for the common case (skopeo transport, no +/// zero-copy requirement). +#[derive(Debug, Default)] +pub struct PullOptions<'a> { + /// Image proxy configuration passed to skopeo (ignored for + /// `containers-storage:` references). + pub img_proxy_config: Option, + + /// If `true`, the containers-storage import path will error instead of + /// falling back to a data copy when neither reflink nor hardlink succeeds. + /// Intended for bootc's unified storage layout where the composefs repo + /// and containers-storage are always on the same filesystem. + pub zerocopy: bool, + + /// Explicit containers-storage root. 
When set, auto-discovery is skipped + /// and only this path (plus any `additional_image_stores`) is searched. + /// Only relevant for `containers-storage:` references. + pub storage_root: Option<&'a std::path::Path>, + + /// Additional read-only image stores to search beyond the primary + /// (auto-discovered or explicit) store. Equivalent to the + /// `additionalimagestore=` option in containers/storage. + /// Only relevant for `containers-storage:` references. + pub additional_image_stores: &'a [&'a std::path::Path], +} + /// Result of a pull operation. #[derive(Debug)] pub struct PullResult { @@ -136,7 +240,8 @@ pub struct PullResult { pub stats: ImportStats, } -type ContentAndVerity = (OciDigest, ObjectID); +/// A tuple of (content digest, fs-verity ObjectID). +pub type ContentAndVerity = (OciDigest, ObjectID); /// Parsed OCI config and its associated references. pub struct OpenConfig { @@ -160,11 +265,11 @@ impl std::fmt::Debug for OpenConfig { } } -fn layer_identifier(diff_id: &OciDigest) -> String { +pub(crate) fn layer_identifier(diff_id: &OciDigest) -> String { format!("oci-layer-{diff_id}") } -fn config_identifier(config: &OciDigest) -> String { +pub(crate) fn config_identifier(config: &OciDigest) -> String { format!("oci-config-{config}") } @@ -223,14 +328,21 @@ pub fn ls_layer( /// Pull the target image, and add the provided tag. If this is a mountable /// image (i.e. not an artifact), it is *not* unpacked by default. +/// +/// When the `containers-storage` feature is enabled and the image reference +/// starts with `containers-storage:`, this uses the native cstor import path +/// which supports zero-copy reflinks/hardlinks. Otherwise, it uses skopeo. +/// +/// See [`PullOptions`] for tunable knobs (zero-copy mode, extra storage +/// roots, image proxy configuration). 
pub async fn pull( repo: &Arc>, imgref: &str, reference: Option<&str>, - img_proxy_config: Option, + opts: PullOptions<'_>, ) -> Result> { let (config_digest, config_verity, stats) = - skopeo::pull(repo, imgref, reference, img_proxy_config).await?; + skopeo::pull(repo, imgref, reference, opts.img_proxy_config).await?; Ok(crate::PullResult { config_digest, config_verity, @@ -1075,16 +1187,77 @@ mod test { #[test] fn test_import_stats_display() { + // Copy-only stats (no reflinks) let stats = ImportStats { objects_copied: 42, objects_already_present: 100, bytes_copied: 1_500_000, bytes_inlined: 800, + ..Default::default() }; assert_eq!( stats.to_string(), "42 new + 100 already present objects; 1.43 MiB stored, 800 B inlined" ); + assert_eq!(stats.total_objects(), 142); + assert_eq!(stats.new_objects(), 42); + assert_eq!(stats.new_bytes(), 1_500_000); + + // Stats with reflinks + let reflink_stats = ImportStats { + objects_reflinked: 30, + objects_copied: 12, + objects_already_present: 100, + bytes_reflinked: 1_000_000, + bytes_copied: 500_000, + bytes_inlined: 800, + ..Default::default() + }; + assert_eq!( + reflink_stats.to_string(), + "30 reflinked + 12 copied + 100 already present objects; 976.56 KiB reflinked, 488.28 KiB copied, 800 B inlined" + ); + assert_eq!(reflink_stats.total_objects(), 142); + assert_eq!(reflink_stats.new_objects(), 42); + assert_eq!(reflink_stats.new_bytes(), 1_500_000); + + // Stats with hardlinks only + let hardlink_stats = ImportStats { + objects_hardlinked: 20, + objects_copied: 5, + objects_already_present: 50, + bytes_hardlinked: 800_000, + bytes_copied: 200_000, + bytes_inlined: 400, + ..Default::default() + }; + assert_eq!( + hardlink_stats.to_string(), + "20 hardlinked + 5 copied + 50 already present objects; 781.25 KiB hardlinked, 195.31 KiB copied, 400 B inlined" + ); + assert_eq!(hardlink_stats.total_objects(), 75); + assert_eq!(hardlink_stats.new_objects(), 25); + assert_eq!(hardlink_stats.new_bytes(), 1_000_000); + + // Stats 
with both reflinks and hardlinks + let mixed_stats = ImportStats { + objects_reflinked: 10, + objects_hardlinked: 15, + objects_copied: 5, + objects_already_present: 70, + bytes_reflinked: 500_000, + bytes_hardlinked: 750_000, + bytes_copied: 250_000, + bytes_inlined: 600, + ..Default::default() + }; + assert_eq!( + mixed_stats.to_string(), + "10 reflinked + 15 hardlinked + 5 copied + 70 already present objects; 488.28 KiB reflinked, 732.42 KiB hardlinked, 244.14 KiB copied, 600 B inlined" + ); + assert_eq!(mixed_stats.total_objects(), 100); + assert_eq!(mixed_stats.new_objects(), 30); + assert_eq!(mixed_stats.new_bytes(), 1_500_000); let empty = ImportStats::default(); assert_eq!( @@ -1092,6 +1265,5 @@ mod test { "0 new + 0 already present objects; 0 B stored, 0 B inlined" ); assert_eq!(empty.total_objects(), 0); - assert_eq!(stats.total_objects(), 142); } } diff --git a/crates/composefs-oci/src/skopeo.rs b/crates/composefs-oci/src/skopeo.rs index 8827f453..977f0a60 100644 --- a/crates/composefs-oci/src/skopeo.rs +++ b/crates/composefs-oci/src/skopeo.rs @@ -242,6 +242,14 @@ impl ImageOp { stats.objects_copied += 1; stats.bytes_copied += size; } + ObjectStoreMethod::Reflinked => { + stats.objects_reflinked += 1; + stats.bytes_reflinked += size; + } + ObjectStoreMethod::Hardlinked => { + stats.objects_hardlinked += 1; + stats.bytes_hardlinked += size; + } ObjectStoreMethod::AlreadyPresent => { stats.objects_already_present += 1; } From 8a9bf521793bf23f2a3a24cd93d6410e3cba54ff Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Tue, 7 Apr 2026 12:08:15 -0400 Subject: [PATCH 08/10] tests: Add CLI integration tests for tag, untag, GC, and compute-id Add four new integration tests exercising existing cfsctl functionality through the CLI: - test_oci_tag_and_untag: multi-tag and selective untag workflow - test_oci_gc_removes_untagged: verifies GC collects untagged images - test_layer_tar_roundtrip: imports a layer and verifies tar extraction - test_compute_image_id: 
deterministic fs-verity image ID computation Also fix create_oci_layout to include a runtime config (ConfigBuilder) which is required for the seal/compute-id operations. Assisted-by: OpenCode (Claude Opus 4) Signed-off-by: Colin Walters --- crates/integration-tests/src/tests/cli.rs | 293 +++++++++++++++++++++- 1 file changed, 292 insertions(+), 1 deletion(-) diff --git a/crates/integration-tests/src/tests/cli.rs b/crates/integration-tests/src/tests/cli.rs index da276b94..a3dda2e9 100644 --- a/crates/integration-tests/src/tests/cli.rs +++ b/crates/integration-tests/src/tests/cli.rs @@ -215,7 +215,7 @@ integration_test!(test_oci_images_json_empty_repo); fn create_oci_layout(parent: &std::path::Path) -> Result { use cap_std_ext::cap_std; use ocidir::oci_spec::image::{ - ImageConfigurationBuilder, Platform, PlatformBuilder, RootFsBuilder, + ConfigBuilder, ImageConfigurationBuilder, Platform, PlatformBuilder, RootFsBuilder, }; let oci_dir = parent.join("oci-image"); @@ -227,6 +227,9 @@ fn create_oci_layout(parent: &std::path::Path) -> Result { // Create a new empty manifest let mut manifest = ocidir.new_empty_manifest()?.build()?; + // Create runtime config (required for seal operation) + let runtime_config = ConfigBuilder::default().build()?; + // Create config with architecture and OS let rootfs = RootFsBuilder::default() .typ("layers") @@ -236,6 +239,7 @@ fn create_oci_layout(parent: &std::path::Path) -> Result { .architecture("amd64") .os("linux") .rootfs(rootfs) + .config(runtime_config) .build()?; // Create a simple layer with a usr/ directory and one file @@ -1397,3 +1401,290 @@ fn test_init_reset_metadata_changes_algorithm() -> Result<()> { Ok(()) } integration_test!(test_init_reset_metadata_changes_algorithm); + +/// Test tagging and untagging OCI images. 
+/// +/// Verifies that: +/// - An image can be tagged with multiple names +/// - Tags appear in `oci images` output +/// - Tags can be removed with `oci untag` +/// - Untagging one name doesn't affect other tags +fn test_oci_tag_and_untag() -> Result<()> { + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + let repo_dir = init_insecure_repo(&sh, &cfsctl)?; + let repo = repo_dir.path(); + let fixture_dir = tempfile::tempdir()?; + let oci_layout = create_oci_layout(fixture_dir.path())?; + + // Pull and tag with first name + let pull_output = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci pull oci:{oci_layout} myimage:v1" + ) + .read()?; + + // Extract manifest digest from pull output (e.g., "manifest sha256:abc...") + let manifest_digest = pull_output + .lines() + .find(|line| line.contains("manifest sha256:")) + .and_then(|line| line.split_whitespace().find(|s| s.starts_with("sha256:"))) + .expect("expected manifest digest in pull output"); + + // Add a second tag using the manifest digest + cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci tag {manifest_digest} myimage:latest" + ) + .read()?; + + // Both tags should appear in list + let list_output = cmd!(sh, "{cfsctl} --insecure --repo {repo} oci images --json").read()?; + let images: serde_json::Value = serde_json::from_str(&list_output)?; + let names: Vec<&str> = images + .as_array() + .unwrap() + .iter() + .map(|img| img["name"].as_str().unwrap()) + .collect(); + assert!(names.contains(&"myimage:v1"), "expected myimage:v1 in list"); + assert!( + names.contains(&"myimage:latest"), + "expected myimage:latest in list" + ); + + // Remove one tag + cmd!(sh, "{cfsctl} --insecure --repo {repo} oci untag myimage:v1").read()?; + + // Only the remaining tag should appear + let list_output = cmd!(sh, "{cfsctl} --insecure --repo {repo} oci images --json").read()?; + let images: serde_json::Value = serde_json::from_str(&list_output)?; + let names: Vec<&str> = images + .as_array() + .unwrap() + .iter() + 
.map(|img| img["name"].as_str().unwrap()) + .collect(); + assert!( + !names.contains(&"myimage:v1"), + "myimage:v1 should be removed" + ); + assert!( + names.contains(&"myimage:latest"), + "myimage:latest should still exist" + ); + + Ok(()) +} +integration_test!(test_oci_tag_and_untag); + +/// Test that GC removes untagged OCI images. +/// +/// Verifies that: +/// - After untagging all references, GC collects the image +/// - Objects are actually removed from the repository +fn test_oci_gc_removes_untagged() -> Result<()> { + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + let repo_dir = init_insecure_repo(&sh, &cfsctl)?; + let repo = repo_dir.path(); + let fixture_dir = tempfile::tempdir()?; + let oci_layout = create_oci_layout(fixture_dir.path())?; + + // Pull an image + cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci pull oci:{oci_layout} test-image" + ) + .read()?; + + // Verify it exists + let list_before = cmd!(sh, "{cfsctl} --insecure --repo {repo} oci images --json").read()?; + let images_before: Vec = serde_json::from_str(&list_before)?; + assert_eq!(images_before.len(), 1, "expected 1 image before untag"); + + // Untag it + cmd!(sh, "{cfsctl} --insecure --repo {repo} oci untag test-image").read()?; + + // Run GC + let gc_output = cmd!(sh, "{cfsctl} --insecure --repo {repo} gc").read()?; + assert!( + gc_output.contains("removed"), + "expected GC to report removed objects: {gc_output}" + ); + + // Verify image is gone + let list_after = cmd!(sh, "{cfsctl} --insecure --repo {repo} oci images --json").read()?; + let images_after: Vec = serde_json::from_str(&list_after)?; + assert!( + images_after.is_empty(), + "expected no images after GC, got: {:?}", + images_after + ); + + // Verify objects were actually removed (streams dir should be mostly empty) + let streams_dir = repo.join("streams"); + let stream_count = if streams_dir.exists() { + std::fs::read_dir(&streams_dir)? 
+ .filter(|e| e.as_ref().map(|e| e.file_name() != "refs").unwrap_or(false)) + .count() + } else { + 0 + }; + assert_eq!( + stream_count, 0, + "expected no non-ref streams after GC, got {}", + stream_count + ); + + Ok(()) +} +integration_test!(test_oci_gc_removes_untagged); + +/// Test layer tar roundtrip: import a layer, extract as tar, verify integrity. +/// +/// This verifies that the splitstream storage correctly preserves tar content +/// by comparing the original tar with the reconstructed one. +fn test_layer_tar_roundtrip() -> Result<()> { + use std::io::Read; + + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + let repo_dir = init_insecure_repo(&sh, &cfsctl)?; + let repo = repo_dir.path(); + let fixture_dir = tempfile::tempdir()?; + let oci_layout = create_oci_layout(fixture_dir.path())?; + + // Pull the image + cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci pull oci:{oci_layout} test-image" + ) + .read()?; + + // Get the layer diff_id + let config_output = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci inspect test-image --config" + ) + .read()?; + let config: serde_json::Value = serde_json::from_str(&config_output)?; + let layer_id = config["rootfs"]["diff_ids"][0] + .as_str() + .expect("expected layer diff_id"); + + // Extract the layer as tar + let tar_output = cmd!(sh, "{cfsctl} --insecure --repo {repo} oci layer {layer_id}").output()?; + assert!(tar_output.status.success(), "layer extraction failed"); + + // Parse the tar and collect file entries + let mut archive = tar::Archive::new(tar_output.stdout.as_slice()); + let mut entries: Vec<(String, Vec)> = Vec::new(); + + for entry in archive.entries()? 
{ + let mut entry = entry?; + let path = entry.path()?.to_string_lossy().to_string(); + let mut content = Vec::new(); + entry.read_to_end(&mut content)?; + entries.push((path, content)); + } + + // Verify we got the expected files (usr/ directory and hello.txt) + assert_eq!( + entries.len(), + 2, + "expected 2 entries in layer (usr/ and hello.txt)" + ); + + // Find hello.txt and verify content + let hello_entry = entries + .iter() + .find(|(path, _)| path == "hello.txt") + .expect("expected hello.txt in layer"); + assert_eq!( + hello_entry.1, b"hello from test layer\n", + "hello.txt content mismatch" + ); + + // Verify usr/ directory exists + assert!( + entries + .iter() + .any(|(path, _)| path == "usr" || path == "usr/"), + "expected usr/ directory in layer" + ); + + Ok(()) +} +integration_test!(test_layer_tar_roundtrip); + +/// Test computing the composefs image ID for an OCI image. +/// +/// This verifies that we can compute the filesystem verity hash for an image, +/// which is the prerequisite for sealing and mounting. 
+fn test_compute_image_id() -> Result<()> { + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + let repo_dir = init_insecure_repo(&sh, &cfsctl)?; + let repo = repo_dir.path(); + let fixture_dir = tempfile::tempdir()?; + let oci_layout = create_oci_layout(fixture_dir.path())?; + + // Pull an image + cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci pull oci:{oci_layout} test-image" + ) + .read()?; + + // Get the config digest from inspect output + let inspect_output = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci inspect test-image" + ) + .read()?; + let inspect: serde_json::Value = serde_json::from_str(&inspect_output)?; + let config_digest = inspect["manifest"]["config"]["digest"] + .as_str() + .expect("expected config digest"); + // Digests must be prefixed with '@' to distinguish from named refs + let config_ref = format!("@{config_digest}"); + + // Compute the image ID + let compute_output = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci compute-id {config_ref}" + ) + .read()?; + + // The output should be a valid hex digest + // composefs uses SHA-256 fs-verity which produces 64 hex chars + // (but the underlying digest could be longer in some configurations) + let image_id = compute_output.trim(); + assert!( + image_id.len() >= 64, + "image ID should be at least 64 hex chars, got {} chars: {}", + image_id.len(), + image_id + ); + assert!( + image_id.chars().all(|c| c.is_ascii_hexdigit()), + "image ID should be hex, got: {}", + image_id + ); + + // Computing the same image should produce the same ID (deterministic) + let compute_output2 = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci compute-id {config_ref}" + ) + .read()?; + assert_eq!( + image_id, + compute_output2.trim(), + "compute-id should be deterministic" + ); + + Ok(()) +} +integration_test!(test_compute_image_id); From f24e5e5abbe24cd345398b3be391d65d994aa842 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Tue, 7 Apr 2026 12:08:26 -0400 Subject: [PATCH 09/10] tests: Add 
test dependencies and integration binary disambiguation Add podman, skopeo, and xfsprogs to test dependencies for the containers-storage integration tests coming next. Disambiguate the integration test binary name in Justfile cargo commands since the integration-tests crate will have multiple binaries (the main test runner and a cleanup helper). Assisted-by: OpenCode (Claude Opus 4) Signed-off-by: Colin Walters --- Justfile | 4 ++-- contrib/packaging/install-test-deps.sh | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Justfile b/Justfile index fade3a7f..973267ae 100644 --- a/Justfile +++ b/Justfile @@ -59,7 +59,7 @@ _test_image := if base_image =~ "debian" { "localhost/composefs-rs-test-debian:l # Run unprivileged integration tests against the cfsctl binary (no root, no VM) test-integration: build - CFSCTL_PATH=$(pwd)/target/debug/cfsctl cargo run -p integration-tests -- --skip privileged_ + CFSCTL_PATH=$(pwd)/target/debug/cfsctl cargo run -p integration-tests --bin cfsctl-integration-tests -- --skip privileged_ # Build the test container image for VM-based integration tests _integration-container-build: @@ -69,7 +69,7 @@ _integration-container-build: test-integration-vm: build _integration-container-build COMPOSEFS_TEST_IMAGE={{_test_image}} \ CFSCTL_PATH=$(pwd)/target/debug/cfsctl \ - cargo run -p integration-tests + cargo run -p integration-tests --bin cfsctl-integration-tests # Run everything: checks + full integration tests including VM ci: check test-integration-vm diff --git a/contrib/packaging/install-test-deps.sh b/contrib/packaging/install-test-deps.sh index f414dcd0..f1be65e6 100755 --- a/contrib/packaging/install-test-deps.sh +++ b/contrib/packaging/install-test-deps.sh @@ -11,11 +11,12 @@ set -euo pipefail case "${ID}" in centos|fedora|rhel) - pkg_install composefs openssl + pkg_install composefs openssl podman skopeo xfsprogs ;; debian|ubuntu) pkg_install \ - openssl e2fsprogs bubblewrap openssh-server + openssl e2fsprogs 
bubblewrap openssh-server \ + podman skopeo # OSTree symlink targets — /root, /home, /srv, etc. are symlinks # into /var on OSTree systems, so the target directories must exist. From 27b887b48e537206f0da49414fcc387d2fbdc713 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Mon, 13 Apr 2026 14:08:58 -0400 Subject: [PATCH 10/10] oci: Add containers-storage integration with zero-copy import Add a native containers-storage import path that bypasses skopeo's tar streaming by reading layer content directly from the overlay diff directories and using reflink/hardlink to avoid data copies. The import path is gated by a new LocalFetchOpt enum on PullOptions: - Disabled (default): fall through to skopeo like any other transport - IfPossible: use native import with reflink/hardlink/copy fallback - ZeroCopy: use native import but error if zero-copy is not possible This is exposed via `cfsctl oci pull --local-fetch disabled|auto|zerocopy`. The hardlink path enables fs-verity on source files in-place (permanent, irreversible) and requires CAP_DAC_READ_SEARCH for linkat(AT_EMPTY_PATH); try_hardlink_object now checks this upfront with a clear error message. Integration tests cover the full {ext4,xfs} x {sha256,sha512} x {auto,zerocopy} matrix using synthetic OCI images on loopback filesystems. 
Assisted-by: OpenCode (Claude Opus 4) Signed-off-by: Colin Walters --- .config/nextest.toml | 16 +- .github/workflows/ci.yml | 2 + Cargo.toml | 16 + Justfile | 31 +- crates/cfsctl/Cargo.toml | 5 +- crates/cfsctl/src/lib.rs | 46 +- crates/cfsctl/src/main.rs | 16 +- crates/composefs-oci/Cargo.toml | 5 + crates/composefs-oci/src/cstor.rs | 727 ++++++++++++++++++ crates/composefs-oci/src/image.rs | 263 +++++++ crates/composefs-oci/src/lib.rs | 324 +++++++- crates/composefs-oci/src/oci_image.rs | 2 +- crates/composefs/Cargo.toml | 2 +- crates/composefs/src/repository.rs | 13 + crates/cstorage/src/image.rs | 15 + crates/cstorage/src/tar_split.rs | 6 +- crates/cstorage/src/userns_helper.rs | 6 +- crates/integration-tests/Cargo.toml | 26 +- crates/integration-tests/src/cleanup.rs | 54 ++ crates/integration-tests/src/lib.rs | 125 +++ crates/integration-tests/src/main.rs | 9 + crates/integration-tests/src/tests/cstor.rs | 466 +++++++++++ crates/integration-tests/src/tests/mod.rs | 1 + .../integration-tests/src/tests/privileged.rs | 401 +++++++++- 24 files changed, 2525 insertions(+), 52 deletions(-) create mode 100644 crates/composefs-oci/src/cstor.rs create mode 100644 crates/integration-tests/src/cleanup.rs create mode 100644 crates/integration-tests/src/tests/cstor.rs diff --git a/.config/nextest.toml b/.config/nextest.toml index 3ca6324e..7ff301cc 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -14,6 +14,11 @@ failure-output = "immediate" success-output = "never" status-level = "pass" +# Tests that pull OCI images need more time (especially on cold CI runners) +[[profile.default.overrides]] +filter = 'test(/digest_stability|oci_pull/)' +slow-timeout = { period = "300s", terminate-after = 2 } + # Profile for integration tests — run with limited parallelism due to QEMU/KVM resources [profile.integration] test-threads = 2 @@ -30,7 +35,12 @@ path = "junit.xml" store-success-output = true store-failure-output = true -# VM tests boot an ephemeral QEMU instance 
per test, limit parallelism +# Privileged tests boot an ephemeral QEMU instance per test — limit +# parallelism to avoid OOM kills on 16 GB CI runners. +# Requiring all 2 threads effectively serialises them. +# +# NOTE: use /regex/ syntax, not ~substring — the ~ operator treats ^ $ as +# literal characters, so ~^foo never matches anything. [[profile.integration.overrides]] -filter = 'test(~^vm_)' -threads-required = 4 +filter = 'test(/^privileged_/)' +threads-required = 2 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6651b615..5e9bdfe8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -51,6 +51,7 @@ jobs: - uses: actions/checkout@v6 - uses: bootc-dev/actions/bootc-ubuntu-setup@main - uses: dtolnay/rust-toolchain@stable + - uses: taiki-e/install-action@nextest - uses: Swatinem/rust-cache@v2 - run: just test-integration @@ -114,6 +115,7 @@ jobs: libvirt: true - uses: dtolnay/rust-toolchain@stable + - uses: taiki-e/install-action@nextest - uses: Swatinem/rust-cache@v2 diff --git a/Cargo.toml b/Cargo.toml index 2c2d7ef6..d33e6c68 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,21 @@ [workspace] members = ["crates/*"] +# Exclude integration-tests from default `cargo test` — those require a +# built cfsctl binary and (for privileged tests) a VM. Run them via +# `just test-integration` or `just test-integration-vm` instead. 
+default-members = [ + "crates/cfsctl", + "crates/composefs", + "crates/composefs-boot", + "crates/composefs-fuse", + "crates/composefs-http", + "crates/composefs-ioctls", + "crates/composefs-oci", + "crates/composefs-setup-root", + "crates/cstorage", + "crates/erofs-debug", + "crates/splitfdstream", +] resolver = "2" [workspace.package] diff --git a/Justfile b/Justfile index 973267ae..57e42de4 100644 --- a/Justfile +++ b/Justfile @@ -58,18 +58,37 @@ cfsctl_features := env("COMPOSEFS_CFSCTL_FEATURES", "pre-6.15") _test_image := if base_image =~ "debian" { "localhost/composefs-rs-test-debian:latest" } else if base_image =~ "stream9" { "localhost/composefs-rs-test-c9s:latest" } else { "localhost/composefs-rs-test:latest" } # Run unprivileged integration tests against the cfsctl binary (no root, no VM) -test-integration: build - CFSCTL_PATH=$(pwd)/target/debug/cfsctl cargo run -p integration-tests --bin cfsctl-integration-tests -- --skip privileged_ +# Prefers nextest for parallelism control and better UX; falls back to direct harness. +test-integration *ARGS: build + #!/usr/bin/env bash + set -euo pipefail + export CFSCTL_PATH=$(pwd)/target/debug/cfsctl + if command -v cargo-nextest &> /dev/null; then + cargo nextest run -p integration-tests -E 'not test(/^privileged_/)' {{ ARGS }} + else + cargo test -p integration-tests --test cfsctl-integration-tests -- --skip privileged_ {{ ARGS }} + fi # Build the test container image for VM-based integration tests _integration-container-build: podman build --build-arg base_image={{base_image}} --build-arg cfsctl_features={{cfsctl_features}} -t {{_test_image}} . # Run all integration tests including privileged VM tests (requires podman + libvirt) -test-integration-vm: build _integration-container-build - COMPOSEFS_TEST_IMAGE={{_test_image}} \ - CFSCTL_PATH=$(pwd)/target/debug/cfsctl \ - cargo run -p integration-tests --bin cfsctl-integration-tests +# Uses nextest with the integration profile for parallelism control of VM tests. 
+test-integration-vm *ARGS: build _integration-container-build + #!/usr/bin/env bash + set -euo pipefail + export COMPOSEFS_TEST_IMAGE={{_test_image}} + export CFSCTL_PATH=$(pwd)/target/debug/cfsctl + if command -v cargo-nextest &> /dev/null; then + cargo nextest run -P integration -p integration-tests {{ ARGS }} + else + cargo test -p integration-tests --test cfsctl-integration-tests -- {{ ARGS }} + fi + +# Install cargo-nextest if not already installed +install-nextest: + @which cargo-nextest > /dev/null 2>&1 || cargo install cargo-nextest --locked # Run everything: checks + full integration tests including VM ci: check test-integration-vm diff --git a/crates/cfsctl/Cargo.toml b/crates/cfsctl/Cargo.toml index bfb56522..e9ca29bb 100644 --- a/crates/cfsctl/Cargo.toml +++ b/crates/cfsctl/Cargo.toml @@ -14,9 +14,10 @@ version.workspace = true path = "src/lib.rs" [features] -default = ['pre-6.15', 'oci'] +default = ['pre-6.15', 'oci', 'containers-storage'] http = ['composefs-http'] oci = ['composefs-oci'] +containers-storage = ['composefs-oci/containers-storage', 'cstorage'] rhel9 = ['composefs/rhel9'] 'pre-6.15' = ['composefs/pre-6.15'] @@ -29,8 +30,10 @@ composefs = { workspace = true } composefs-boot = { workspace = true } composefs-oci = { workspace = true, optional = true, features = ["boot"] } composefs-http = { workspace = true, optional = true } +cstorage = { path = "../cstorage", version = "0.3.0", features = ["userns-helper"], optional = true } env_logger = { version = "0.11.0", default-features = false } hex = { version = "0.4.0", default-features = false } +indicatif = { version = "0.17.0", default-features = false } rustix = { version = "1.0.0", default-features = false, features = ["fs", "process"] } serde = { version = "1.0", default-features = false, features = ["derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } diff --git a/crates/cfsctl/src/lib.rs b/crates/cfsctl/src/lib.rs index eaaf1fca..3a99800f 100644 --- 
a/crates/cfsctl/src/lib.rs +++ b/crates/cfsctl/src/lib.rs @@ -170,6 +170,30 @@ impl std::fmt::Display for OciReference { } } +/// CLI representation of [`composefs_oci::LocalFetchOpt`]. +#[cfg(feature = "oci")] +#[derive(Debug, Clone, Copy, Default, clap::ValueEnum)] +enum LocalFetchCli { + /// Do not use native containers-storage import; use skopeo. + #[default] + Disabled, + /// Use native import with reflink/hardlink/copy fallback. + Auto, + /// Use native import; error if zero-copy is not possible. + Zerocopy, +} + +#[cfg(feature = "oci")] +impl From for composefs_oci::LocalFetchOpt { + fn from(cli: LocalFetchCli) -> Self { + match cli { + LocalFetchCli::Disabled => Self::Disabled, + LocalFetchCli::Auto => Self::IfPossible, + LocalFetchCli::Zerocopy => Self::ZeroCopy, + } + } +} + /// Common options for operations using OCI config manifest streams that may transform the image rootfs #[cfg(feature = "oci")] #[derive(Debug, Parser)] @@ -226,6 +250,10 @@ enum OciCommand { /// Also generate a bootable EROFS image from the pulled OCI image #[arg(long)] bootable: bool, + /// Controls whether containers-storage: references use the native + /// import path with zero-copy reflink/hardlink support. 
+ #[arg(long, value_enum, default_value_t = LocalFetchCli::Disabled)] + local_fetch: LocalFetchCli, }, /// List all tagged OCI images in the repository #[clap(name = "images")] @@ -925,23 +953,23 @@ where ref image, name, bootable, + local_fetch, } => { // If no explicit name provided, use the image reference as the tag let tag_name = name.as_deref().unwrap_or(image); - let (result, stats) = - composefs_oci::pull_image(&repo, image, Some(tag_name), None).await?; + + let opts = composefs_oci::PullOptions { + local_fetch: local_fetch.into(), + ..Default::default() + }; + + let result = composefs_oci::pull(&repo, image, Some(tag_name), opts).await?; println!("manifest {}", result.manifest_digest); println!("config {}", result.config_digest); println!("verity {}", result.manifest_verity.to_hex()); println!("tagged {tag_name}"); - println!( - "objects {} copied, {} already present, {} bytes copied, {} bytes inlined", - stats.objects_copied, - stats.objects_already_present, - stats.bytes_copied, - stats.bytes_inlined, - ); + println!("objects {}", result.stats); if bootable { let image_verity = diff --git a/crates/cfsctl/src/main.rs b/crates/cfsctl/src/main.rs index 4873e608..83424c12 100644 --- a/crates/cfsctl/src/main.rs +++ b/crates/cfsctl/src/main.rs @@ -9,8 +9,20 @@ use cfsctl::App; use anyhow::Result; use clap::Parser; -#[tokio::main] -async fn main() -> Result<()> { +fn main() -> Result<()> { + // If we were spawned as a userns helper process, handle that and exit. + // This MUST be called before the tokio runtime is created. + #[cfg(feature = "containers-storage")] + cstorage::init_if_helper(); + + // Now we can create the tokio runtime for the main application + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()? 
+ .block_on(async_main()) +} + +async fn async_main() -> Result<()> { env_logger::init(); let args = App::parse(); diff --git a/crates/composefs-oci/Cargo.toml b/crates/composefs-oci/Cargo.toml index f3b2a56c..2273e39b 100644 --- a/crates/composefs-oci/Cargo.toml +++ b/crates/composefs-oci/Cargo.toml @@ -11,17 +11,21 @@ rust-version.workspace = true version.workspace = true [features] +default = ["containers-storage"] test = ["tar", "rand", "composefs/test"] boot = ["composefs-boot"] +containers-storage = ["dep:cstorage", "dep:base64", "cstorage/userns-helper"] [dependencies] anyhow = { version = "1.0.87", default-features = false } fn-error-context = "0.2" async-compression = { version = "0.4.0", default-features = false, features = ["tokio", "zstd", "gzip"] } +base64 = { version = "0.22", default-features = false, features = ["std"], optional = true } bytes = { version = "1", default-features = false } composefs = { workspace = true } composefs-boot = { workspace = true, optional = true } containers-image-proxy = { version = "0.9.2", default-features = false } +cstorage = { path = "../cstorage", version = "0.3.0", optional = true } hex = { version = "0.4.0", default-features = false } indicatif = { version = "0.18.0", default-features = false, features = ["tokio"] } rustix = { version = "1.0.0", features = ["fs"] } @@ -34,6 +38,7 @@ tar = { version = "0.4.38", default-features = false, optional = true } tar-core = "0.1.0" tokio = { version = "1.24.2", features = ["macros", "rt-multi-thread"] } tokio-util = { version = "0.7", default-features = false, features = ["io"] } +tracing = { version = "0.1", default-features = false } [dev-dependencies] cap-std = { version = "4.0.0", default-features = false } diff --git a/crates/composefs-oci/src/cstor.rs b/crates/composefs-oci/src/cstor.rs new file mode 100644 index 00000000..00cdc768 --- /dev/null +++ b/crates/composefs-oci/src/cstor.rs @@ -0,0 +1,727 @@ +//! containers-storage integration for zero-copy layer import. 
+//! +//! This module provides functionality to import container images directly from +//! containers-storage (as used by podman/buildah) into composefs repositories. +//! It uses the cstorage crate to access the storage and leverages reflinks when +//! available to avoid copying file data, enabling efficient zero-copy extraction. +//! +//! This module requires the `containers-storage` feature to be enabled. +//! +//! The main entry point is [`import_from_containers_storage`], which takes an +//! image ID and imports all layers into the repository. +//! +//! # Overview +//! +//! When importing from containers-storage, we: +//! 1. Open the storage and locate the image +//! 2. For each layer, iterate through the tar-split metadata +//! 3. For large files (> INLINE_CONTENT_MAX_V0), reflink directly to objects/ +//! 4. For small files, embed inline in the splitstream +//! 5. Handle overlay whiteouts properly +//! +//! # Rootless Support +//! +//! When running as an unprivileged user, files in containers-storage may have +//! restrictive permissions (e.g., `/etc/shadow` with mode 0600 owned by remapped +//! UIDs). In this case, we spawn a helper process via `podman unshare` that can +//! read all files, and it streams the content back to us via a Unix socket with +//! file descriptor passing. +//! +//! # Example +//! +//! ```ignore +//! use composefs_oci::cstor::import_from_containers_storage; +//! +//! let repo = Arc::new(Repository::open_user()?); +//! let (result, stats) = import_from_containers_storage(&repo, "sha256:abc123...", None, false, None, &[]).await?; +//! println!("Imported config: {}", result.1.0); +//! println!("Stats: {:?}", stats); +//!
``` + +use std::os::unix::fs::FileExt; +use std::os::unix::io::OwnedFd; +use std::sync::Arc; + +use anyhow::{Context, Result}; +use base64::Engine; +use indicatif::{ProgressBar, ProgressStyle}; + +use composefs::{ + INLINE_CONTENT_MAX_V0, + fsverity::FsVerityHashValue, + repository::{ImportContext, ObjectStoreMethod, Repository}, +}; + +use cstorage::{ + Image, Layer, ProxiedTarSplitItem, Storage, StorageProxy, TarSplitFdStream, TarSplitItem, + can_bypass_file_permissions, +}; + +// Re-export init_if_helper for consumers that need userns helper support +pub use cstorage::init_if_helper; + +use crate::oci_image::manifest_identifier; +use crate::skopeo::{OCI_CONFIG_CONTENT_TYPE, OCI_MANIFEST_CONTENT_TYPE, TAR_LAYER_CONTENT_TYPE}; +use crate::{ContentAndVerity, ImportStats, OciDigest, config_identifier, layer_identifier}; + +/// Full result of a cstor import: manifest and config digests + verities. +type CstorImportResult = (ContentAndVerity, ContentAndVerity); + +/// Zero padding buffer for tar block alignment (512 bytes max needed). +const ZERO_PADDING: [u8; 512] = [0u8; 512]; + +/// Import a container image from containers-storage into the composefs repository. +/// +/// This function reads an image from the local containers-storage (podman/buildah) +/// and imports all layers using reflinks when possible, avoiding data duplication. +/// It creates full OCI structure (manifest + config + layers) matching the skopeo +/// import path. +/// +/// For rootless access, this function will automatically spawn a userns helper +/// process via `podman unshare` to read files with restrictive permissions. 
+/// +/// # Arguments +/// * `repo` - The composefs repository to import into +/// * `image_id` - The image ID (sha256 digest or name) to import +/// * `reference` - Optional reference name to assign to the imported image +/// * `zerocopy` - If true, error instead of falling back to copy (reflink or hardlink required) +/// * `storage_root` - Explicit storage root; skips auto-discovery when set +/// * `additional_image_stores` - Additional read-only image stores (appended after the primary store) +/// +/// # Returns +/// A tuple of ((manifest_digest, manifest_verity), (config_digest, config_verity)) +/// plus import stats. +pub async fn import_from_containers_storage( + repo: &Arc>, + image_id: &str, + reference: Option<&str>, + zerocopy: bool, + storage_root: Option<&std::path::Path>, + additional_image_stores: &[&std::path::Path], +) -> Result<(CstorImportResult, ImportStats)> { + // Check if we can access files directly or need a proxy + if can_bypass_file_permissions() { + // Direct access - use blocking implementation + let repo = Arc::clone(repo); + let image_id = image_id.to_owned(); + let reference = reference.map(|s| s.to_owned()); + let storage_root = storage_root.map(|p| p.to_path_buf()); + let additional_image_stores: Vec = additional_image_stores + .iter() + .map(|p| p.to_path_buf()) + .collect(); + + tokio::task::spawn_blocking(move || { + import_from_containers_storage_direct( + &repo, + &image_id, + reference.as_deref(), + zerocopy, + storage_root.as_deref(), + &additional_image_stores, + ) + }) + .await + .context("spawn_blocking failed")? + } else { + // The proxied (rootless) path uses a userns helper process that does + // its own storage discovery. Explicit storage paths are not yet + // plumbed through the proxy protocol. 
+ if storage_root.is_some() || !additional_image_stores.is_empty() { + anyhow::bail!( + "storage_root and additional_image_stores are not supported in rootless mode" + ); + } + import_from_containers_storage_proxied(repo, image_id, reference, zerocopy).await + } +} + +/// Direct (privileged) implementation of containers-storage import. +/// +/// All file I/O operations in this function are blocking, so it must be called +/// from a blocking context (e.g., via `spawn_blocking`). +fn import_from_containers_storage_direct( + repo: &Arc>, + image_id: &str, + reference: Option<&str>, + zerocopy: bool, + storage_root: Option<&std::path::Path>, + additional_image_stores: &[std::path::PathBuf], +) -> Result<(CstorImportResult, ImportStats)> { + let mut stats = ImportStats::default(); + let mut ctx = ImportContext::default(); + + // Build the list of stores to search. When an explicit root is given, + // skip auto-discovery entirely; otherwise discover from the standard + // locations and $STORAGE_OPTS. + let mut stores = if let Some(root) = storage_root { + vec![ + Storage::open(root) + .with_context(|| format!("Failed to open storage root at {}", root.display()))?, + ] + } else { + // Auto-discover; tolerate failure only if extra stores are provided. 
+ match Storage::discover_all() { + Ok(s) => s, + Err(e) if !additional_image_stores.is_empty() => { + tracing::warn!( + "containers-storage auto-discovery failed ({e:#}), \ + using additional image stores only" + ); + Vec::new() + } + Err(e) => return Err(e).context("Failed to discover containers-storage"), + } + }; + for path in additional_image_stores { + stores.push(Storage::open(path).with_context(|| { + format!( + "Failed to open additional image store at {}", + path.display() + ) + })?); + } + + // Search all stores for the image (primary first, then additional image stores) + let (_image_store, image) = stores + .iter() + .find_map(|s| { + Image::open(s, image_id) + .or_else(|_| s.find_image_by_name(image_id)) + .ok() + .map(|img| (s, img)) + }) + .with_context(|| format!("Failed to find image {image_id} in any storage"))?; + + // Get the storage layer IDs — layers may span stores (e.g. base layers + // in an additional image store, new layers in the primary) + let storage_layer_ids = image + .storage_layer_ids(&stores) + .context("Failed to get storage layer IDs from image")?; + + // Get the config to access diff_ids + let config = image.config().context("Failed to read image config")?; + let diff_ids: Vec = config + .rootfs() + .diff_ids() + .iter() + .map(|s| s.parse::().context("parsing diff_id")) + .collect::>()?; + + // Ensure layer count matches + anyhow::ensure!( + storage_layer_ids.len() == diff_ids.len(), + "Layer count mismatch: {} layers in storage, {} diff_ids in config", + storage_layer_ids.len(), + diff_ids.len() + ); + + stats.layers = storage_layer_ids.len() as u64; + + // Import each layer with progress bar + let progress = ProgressBar::new(storage_layer_ids.len() as u64); + progress.set_style( + ProgressStyle::default_bar() + .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} {msg}") + .expect("valid template") + .progress_chars("=>-"), + ); + + let mut layer_refs = Vec::with_capacity(storage_layer_ids.len()); + for 
(storage_layer_id, diff_id) in storage_layer_ids.iter().zip(diff_ids.iter()) { + let content_id = layer_identifier(diff_id); + let diff_id_str: &str = diff_id.as_ref(); + let short_id = diff_id_str.get(..19).unwrap_or(diff_id_str); + + let layer_verity = if let Some(existing) = repo.has_stream(&content_id)? { + progress.set_message(format!("Already have {short_id}...")); + stats.layers_already_present += 1; + existing + } else { + progress.set_message(format!("Importing {short_id}...")); + let (layer_store, layer) = stores + .iter() + .find_map(|s| Layer::open(s, storage_layer_id).ok().map(|l| (s, l))) + .with_context(|| format!("Failed to open layer {}", storage_layer_id))?; + let (verity, layer_stats) = + import_layer_direct(repo, layer_store, &layer, diff_id, zerocopy, &mut ctx)?; + stats.merge(&layer_stats); + verity + }; + + layer_refs.push((diff_id.clone(), layer_verity)); + progress.inc(1); + } + progress.finish_with_message("Layers imported"); + + finalize_import(repo, &image, &layer_refs, reference, &progress, stats) +} + +/// Proxied (rootless) implementation of containers-storage import. +/// +/// This spawns a helper process via `podman unshare` that can read all files +/// in containers-storage, and communicates with it via Unix socket + fd passing. +async fn import_from_containers_storage_proxied( + repo: &Arc>, + image_id: &str, + reference: Option<&str>, + zerocopy: bool, +) -> Result<(CstorImportResult, ImportStats)> { + let mut stats = ImportStats::default(); + let mut ctx = ImportContext::default(); + + // Spawn the proxy helper + let mut proxy = StorageProxy::spawn() + .await + .context("Failed to spawn userns helper")? 
+ .context("Expected proxy but got None")?; + + // Discover storage paths (primary + additional from $STORAGE_OPTS) + let storage_paths = discover_storage_paths()?; + + // Search all storage paths for the image + let mut image_info = None; + let mut found_storage_path = String::new(); + for path in &storage_paths { + match proxy.get_image(path, image_id).await { + Ok(info) => { + found_storage_path = path.clone(); + image_info = Some(info); + break; + } + Err(_) => continue, + } + } + let image_info = + image_info.with_context(|| format!("Failed to find image {} in any storage", image_id))?; + let storage_path = found_storage_path; + + // Ensure layer count matches + anyhow::ensure!( + image_info.storage_layer_ids.len() == image_info.layer_diff_ids.len(), + "Layer count mismatch: {} layers in storage, {} diff_ids in config", + image_info.storage_layer_ids.len(), + image_info.layer_diff_ids.len() + ); + + stats.layers = image_info.storage_layer_ids.len() as u64; + + // Import each layer with progress bar + let progress = ProgressBar::new(image_info.storage_layer_ids.len() as u64); + progress.set_style( + ProgressStyle::default_bar() + .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} {msg}") + .expect("valid template") + .progress_chars("=>-"), + ); + + let mut layer_refs = Vec::with_capacity(image_info.storage_layer_ids.len()); + + for (storage_layer_id, diff_id) in image_info + .storage_layer_ids + .iter() + .zip(image_info.layer_diff_ids.iter()) + { + let content_id = layer_identifier(diff_id); + let diff_id_str: &str = diff_id.as_ref(); + let short_id = diff_id_str.get(..19).unwrap_or(diff_id_str); + + let layer_verity = if let Some(existing) = repo.has_stream(&content_id)? 
{ + progress.set_message(format!("Already have {short_id}...")); + stats.layers_already_present += 1; + existing + } else { + progress.set_message(format!("Importing {short_id}...")); + let (verity, layer_stats) = import_layer_proxied( + repo, + &mut proxy, + &storage_path, + storage_layer_id, + diff_id, + zerocopy, + &mut ctx, + ) + .await?; + stats.merge(&layer_stats); + verity + }; + + layer_refs.push((diff_id.clone(), layer_verity)); + progress.inc(1); + } + progress.finish_with_message("Layers imported"); + + // Config and manifest metadata don't have restrictive file permissions, + // so we can read them directly without the proxy. + let stores = Storage::discover_all().context("Failed to discover containers-storage")?; + let (_, image) = stores + .iter() + .find_map(|s| Image::open(s, &image_info.id).ok().map(|img| (s, img))) + .with_context(|| format!("Failed to open image {}", image_info.id))?; + + // Shutdown the proxy before the blocking finalization + proxy.shutdown().await.context("Failed to shutdown proxy")?; + + finalize_import(repo, &image, &layer_refs, reference, &progress, stats) +} + +/// Create config + manifest splitstreams, generate the EROFS image, and tag. +/// +/// This is the shared finalization step for both direct and proxied import +/// paths. By this point all layers are already imported; this function: +/// 1. Creates the config splitstream with layer references +/// 2. Creates the manifest splitstream +/// 3. Generates the composefs EROFS image and links it to the config +/// 4. 
Tags the manifest if a reference was provided
+fn finalize_import(
+    repo: &Arc>,
+    image: &Image,
+    layer_refs: &[(OciDigest, ObjectID)],
+    reference: Option<&str>,
+    progress: &ProgressBar,
+    stats: ImportStats,
+) -> Result<(CstorImportResult, ImportStats)> {
+    // Read the raw config JSON bytes from metadata
+    // The metadata key is the string "sha256:<image id>", base64-encoded
+    // with the standard alphabet before the lookup.
+    let config_key = format!("sha256:{}", image.id());
+    let encoded_key = base64::engine::general_purpose::STANDARD.encode(config_key.as_bytes());
+    let config_json = image
+        .read_metadata(&encoded_key)
+        .context("Failed to read config bytes")?;
+    let config_digest = crate::sha256_content_digest(&config_json);
+    let content_id = config_identifier(&config_digest);
+
+    // Reuse the config splitstream when the repository already has one under
+    // this content id; otherwise build it from the raw JSON.
+    let config_verity = if let Some(existing) = repo.has_stream(&content_id)? {
+        progress.println(format!("Already have config {config_digest}"));
+        existing
+    } else {
+        progress.println(format!("Creating config splitstream {config_digest}"));
+        let mut writer = repo.create_stream(OCI_CONFIG_CONTENT_TYPE)?;
+
+        // Each imported layer is recorded as a named stream ref keyed by its diff_id.
+        for (diff_id, verity) in layer_refs {
+            let key: &str = diff_id.as_ref();
+            writer.add_named_stream_ref(key, verity);
+        }
+
+        writer.write_external(&config_json)?;
+        repo.write_stream(writer, &content_id, None)?
+    };
+
+    // Create the manifest splitstream (matching the skopeo path)
+    // The raw bytes are hashed as read, so the digest reflects the stored manifest.
+    let manifest_json = image
+        .read_manifest_raw()
+        .context("Failed to read manifest bytes")?;
+    let manifest_digest = crate::sha256_content_digest(&manifest_json);
+
+    let manifest_content_id = manifest_identifier(&manifest_digest);
+    let manifest_verity = if let Some(existing) = repo.has_stream(&manifest_content_id)? {
+        progress.println(format!("Already have manifest {manifest_digest}"));
+        existing
+    } else {
+        progress.println(format!("Creating manifest splitstream {manifest_digest}"));
+        let mut writer = repo.create_stream(OCI_MANIFEST_CONTENT_TYPE)?;
+
+        // The manifest stream references the config under "config:<digest>"
+        // and then every layer under its diff_id.
+        let config_ref_key = format!("config:{config_digest}");
+        writer.add_named_stream_ref(&config_ref_key, &config_verity);
+
+        for (diff_id, verity) in layer_refs {
+            let key: &str = diff_id.as_ref();
+            writer.add_named_stream_ref(key, verity);
+        }
+
+        writer.write_external(&manifest_json)?;
+        repo.write_stream(writer, &manifest_content_id, None)?
+    };
+
+    // Generate the composefs EROFS image and tag the manifest.
+    // Skip if the image already has an EROFS ref (idempotent re-import).
+    let existing_erofs =
+        crate::composefs_erofs_for_manifest(repo, &manifest_digest, Some(&manifest_verity))?;
+    if existing_erofs.is_none() {
+        // `reference` is handed to ensure_oci_composefs_erofs here; presumably
+        // it applies the tag itself when it produces an image — TODO confirm.
+        let erofs = crate::ensure_oci_composefs_erofs(
+            repo,
+            &manifest_digest,
+            Some(&manifest_verity),
+            reference,
+        )?;
+        if erofs.is_none() {
+            // Not a container image (unlikely for cstor, but handle consistently)
+            if let Some(name) = reference {
+                crate::oci_image::tag_image(repo, &manifest_digest, name)?;
+            }
+        }
+    } else if let Some(name) = reference {
+        // EROFS ref already exists: only the tag (if requested) is applied.
+        crate::oci_image::tag_image(repo, &manifest_digest, name)?;
+    }
+
+    // Re-read verities: ensure_oci_composefs_erofs rewrites config and
+    // manifest splitstreams (adding the EROFS ref), so the verities captured
+    // above may be stale.
+    let config_verity = repo
+        .has_stream(&content_id)?
+        .context("config splitstream missing after finalization")?;
+    let manifest_verity = repo
+        .has_stream(&manifest_content_id)?
+        .context("manifest splitstream missing after finalization")?;
+
+    // Result shape: ((manifest digest, verity), (config digest, verity)),
+    // paired with the `stats` value passed in, returned unmodified.
+    Ok((
+        (
+            (manifest_digest, manifest_verity),
+            (config_digest, config_verity),
+        ),
+        stats,
+    ))
+}
+
+/// Import a single layer directly (privileged mode).
+fn import_layer_direct(
+    repo: &Arc>,
+    storage: &Storage,
+    layer: &Layer,
+    diff_id: &OciDigest,
+    zerocopy: bool,
+    ctx: &mut ImportContext,
+) -> Result<(ObjectID, ImportStats)> {
+    // Replays the layer's tar-split stream into a new splitstream: header
+    // segments are written inline and file payloads are handed to
+    // `process_file_content`. Returns the verity of the written stream
+    // together with the statistics gathered for this layer.
+    let mut stats = ImportStats::default();
+    // Scratch buffer passed to `process_file_content` for every file entry.
+    let mut inline_buf = Vec::new();
+
+    let mut stream = TarSplitFdStream::new(storage, layer)
+        .with_context(|| format!("Failed to create tar-split stream for layer {}", layer.id()))?;
+
+    let mut writer = repo.create_stream(TAR_LAYER_CONTENT_TYPE)?;
+    let content_id = layer_identifier(diff_id);
+
+    // Track padding from previous file - tar-split bundles padding with the NEXT
+    // file's header in Segment entries, but we need to write padding immediately
+    // after file content (like tar.rs does) for consistent splitstream output.
+    let mut prev_file_padding: usize = 0;
+
+    while let Some(item) = stream.next()? {
+        match item {
+            TarSplitItem::Segment(bytes) => {
+                // Skip the leading padding bytes (we already wrote them after prev file).
+                // A segment shorter than that padding means the tar-split data is
+                // inconsistent; report it as an error instead of panicking on the slice.
+                let header_bytes = bytes.get(prev_file_padding..).with_context(|| {
+                    format!(
+                        "tar-split segment of layer {} has {} bytes but {} padding bytes were expected",
+                        layer.id(),
+                        bytes.len(),
+                        prev_file_padding
+                    )
+                })?;
+                stats.bytes_inlined += header_bytes.len() as u64;
+                writer.write_inline(header_bytes);
+                prev_file_padding = 0;
+            }
+            TarSplitItem::FileContent { fd, size, name } => {
+                process_file_content(
+                    repo,
+                    &mut writer,
+                    &mut stats,
+                    ctx,
+                    fd,
+                    size,
+                    &name,
+                    zerocopy,
+                    &mut inline_buf,
+                )?;
+
+                // Write padding inline immediately after file content
+                // (tar pads every entry to a multiple of 512 bytes).
+                let padding_size = (size as usize).next_multiple_of(512) - size as usize;
+                if padding_size > 0 {
+                    stats.bytes_inlined += padding_size as u64;
+                    writer.write_inline(&ZERO_PADDING[..padding_size]);
+                }
+                // Remember how much of the next segment is padding we have
+                // already emitted, so it is not written twice.
+                prev_file_padding = padding_size;
+            }
+        }
+    }
+
+    // Write the stream with the content identifier
+    let verity = repo.write_stream(writer, &content_id, None)?;
+    Ok((verity, stats))
+}
+
+/// Import a single layer via the proxy (rootless mode).
+async fn import_layer_proxied(
+    repo: &Arc>,
+    proxy: &mut StorageProxy,
+    storage_path: &str,
+    layer_id: &str,
+    diff_id: &OciDigest,
+    zerocopy: bool,
+    ctx: &mut ImportContext,
+) -> Result<(ObjectID, ImportStats)> {
+    // Same replay as the direct path, but the tar-split items are received
+    // from the storage proxy: header segments are written inline and file
+    // payloads are handed to `process_file_content`. Returns the verity of
+    // the written stream together with the statistics for this layer.
+    let mut stats = ImportStats::default();
+    // Scratch buffer passed to `process_file_content` for every file entry.
+    let mut inline_buf = Vec::new();
+
+    let mut writer = repo.create_stream(TAR_LAYER_CONTENT_TYPE)?;
+    let content_id = layer_identifier(diff_id);
+
+    // Track padding from previous file - tar-split bundles padding with the NEXT
+    // file's header in Segment entries, but we need to write padding immediately
+    // after file content (like tar.rs does) for consistent splitstream output.
+    let mut prev_file_padding: usize = 0;
+
+    // Stream the layer via the proxy
+    let mut stream = proxy
+        .stream_layer(storage_path, layer_id)
+        .await
+        .with_context(|| format!("Failed to start streaming layer {}", layer_id))?;
+
+    while let Some(item) = stream
+        .next()
+        .await
+        .with_context(|| format!("Failed to receive stream item for layer {}", layer_id))?
+    {
+        match item {
+            ProxiedTarSplitItem::Segment(bytes) => {
+                // Skip the leading padding bytes (we already wrote them after prev file).
+                // The segment comes from another process, so a segment shorter than
+                // that padding is reported as an error instead of panicking on the slice.
+                let header_bytes = bytes.get(prev_file_padding..).with_context(|| {
+                    format!(
+                        "tar-split segment of layer {} has {} bytes but {} padding bytes were expected",
+                        layer_id,
+                        bytes.len(),
+                        prev_file_padding
+                    )
+                })?;
+                stats.bytes_inlined += header_bytes.len() as u64;
+                writer.write_inline(header_bytes);
+                prev_file_padding = 0;
+            }
+            ProxiedTarSplitItem::FileContent { fd, size, name } => {
+                process_file_content(
+                    repo,
+                    &mut writer,
+                    &mut stats,
+                    ctx,
+                    fd,
+                    size,
+                    &name,
+                    zerocopy,
+                    &mut inline_buf,
+                )?;
+
+                // Write padding inline immediately after file content
+                // (tar pads every entry to a multiple of 512 bytes).
+                let padding_size = (size as usize).next_multiple_of(512) - size as usize;
+                if padding_size > 0 {
+                    stats.bytes_inlined += padding_size as u64;
+                    writer.write_inline(&ZERO_PADDING[..padding_size]);
+                }
+                // Remember how much of the next segment is padding we have
+                // already emitted, so it is not written twice.
+                prev_file_padding = padding_size;
+            }
+        }
+    }
+
+    // Write the stream with the content identifier
+    let verity = repo.write_stream(writer, &content_id, None)?;
+    Ok((verity, stats))
+}
+
+/// Process file content (shared between direct and proxied modes).
+#[allow(clippy::too_many_arguments)]
+fn process_file_content(
+    repo: &Arc>,
+    writer: &mut composefs::splitstream::SplitStreamWriter,
+    stats: &mut ImportStats,
+    ctx: &mut ImportContext,
+    fd: OwnedFd,
+    size: u64,
+    name: &str,
+    zerocopy: bool,
+    inline_buf: &mut Vec,
+) -> Result<()> {
+    // Stores one file's payload in the splitstream: content larger than
+    // INLINE_CONTENT_MAX_V0 becomes an external object referenced from the
+    // stream, anything else is read into `inline_buf` and written inline.
+    // `stats` is updated according to how the data ended up being stored;
+    // `name` is only used in error messages.
+
+    // Convert fd to File for operations
+    let file = std::fs::File::from(fd);
+
+    // Compare in u64: casting `size` down to usize would truncate on 32-bit
+    // targets and could send a very large file down the inline path.
+    if size > INLINE_CONTENT_MAX_V0 as u64 {
+        // Large file: store as external object
+        let (object_id, method) = if zerocopy {
+            repo.ensure_object_from_file_zerocopy(&file, size, ctx)
+        } else {
+            repo.ensure_object_from_file(&file, size, ctx)
+        }
+        .with_context(|| format!("Failed to store object for {}", name))?;
+
+        // Account for the object under the method the repository reports.
+        match method {
+            ObjectStoreMethod::Reflinked => {
+                stats.objects_reflinked += 1;
+                stats.bytes_reflinked += size;
+            }
+            ObjectStoreMethod::Hardlinked => {
+                stats.objects_hardlinked += 1;
+                stats.bytes_hardlinked += size;
+            }
+            ObjectStoreMethod::Copied => {
+                stats.objects_copied += 1;
+                stats.bytes_copied += size;
+            }
+            ObjectStoreMethod::AlreadyPresent => {
+                // Only the object count is tracked for already-present objects.
+                stats.objects_already_present += 1;
+            }
+        }
+
+        writer.add_external_size(size);
+        writer.write_reference(object_id)?;
+    } else {
+        // Small file: read and embed inline (reuse buffer across calls).
+        // `size` is at most INLINE_CONTENT_MAX_V0 here, so the cast is lossless.
+        inline_buf.resize(size as usize, 0);
+        file.read_exact_at(inline_buf, 0)
+            .with_context(|| format!("Failed to read inline content for {}", name))?;
+        stats.bytes_inlined += size;
+        writer.write_inline(inline_buf);
+    }
+
+    Ok(())
+}
+
+/// Discover storage paths: the primary store plus any additional image stores
+/// from `$STORAGE_OPTS`.
+fn discover_storage_paths() -> Result> {
+    // Existence probe shared by every candidate location below.
+    let is_present = |candidate: &str| std::path::Path::new(candidate).exists();
+
+    // Per-user store (rootless podman); considered only when $HOME is readable.
+    let user_store = std::env::var("HOME")
+        .ok()
+        .map(|home_dir| format!("{}/.local/share/containers/storage", home_dir))
+        .filter(|candidate| is_present(candidate.as_str()));
+
+    // System-wide store; listed after the per-user one whenever it exists.
+    let system_store = Some("/var/lib/containers/storage")
+        .filter(|&candidate| is_present(candidate))
+        .map(str::to_string);
+
+    let mut paths = Vec::new();
+    paths.extend(user_store);
+    paths.extend(system_store);
+
+    // Extra stores named by `additionalimagestore=` entries of the
+    // comma-separated $STORAGE_OPTS value, kept only if they exist.
+    if let Ok(storage_opts) = std::env::var("STORAGE_OPTS") {
+        paths.extend(
+            storage_opts
+                .split(',')
+                .filter_map(|entry| entry.trim().strip_prefix("additionalimagestore="))
+                .filter(|&store| is_present(store))
+                .map(str::to_string),
+        );
+    }
+
+    // Finding no store at all is an error rather than an empty result.
+    anyhow::ensure!(
+        !paths.is_empty(),
+        "Could not find containers-storage at standard locations"
+    );
+    Ok(paths)
+}
+
+/// Check if an image reference uses the containers-storage transport.
+///
+/// Returns the image ID portion if the reference starts with "containers-storage:",
+/// otherwise returns None.
+pub fn parse_containers_storage_ref(imgref: &str) -> Option<&str> { + imgref.strip_prefix("containers-storage:") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_containers_storage_ref() { + assert_eq!( + parse_containers_storage_ref("containers-storage:sha256:abc123"), + Some("sha256:abc123") + ); + assert_eq!( + parse_containers_storage_ref("containers-storage:quay.io/fedora:latest"), + Some("quay.io/fedora:latest") + ); + assert_eq!( + parse_containers_storage_ref("docker://quay.io/fedora:latest"), + None + ); + assert_eq!(parse_containers_storage_ref("sha256:abc123"), None); + } +} diff --git a/crates/composefs-oci/src/image.rs b/crates/composefs-oci/src/image.rs index 615c849c..14a8ae2f 100644 --- a/crates/composefs-oci/src/image.rs +++ b/crates/composefs-oci/src/image.rs @@ -557,4 +557,267 @@ mod test { Ok(()) } + + // --- Whiteout-specific tests --- + + #[test] + fn test_whiteout_file_removes_entry() -> Result<()> { + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/etc"))?; + process_entry(&mut fs, file_entry("/etc/hosts"))?; + process_entry(&mut fs, file_entry("/etc/passwd"))?; + assert_files(&fs, &["/", "/etc", "/etc/hosts", "/etc/passwd"])?; + + // Whiteout hosts — only hosts should be removed + process_entry(&mut fs, file_entry("/etc/.wh.hosts"))?; + assert_files(&fs, &["/", "/etc", "/etc/passwd"])?; + + Ok(()) + } + + #[test] + fn test_whiteout_nonexistent_file_is_noop() -> Result<()> { + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/etc"))?; + process_entry(&mut fs, file_entry("/etc/hosts"))?; + assert_files(&fs, &["/", "/etc", "/etc/hosts"])?; + + // Whiteout a file that doesn't exist — should be a no-op + process_entry(&mut fs, file_entry("/etc/.wh.nosuchfile"))?; + assert_files(&fs, &["/", "/etc", "/etc/hosts"])?; + + Ok(()) + } + + #[test] + fn test_whiteout_directory() -> Result<()> { + let mut fs = 
FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/usr"))?; + process_entry(&mut fs, dir_entry("/usr/local"))?; + process_entry(&mut fs, file_entry("/usr/local/bin"))?; + process_entry(&mut fs, dir_entry("/etc"))?; + assert_files(&fs, &["/", "/etc", "/usr", "/usr/local", "/usr/local/bin"])?; + + // Whiteout the directory /usr/local (removes the entire subtree) + process_entry(&mut fs, file_entry("/usr/.wh.local"))?; + assert_files(&fs, &["/", "/etc", "/usr"])?; + + Ok(()) + } + + #[test] + fn test_whiteout_in_root_directory() -> Result<()> { + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/mydir"))?; + process_entry(&mut fs, file_entry("/toplevel"))?; + assert_files(&fs, &["/", "/mydir", "/toplevel"])?; + + // Whiteout in root (no leading dir component) + process_entry(&mut fs, file_entry("/.wh.toplevel"))?; + assert_files(&fs, &["/", "/mydir"])?; + + // Also works without leading slash + process_entry(&mut fs, file_entry(".wh.mydir"))?; + assert_files(&fs, &["/"])?; + + Ok(()) + } + + #[test] + fn test_whiteout_in_nested_directory() -> Result<()> { + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/a"))?; + process_entry(&mut fs, dir_entry("/a/b"))?; + process_entry(&mut fs, dir_entry("/a/b/c"))?; + process_entry(&mut fs, file_entry("/a/b/c/deep"))?; + assert_files(&fs, &["/", "/a", "/a/b", "/a/b/c", "/a/b/c/deep"])?; + + process_entry(&mut fs, file_entry("/a/b/c/.wh.deep"))?; + assert_files(&fs, &["/", "/a", "/a/b", "/a/b/c"])?; + + Ok(()) + } + + #[test] + fn test_opaque_whiteout_clears_directory() -> Result<()> { + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/etc"))?; + process_entry(&mut fs, file_entry("/etc/hosts"))?; + process_entry(&mut fs, file_entry("/etc/passwd"))?; + process_entry(&mut fs, file_entry("/etc/resolv.conf"))?; + assert_files( + &fs, + &["/", "/etc", "/etc/hosts", 
"/etc/passwd", "/etc/resolv.conf"], + )?; + + // Opaque whiteout — clears all entries in /etc + process_entry(&mut fs, file_entry("/etc/.wh..wh..opq"))?; + assert_files(&fs, &["/", "/etc"])?; + + Ok(()) + } + + #[test] + fn test_opaque_whiteout_then_add_new_entries() -> Result<()> { + // This is a very common pattern in container images: the layer + // marks a dir opaque (hiding all lower-layer contents), then + // adds new entries in the same directory. + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/etc"))?; + process_entry(&mut fs, file_entry("/etc/old_config"))?; + process_entry(&mut fs, file_entry("/etc/another_old"))?; + assert_files(&fs, &["/", "/etc", "/etc/another_old", "/etc/old_config"])?; + + // Opaque whiteout clears everything + process_entry(&mut fs, file_entry("/etc/.wh..wh..opq"))?; + assert_files(&fs, &["/", "/etc"])?; + + // Then re-add new entries + process_entry(&mut fs, file_entry("/etc/new_config"))?; + process_entry(&mut fs, file_entry("/etc/new_other"))?; + assert_files(&fs, &["/", "/etc", "/etc/new_config", "/etc/new_other"])?; + + Ok(()) + } + + #[test] + fn test_multiple_whiteouts_in_single_layer() -> Result<()> { + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/usr"))?; + process_entry(&mut fs, file_entry("/usr/a"))?; + process_entry(&mut fs, file_entry("/usr/b"))?; + process_entry(&mut fs, file_entry("/usr/c"))?; + process_entry(&mut fs, file_entry("/usr/d"))?; + assert_files(&fs, &["/", "/usr", "/usr/a", "/usr/b", "/usr/c", "/usr/d"])?; + + // Multiple whiteouts in the same directory + process_entry(&mut fs, file_entry("/usr/.wh.a"))?; + process_entry(&mut fs, file_entry("/usr/.wh.c"))?; + assert_files(&fs, &["/", "/usr", "/usr/b", "/usr/d"])?; + + Ok(()) + } + + #[test] + fn test_double_whiteout_is_idempotent() -> Result<()> { + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/d"))?; + 
process_entry(&mut fs, file_entry("/d/target"))?; + assert_files(&fs, &["/", "/d", "/d/target"])?; + + // Whiteout the same file twice — the second is a no-op + process_entry(&mut fs, file_entry("/d/.wh.target"))?; + assert_files(&fs, &["/", "/d"])?; + + process_entry(&mut fs, file_entry("/d/.wh.target"))?; + assert_files(&fs, &["/", "/d"])?; + + Ok(()) + } + + #[test] + fn test_whiteout_unusual_name_dot_wh_dot() -> Result<()> { + // ".wh..wh." (without trailing "opq") is a whiteout for a file + // literally named ".wh." — it is NOT an opaque whiteout. + // The code checks `whiteout == b".wh..opq"` for the complete + // filename ".wh..wh..opq", so ".wh..wh." won't match. + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/d"))?; + process_entry(&mut fs, file_entry("/d/real_file"))?; + assert_files(&fs, &["/", "/d", "/d/real_file"])?; + + // ".wh..wh." is interpreted as a whiteout for the file named ".wh." + // (strip ".wh." prefix → ".wh." remainder). Since no file named ".wh." + // exists, it's a no-op. Crucially, it is NOT treated as an opaque + // whiteout — those require the exact name ".wh..wh..opq". + process_entry(&mut fs, file_entry("/d/.wh..wh."))?; + assert_files(&fs, &["/", "/d", "/d/real_file"])?; + + // Note: a tar entry named ".wh." is consumed as a whiteout for "" (empty + // name), which is effectively a no-op — the file is never stored. 
+ process_entry(&mut fs, file_entry("/d/.wh."))?; + assert_files(&fs, &["/", "/d", "/d/real_file"])?; + + Ok(()) + } + + #[test] + fn test_whiteout_across_multiple_directories() -> Result<()> { + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/a"))?; + process_entry(&mut fs, dir_entry("/b"))?; + process_entry(&mut fs, file_entry("/a/file1"))?; + process_entry(&mut fs, file_entry("/a/file2"))?; + process_entry(&mut fs, file_entry("/b/file1"))?; + process_entry(&mut fs, file_entry("/b/file2"))?; + assert_files( + &fs, + &[ + "/", "/a", "/a/file1", "/a/file2", "/b", "/b/file1", "/b/file2", + ], + )?; + + // Whiteout file1 in /a and file2 in /b independently + process_entry(&mut fs, file_entry("/a/.wh.file1"))?; + process_entry(&mut fs, file_entry("/b/.wh.file2"))?; + assert_files(&fs, &["/", "/a", "/a/file2", "/b", "/b/file1"])?; + + Ok(()) + } + + #[test] + fn test_opaque_whiteout_with_subdirectories() -> Result<()> { + // Opaque whiteout should clear subdirectories too + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/parent"))?; + process_entry(&mut fs, dir_entry("/parent/child"))?; + process_entry(&mut fs, file_entry("/parent/child/deep"))?; + process_entry(&mut fs, file_entry("/parent/sibling"))?; + assert_files( + &fs, + &[ + "/", + "/parent", + "/parent/child", + "/parent/child/deep", + "/parent/sibling", + ], + )?; + + process_entry(&mut fs, file_entry("/parent/.wh..wh..opq"))?; + assert_files(&fs, &["/", "/parent"])?; + + Ok(()) + } + + #[test] + fn test_whiteout_then_recreate() -> Result<()> { + // Delete a file with whiteout, then re-add it in the same layer + let mut fs = FileSystem::::new(Stat::uninitialized()); + + process_entry(&mut fs, dir_entry("/etc"))?; + process_entry(&mut fs, file_entry("/etc/config"))?; + assert_files(&fs, &["/", "/etc", "/etc/config"])?; + + // Whiteout and then re-add + process_entry(&mut fs, file_entry("/etc/.wh.config"))?; + 
assert_files(&fs, &["/", "/etc"])?; + + process_entry(&mut fs, file_entry("/etc/config"))?; + assert_files(&fs, &["/", "/etc", "/etc/config"])?; + + Ok(()) + } } diff --git a/crates/composefs-oci/src/lib.rs b/crates/composefs-oci/src/lib.rs index 9e5a9568..8d59b35c 100644 --- a/crates/composefs-oci/src/lib.rs +++ b/crates/composefs-oci/src/lib.rs @@ -13,6 +13,8 @@ #![forbid(unsafe_code)] pub mod boot; +#[cfg(feature = "containers-storage")] +pub mod cstor; pub mod image; pub mod oci_image; pub mod skopeo; @@ -201,37 +203,56 @@ impl std::fmt::Display for ImportStats { } } +/// Controls whether and how the `containers-storage:` native import path +/// is used when pulling images. +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] +pub enum LocalFetchOpt { + /// Do not use the native containers-storage import path; fall through + /// to skopeo. + #[default] + Disabled, + /// Use native containers-storage import with reflink → hardlink → copy + /// fallback chain. + IfPossible, + /// Use native containers-storage import but error if zero-copy + /// (reflink or hardlink) is not possible. + ZeroCopy, +} + /// Options for a [`pull`] operation. /// /// Use `Default::default()` for the common case (skopeo transport, no -/// zero-copy requirement). +/// containers-storage import). #[derive(Debug, Default)] pub struct PullOptions<'a> { /// Image proxy configuration passed to skopeo (ignored for - /// `containers-storage:` references). + /// `containers-storage:` references when `local_fetch` is not + /// [`Disabled`](LocalFetchOpt::Disabled)). pub img_proxy_config: Option, - /// If `true`, the containers-storage import path will error instead of - /// falling back to a data copy when neither reflink nor hardlink succeeds. - /// Intended for bootc's unified storage layout where the composefs repo - /// and containers-storage are always on the same filesystem. - pub zerocopy: bool, + /// Controls whether the native containers-storage import path is used. 
+ /// See [`LocalFetchOpt`] for details. + pub local_fetch: LocalFetchOpt, /// Explicit containers-storage root. When set, auto-discovery is skipped /// and only this path (plus any `additional_image_stores`) is searched. - /// Only relevant for `containers-storage:` references. + /// Only relevant when `local_fetch` is not [`Disabled`](LocalFetchOpt::Disabled). pub storage_root: Option<&'a std::path::Path>, /// Additional read-only image stores to search beyond the primary /// (auto-discovered or explicit) store. Equivalent to the /// `additionalimagestore=` option in containers/storage. - /// Only relevant for `containers-storage:` references. + /// Only relevant when `local_fetch` is not [`Disabled`](LocalFetchOpt::Disabled). pub additional_image_stores: &'a [&'a std::path::Path], } /// Result of a pull operation. #[derive(Debug)] pub struct PullResult { + /// The manifest digest (sha256:...). + pub manifest_digest: OciDigest, + /// The fs-verity hash of the manifest splitstream. + pub manifest_verity: ObjectID, /// The config digest (sha256:...). pub config_digest: OciDigest, /// The fs-verity hash of the config splitstream. @@ -329,11 +350,12 @@ pub fn ls_layer( /// Pull the target image, and add the provided tag. If this is a mountable /// image (i.e. not an artifact), it is *not* unpacked by default. /// -/// When the `containers-storage` feature is enabled and the image reference -/// starts with `containers-storage:`, this uses the native cstor import path -/// which supports zero-copy reflinks/hardlinks. Otherwise, it uses skopeo. +/// When the `containers-storage` feature is enabled, the image reference +/// starts with `containers-storage:`, **and** [`PullOptions::local_fetch`] +/// is not [`LocalFetchOpt::Disabled`], this uses the native cstor import path +/// which supports zero-copy reflinks/hardlinks. Otherwise, it uses skopeo. 
/// -/// See [`PullOptions`] for tunable knobs (zero-copy mode, extra storage +/// See [`PullOptions`] for tunable knobs (local-copy mode, extra storage /// roots, image proxy configuration). pub async fn pull( repo: &Arc>, @@ -341,11 +363,37 @@ pub async fn pull( reference: Option<&str>, opts: PullOptions<'_>, ) -> Result> { - let (config_digest, config_verity, stats) = - skopeo::pull(repo, imgref, reference, opts.img_proxy_config).await?; + #[cfg(feature = "containers-storage")] + if opts.local_fetch != LocalFetchOpt::Disabled + && let Some(image_id) = cstor::parse_containers_storage_ref(imgref) + { + let zerocopy = opts.local_fetch == LocalFetchOpt::ZeroCopy; + let (((manifest_digest, manifest_verity), (config_digest, config_verity)), stats) = + cstor::import_from_containers_storage( + repo, + image_id, + reference, + zerocopy, + opts.storage_root, + opts.additional_image_stores, + ) + .await?; + return Ok(PullResult { + manifest_digest, + manifest_verity, + config_digest, + config_verity, + stats, + }); + } + + let (result, stats) = + skopeo::pull_image(repo, imgref, reference, opts.img_proxy_config).await?; Ok(crate::PullResult { - config_digest, - config_verity, + manifest_digest: result.manifest_digest, + manifest_verity: result.manifest_verity, + config_digest: result.config_digest, + config_verity: result.config_verity, stats, }) } @@ -1266,4 +1314,246 @@ mod test { ); assert_eq!(empty.total_objects(), 0); } + + /// End-to-end test: multi-layer OCI image with nontrivial whiteout usage. + /// + /// Builds three tar layers exercising individual file whiteouts (`.wh.`) + /// and opaque directory whiteouts (`.wh..wh..opq`), imports them through the + /// full OCI pipeline (tar → splitstream → OCI config/manifest → EROFS), and + /// verifies the resulting filesystem contains exactly the expected files. 
+ #[tokio::test] + async fn test_whiteout_multi_layer_import() { + use composefs::test::TestRepo; + use containers_image_proxy::oci_spec::image::{ + ConfigBuilder, DescriptorBuilder, ImageConfigurationBuilder, ImageManifestBuilder, + MediaType, RootFsBuilder, + }; + + // --- Tar builder helpers (local to this test) --- + + fn tar_dir(builder: &mut ::tar::Builder>, name: &str) { + let mut header = ::tar::Header::new_ustar(); + header.set_uid(0); + header.set_gid(0); + header.set_mode(0o755); + header.set_entry_type(::tar::EntryType::Directory); + header.set_size(0); + builder + .append_data(&mut header, name, std::io::empty()) + .unwrap(); + } + + fn tar_file(builder: &mut ::tar::Builder>, name: &str, content: &[u8]) { + let mut header = ::tar::Header::new_ustar(); + header.set_uid(0); + header.set_gid(0); + header.set_mode(0o644); + header.set_entry_type(::tar::EntryType::Regular); + header.set_size(content.len() as u64); + builder.append_data(&mut header, name, content).unwrap(); + } + + /// Zero-length regular file — used for `.wh.` and `.wh..wh..opq` entries. 
+ fn tar_whiteout(builder: &mut ::tar::Builder>, name: &str) { + tar_file(builder, name, &[]); + } + + // --- Build the three layers --- + + // Layer 1 (base): create initial filesystem + let layer1 = { + let mut b = ::tar::Builder::new(vec![]); + tar_dir(&mut b, "etc"); + tar_file(&mut b, "etc/config.toml", b"[server]\nport = 8080\n"); + tar_file(&mut b, "etc/hosts", b"127.0.0.1 localhost\n"); + tar_dir(&mut b, "usr"); + tar_dir(&mut b, "usr/bin"); + tar_file(&mut b, "usr/bin/app", b"#!/bin/sh\necho hello\n"); + tar_dir(&mut b, "usr/lib"); + tar_file(&mut b, "usr/lib/old-lib.so", b"fake-old-lib-content"); + tar_file(&mut b, "usr/lib/shared.so", b"fake-shared-lib-content"); + tar_dir(&mut b, "tmp"); + tar_dir(&mut b, "tmp/cache"); + tar_file(&mut b, "tmp/cache/data.bin", b"cached-data-payload"); + tar_file(&mut b, "tmp/cache/index.db", b"cached-index-payload"); + b.into_inner().unwrap() + }; + + // Layer 2 (whiteout + modify): + // - delete /etc/hosts (file whiteout) + // - delete /usr/lib/old-lib.so (file whiteout) + // - add /etc/hosts.new (replacement) + // - opaque whiteout on /tmp/cache (clears data.bin + index.db) + // - add /tmp/cache/fresh.bin (re-populate after opaque) + let layer2 = { + let mut b = ::tar::Builder::new(vec![]); + tar_dir(&mut b, "etc"); + tar_whiteout(&mut b, "etc/.wh.hosts"); + tar_file(&mut b, "etc/hosts.new", b"127.0.0.1 localhost.new\n"); + tar_dir(&mut b, "usr"); + tar_dir(&mut b, "usr/lib"); + tar_whiteout(&mut b, "usr/lib/.wh.old-lib.so"); + tar_dir(&mut b, "tmp"); + tar_dir(&mut b, "tmp/cache"); + tar_whiteout(&mut b, "tmp/cache/.wh..wh..opq"); + tar_file(&mut b, "tmp/cache/fresh.bin", b"fresh-cache-content"); + b.into_inner().unwrap() + }; + + // Layer 3 (more whiteouts): + // - delete /usr/bin/app (file whiteout) + // - add /usr/bin/app-v2 (replacement) + let layer3 = { + let mut b = ::tar::Builder::new(vec![]); + tar_dir(&mut b, "usr"); + tar_dir(&mut b, "usr/bin"); + tar_whiteout(&mut b, "usr/bin/.wh.app"); + tar_file(&mut b, 
"usr/bin/app-v2", b"#!/bin/sh\necho hello v2\n"); + b.into_inner().unwrap() + }; + + // --- Import layers and build OCI image --- + + let test_repo = TestRepo::::new(); + let repo = &test_repo.repo; + + let layers_data = [&layer1[..], &layer2[..], &layer3[..]]; + let mut layer_digests = Vec::new(); + let mut layer_verities_map: HashMap, composefs::fsverity::Sha256HashValue> = + HashMap::new(); + let mut layer_descriptors = Vec::new(); + + for tar_data in &layers_data { + let digest = hash_sha256(tar_data); + let (verity, _stats) = import_layer(repo, &digest, None, *tar_data).await.unwrap(); + + let descriptor = DescriptorBuilder::default() + .media_type(MediaType::ImageLayerGzip) + .digest(digest.clone()) + .size(tar_data.len() as u64) + .build() + .unwrap(); + + layer_verities_map.insert(digest.to_string().into_boxed_str(), verity); + layer_digests.push(digest.to_string()); + layer_descriptors.push(descriptor); + } + + // Build OCI config + let rootfs = RootFsBuilder::default() + .typ("layers") + .diff_ids(layer_digests.clone()) + .build() + .unwrap(); + + let cfg = ConfigBuilder::default().build().unwrap(); + + let config = ImageConfigurationBuilder::default() + .architecture("amd64") + .os("linux") + .rootfs(rootfs) + .config(cfg) + .build() + .unwrap(); + + let config_json = config.to_string().unwrap(); + let config_digest = hash_sha256(config_json.as_bytes()); + + let mut config_stream = repo.create_stream(skopeo::OCI_CONFIG_CONTENT_TYPE).unwrap(); + for (digest, verity) in &layer_verities_map { + config_stream.add_named_stream_ref(digest, verity); + } + config_stream + .write_external(config_json.as_bytes()) + .unwrap(); + let config_verity = repo + .write_stream(config_stream, &config_identifier(&config_digest), None) + .unwrap(); + + // Build OCI manifest + let config_descriptor = DescriptorBuilder::default() + .media_type(MediaType::ImageConfig) + .digest(config_digest.clone()) + .size(config_json.len() as u64) + .build() + .unwrap(); + + let manifest = 
ImageManifestBuilder::default() + .schema_version(2u32) + .media_type(MediaType::ImageManifest) + .config(config_descriptor) + .layers(layer_descriptors) + .build() + .unwrap(); + + let manifest_json = manifest.to_string().unwrap(); + let manifest_digest = hash_sha256(manifest_json.as_bytes()); + + let (_stored_digest, manifest_verity) = oci_image::write_manifest( + repo, + &manifest, + &manifest_digest, + &config_verity, + &layer_verities_map, + Some("whiteout-test:v1"), + ) + .unwrap(); + + // --- Create the EROFS image --- + + let erofs_id = ensure_oci_composefs_erofs( + repo, + &manifest_digest, + Some(&manifest_verity), + Some("whiteout-test:v1"), + ) + .unwrap() + .expect("container image should produce EROFS"); + + // --- Verify the flattened filesystem --- + + let erofs_data = repo.read_object(&erofs_id).unwrap(); + let fs = + composefs::erofs::reader::erofs_to_filesystem::(&erofs_data).unwrap(); + let mut dump = Vec::new(); + composefs::dumpfile::write_dumpfile(&mut dump, &fs).unwrap(); + let dump = String::from_utf8(dump).unwrap(); + + // Extract just the paths from the dumpfile for structural verification + let paths: Vec<&str> = dump.lines().map(|l| l.split_once(' ').unwrap().0).collect(); + + // Files that SHOULD exist after all three layers + let expected_present = [ + "/", + "/etc", + "/etc/config.toml", // layer 1, survived + "/etc/hosts.new", // layer 2 addition + "/tmp", + "/tmp/cache", + "/tmp/cache/fresh.bin", // layer 2, after opaque whiteout + "/usr", + "/usr/bin", + "/usr/bin/app-v2", // layer 3 replacement + "/usr/lib", + "/usr/lib/shared.so", // layer 1, survived + ]; + + // Files that MUST NOT exist (removed by whiteouts) + let must_not_exist = [ + "/etc/hosts", // deleted by layer 2 file whiteout + "/usr/lib/old-lib.so", // deleted by layer 2 file whiteout + "/usr/bin/app", // deleted by layer 3 file whiteout + "/tmp/cache/data.bin", // cleared by layer 2 opaque whiteout + "/tmp/cache/index.db", // cleared by layer 2 opaque whiteout + ]; 
+ + similar_asserts::assert_eq!(paths, expected_present); + + for path in &must_not_exist { + assert!( + !paths.contains(path), + "{path} should have been removed by whiteout but is still present" + ); + } + } } diff --git a/crates/composefs-oci/src/oci_image.rs b/crates/composefs-oci/src/oci_image.rs index 3c2c5b9e..da624254 100644 --- a/crates/composefs-oci/src/oci_image.rs +++ b/crates/composefs-oci/src/oci_image.rs @@ -574,7 +574,7 @@ pub fn list_images( }); } Err(e) => { - eprintln!("Warning: skipping image {name}: {e:#}"); + tracing::warn!("skipping image {name}: {e:#}"); continue; } } diff --git a/crates/composefs/Cargo.toml b/crates/composefs/Cargo.toml index 0e732372..d8c78357 100644 --- a/crates/composefs/Cargo.toml +++ b/crates/composefs/Cargo.toml @@ -23,7 +23,7 @@ fn-error-context = "0.2" hex = { version = "0.4.0", default-features = false, features = ["std"] } log = { version = "0.4.8", default-features = false } once_cell = { version = "1.21.3", default-features = false, features = ["std"] } -rustix = { version = "1.0.0", default-features = false, features = ["fs", "mount", "process", "std"] } +rustix = { version = "1.0.0", default-features = false, features = ["fs", "mount", "process", "std", "thread"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } sha2 = { version = "0.11.0", default-features = false } thiserror = { version = "2.0.0", default-features = false } diff --git a/crates/composefs/src/repository.rs b/crates/composefs/src/repository.rs index ff3bdff7..9bd84c9a 100644 --- a/crates/composefs/src/repository.rs +++ b/crates/composefs/src/repository.rs @@ -1325,6 +1325,19 @@ impl Repository { size: u64, ) -> Result<(ObjectID, ObjectStoreMethod)> { use crate::fsverity::enable_verity_with_retry; + use rustix::thread::{CapabilitySet, capabilities}; + + // AT_EMPTY_PATH linkat requires CAP_DAC_READ_SEARCH. Check upfront + // so callers get a clear error instead of a confusing ENOENT. 
+ let has_cap = capabilities(None) + .map(|caps| caps.effective.contains(CapabilitySet::DAC_READ_SEARCH)) + .unwrap_or(false); + if !has_cap { + anyhow::bail!( + "hardlinking objects requires CAP_DAC_READ_SEARCH \ + (run as root or use the copy fallback)" + ); + } let objects_dir = self.objects_dir()?; diff --git a/crates/cstorage/src/image.rs b/crates/cstorage/src/image.rs index 0b1d4814..51ea6bdc 100644 --- a/crates/cstorage/src/image.rs +++ b/crates/cstorage/src/image.rs @@ -78,6 +78,21 @@ impl Image { &self.id } + /// Read the raw manifest JSON bytes. + /// + /// Returns the original manifest bytes as stored on disk, preserving + /// whitespace and field ordering for content-addressed hashing. + /// + /// # Errors + /// + /// Returns an error if the manifest file cannot be read. + pub fn read_manifest_raw(&self) -> Result> { + let mut file = self.image_dir.open(MANIFEST_FILENAME)?; + let mut data = Vec::new(); + file.read_to_end(&mut data)?; + Ok(data) + } + /// Read and parse the image manifest. /// /// The manifest is stored as a JSON file named "manifest" in the image directory. diff --git a/crates/cstorage/src/tar_split.rs b/crates/cstorage/src/tar_split.rs index c3eeed57..5f6a418c 100644 --- a/crates/cstorage/src/tar_split.rs +++ b/crates/cstorage/src/tar_split.rs @@ -18,7 +18,7 @@ //! - Type 2 (SegmentType): Raw TAR header bytes and padding (base64-encoded) //! 
- CRC64-ISO algorithm for checksums -use std::io::{BufRead, BufReader, Read}; +use std::io::{BufRead, BufReader, Read, Seek}; use std::os::fd::OwnedFd; use base64::prelude::*; @@ -610,8 +610,8 @@ impl TarSplitFdStream { if let Some(ref crc64_b64) = crc64 { self.verify_crc64(&mut file, crc64_b64, file_size)?; - // Reopen file since we consumed it for CRC check - file = self.open_file_in_chain(path)?; + // Seek back to start after CRC verification consumed the file + file.rewind().map_err(StorageError::Io)?; } // Convert to OwnedFd and return diff --git a/crates/cstorage/src/userns_helper.rs b/crates/cstorage/src/userns_helper.rs index f28ef6dc..339e1293 100644 --- a/crates/cstorage/src/userns_helper.rs +++ b/crates/cstorage/src/userns_helper.rs @@ -158,7 +158,7 @@ pub struct GetImageResult { /// Image names. pub names: Vec, /// Layer diff IDs (sha256:...). - pub layer_diff_ids: Vec, + pub layer_diff_ids: Vec, /// Storage layer IDs (internal IDs used by containers-storage). pub storage_layer_ids: Vec, } @@ -657,11 +657,11 @@ fn handle_get_image( } }; - let diff_ids: Vec = config + let diff_ids: Vec = config .rootfs() .diff_ids() .iter() - .map(|s| s.to_string()) + .map(|s| s.parse().expect("config diff_id should be valid digest")) .collect(); let storage_layer_ids = match image.storage_layer_ids(std::slice::from_ref(&storage)) { diff --git a/crates/integration-tests/Cargo.toml b/crates/integration-tests/Cargo.toml index 41b3063d..718d3b1b 100644 --- a/crates/integration-tests/Cargo.toml +++ b/crates/integration-tests/Cargo.toml @@ -1,6 +1,9 @@ [package] name = "integration-tests" publish = false +description = "Integration tests for composefs-rs (not published)" +autobins = false +autotests = false edition.workspace = true license.workspace = true @@ -8,25 +11,42 @@ repository.workspace = true rust-version.workspace = true version.workspace = true +# The integration test runner is declared as both [[bin]] (for the container +# image build) and [[test]] (so nextest 
discovers the libtest-mimic tests). +# The integration-tests crate is excluded from workspace default-members so +# that plain `cargo test` does not run these; use `just test-integration`. +# Cargo warns about the shared source file — this is harmless. [[bin]] name = "cfsctl-integration-tests" path = "src/main.rs" +[[test]] +name = "cfsctl-integration-tests" +path = "src/main.rs" +harness = false + +[[bin]] +name = "test-cleanup" +path = "src/cleanup.rs" + [dependencies] anyhow = "1" cap-std-ext = "5.0" composefs = { workspace = true } # Only the test_util module is used — for creating test OCI images. # All verification must go through the cfsctl CLI. -composefs-oci = { workspace = true, features = ["test", "boot"] } +composefs-oci = { workspace = true, features = ["test", "boot", "containers-storage"] } +hex = "0.4" libtest-mimic = "0.8" linkme = "0.3" ocidir = "0.7" paste = "1" -rustix = { version = "1.0.0", default-features = false, features = ["process"] } -serde_json = "1.0" +rustix = { version = "1", features = ["fs", "process"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" tar = "0.4" tempfile = "3" +tokio = { version = "1", features = ["rt-multi-thread", "macros"] } xshell = "0.2" [lints] diff --git a/crates/integration-tests/src/cleanup.rs b/crates/integration-tests/src/cleanup.rs new file mode 100644 index 00000000..6a2ef8d5 --- /dev/null +++ b/crates/integration-tests/src/cleanup.rs @@ -0,0 +1,54 @@ +//! Cleanup utility for integration test resources +//! +//! This binary cleans up any leftover resources from integration tests. 
+ +use std::process::Command; + +use integration_tests::INTEGRATION_TEST_LABEL; + +fn main() { + println!("Cleaning up integration test resources..."); + + // Clean up podman containers with our label + let output = Command::new("podman") + .args([ + "ps", + "-a", + "--filter", + &format!("label={}", INTEGRATION_TEST_LABEL), + "-q", + ]) + .output(); + + if let Ok(output) = output { + let container_ids = String::from_utf8_lossy(&output.stdout); + for id in container_ids.lines() { + if !id.is_empty() { + println!("Removing container: {}", id); + let _ = Command::new("podman").args(["rm", "-f", id]).output(); + } + } + } + + // Clean up podman images with our label + let output = Command::new("podman") + .args([ + "images", + "--filter", + &format!("label={}", INTEGRATION_TEST_LABEL), + "-q", + ]) + .output(); + + if let Ok(output) = output { + let image_ids = String::from_utf8_lossy(&output.stdout); + for id in image_ids.lines() { + if !id.is_empty() { + println!("Removing image: {}", id); + let _ = Command::new("podman").args(["rmi", "-f", id]).output(); + } + } + } + + println!("Cleanup complete."); +} diff --git a/crates/integration-tests/src/lib.rs b/crates/integration-tests/src/lib.rs index 84391096..c86ebfb1 100644 --- a/crates/integration-tests/src/lib.rs +++ b/crates/integration-tests/src/lib.rs @@ -7,6 +7,14 @@ // linkme requires unsafe for distributed slices #![allow(unsafe_code)] +use std::process::Command; +use std::sync::Arc; + +use anyhow::Result; +use composefs_oci::composefs::fsverity::{Algorithm, Sha256HashValue}; +use composefs_oci::composefs::repository::Repository; +use tempfile::TempDir; + /// A test function that returns a Result. pub type TestFn = fn() -> anyhow::Result<()>; @@ -50,3 +58,120 @@ macro_rules! 
integration_test { } }; } + +// ============================================================================ +// Utilities for containers-storage tests +// ============================================================================ + +/// Test label for cleanup +pub const INTEGRATION_TEST_LABEL: &str = "composefs-rs.integration-test=1"; + +/// Get the path to cfsctl binary +pub fn get_cfsctl_path() -> Result { + // Check environment first + if let Ok(path) = std::env::var("CFSCTL_PATH") { + return Ok(path); + } + // Look in common locations + for path in [ + "./target/release/cfsctl", + "./target/debug/cfsctl", + "/usr/bin/cfsctl", + ] { + if std::path::Path::new(path).exists() { + return Ok(path.to_string()); + } + } + anyhow::bail!("cfsctl not found; set CFSCTL_PATH or build with `cargo build --release`") +} + +/// Get the primary test image +pub fn get_primary_image() -> String { + std::env::var("COMPOSEFS_RS_PRIMARY_IMAGE") + .unwrap_or_else(|_| "quay.io/centos-bootc/centos-bootc:stream10".to_string()) +} + +/// Get all test images +pub fn get_all_images() -> Vec { + std::env::var("COMPOSEFS_RS_ALL_IMAGES") + .unwrap_or_else(|_| get_primary_image()) + .split_whitespace() + .map(String::from) + .collect() +} + +/// Create a test repository in a temporary directory. +/// +/// Initializes a new insecure SHA-256 repository. 
+pub fn create_test_repository(tempdir: &TempDir) -> Result>> { + let fd = rustix::fs::open( + tempdir.path(), + rustix::fs::OFlags::CLOEXEC | rustix::fs::OFlags::RDONLY, + 0.into(), + )?; + + let (mut repo, _created) = + Repository::::init_path(&fd, ".", Algorithm::SHA256, false)?; + repo.set_insecure(); + Ok(Arc::new(repo)) +} + +fn podman_command() -> Command { + Command::new("podman") +} + +/// Build a minimal test image using podman and return its ID +pub fn build_test_image() -> Result { + let temp_dir = TempDir::new()?; + let containerfile = temp_dir.path().join("Containerfile"); + + // Create a simple Containerfile with various file sizes to test + // both inline and external storage paths. + // Use centos instead of busybox because busybox has UID 65534 which + // breaks in nested container environments due to user namespace issues. + // Include /boot and /sysroot so the image is compatible with --bootable + // (transform_for_boot requires these directories). + std::fs::write( + &containerfile, + r#"FROM quay.io/centos/centos:stream10 +# Small file (should be inlined) +RUN echo "small content" > /small.txt +# Larger file (should be external) +RUN dd if=/dev/zero of=/large.bin bs=1024 count=100 2>/dev/null +# Directory with files +RUN mkdir -p /testdir && echo "file1" > /testdir/a.txt && echo "file2" > /testdir/b.txt +# Symlink +RUN ln -s /small.txt /link.txt +# Directories required by composefs boot transformations +RUN mkdir -p /boot /sysroot +"#, + )?; + + let iid_file = temp_dir.path().join("image.iid"); + + let output = podman_command() + .args([ + "build", + "--pull=newer", + &format!("--iidfile={}", iid_file.display()), + "-f", + &containerfile.to_string_lossy(), + &temp_dir.path().to_string_lossy(), + ]) + .output()?; + + if !output.status.success() { + anyhow::bail!( + "podman build failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + } + + let image_id = std::fs::read_to_string(&iid_file)?.trim().to_string(); + Ok(image_id) +} + +/// 
Remove a test image +pub fn cleanup_test_image(image_id: &str) { + let _ = podman_command().args(["rmi", "-f", image_id]).output(); +} diff --git a/crates/integration-tests/src/main.rs b/crates/integration-tests/src/main.rs index e54dc518..87b70f1c 100644 --- a/crates/integration-tests/src/main.rs +++ b/crates/integration-tests/src/main.rs @@ -3,6 +3,10 @@ //! This binary uses [`libtest_mimic`] as a custom test harness (no `#[test]`). //! Tests are registered via the [`integration_test!`] macro in submodules //! and collected from the [`INTEGRATION_TESTS`] distributed slice at startup. +//! +//! IMPORTANT: This binary may be re-executed via `podman unshare` to act as a +//! userns helper for rootless containers-storage access. The init_if_helper() +//! call at the start of main() handles this. // linkme requires unsafe for distributed slices #![allow(unsafe_code)] @@ -71,6 +75,11 @@ pub(crate) fn create_test_rootfs(parent: &Path) -> Result { } fn main() { + // CRITICAL: Handle userns helper re-execution. + // When running rootless, this binary may be re-executed via `podman unshare` + // to act as a helper process for containers-storage access. + composefs_oci::cstor::init_if_helper(); + let args = Arguments::from_args(); let tests: Vec = INTEGRATION_TESTS diff --git a/crates/integration-tests/src/tests/cstor.rs b/crates/integration-tests/src/tests/cstor.rs new file mode 100644 index 00000000..73987b50 --- /dev/null +++ b/crates/integration-tests/src/tests/cstor.rs @@ -0,0 +1,466 @@ +//! Tests for containers-storage import functionality. +//! +//! These tests verify that importing from containers-storage produces identical +//! results to importing via skopeo/tar streaming. +//! +//! These tests require `podman unshare` which needs user namespace support. +//! On environments without proper user namespace support (like GHA runners), +//! they dispatch to a bcvk VM like other privileged tests. 
+ +use anyhow::Result; +use tempfile::TempDir; +use xshell::{Shell, cmd}; + +use integration_tests::{build_test_image, cleanup_test_image, create_test_repository}; + +use crate::integration_test; +use crate::tests::privileged::{require_privileged, require_userns}; + +/// Helper: copy an image into a separate containers-storage root. +/// +/// Uses skopeo to copy from the default store to a standalone overlay +/// storage at `dest/storage`. +fn copy_image_to_separate_store(sh: &Shell, image_id: &str, dest: &std::path::Path) -> Result<()> { + let dest_str = dest.display().to_string(); + let storage_root = format!("{dest_str}/storage"); + let run_root = format!("{dest_str}/run"); + + // skopeo needs the raw hex ID without the sha256: prefix + let raw_id = image_id.strip_prefix("sha256:").unwrap_or(image_id); + let src_ref = format!("containers-storage:{raw_id}"); + // Use [overlay@+] to target the separate storage + let dest_ref = format!("containers-storage:[overlay@{storage_root}+{run_root}]test:latest"); + + // Use podman unshare so that skopeo has UID/GID mappings available + // for unpacking layers with non-root ownership. + if rustix::process::getuid().is_root() { + cmd!(sh, "skopeo copy {src_ref} {dest_ref}").run()?; + } else { + cmd!(sh, "podman unshare skopeo copy {src_ref} {dest_ref}").run()?; + } + + Ok(()) +} + +/// Test that containers-storage import produces identical results to skopeo/tar import. +/// +/// This is a critical correctness test: both import paths should produce the +/// exact same splitstream digests because they represent the same content. +/// +/// Requires a VM because skopeo's containers-storage transport also needs user +/// namespaces internally, and that fails on GHA runners even when podman unshare works. 
+fn privileged_test_cstor_vs_skopeo_equivalence() -> Result<()> { + if require_privileged("privileged_test_cstor_vs_skopeo_equivalence")?.is_some() { + return Ok(()); + } + let sh = Shell::new()?; + let rt = tokio::runtime::Runtime::new()?; + rt.block_on(async { + println!("Building test image..."); + let test_image = build_test_image()?; + println!("Built test image: {}", test_image); + + // Create two separate repositories for comparison + let cstor_repo_dir = TempDir::new()?; + let skopeo_repo_dir = TempDir::new()?; + + let cstor_repo = create_test_repository(&cstor_repo_dir)?; + let skopeo_repo = create_test_repository(&skopeo_repo_dir)?; + + // Import via containers-storage (reflink path) + let cstor_image_ref = format!("containers-storage:{}", test_image); + println!("Importing via containers-storage: {}", cstor_image_ref); + let cstor_opts = composefs_oci::PullOptions { + local_fetch: composefs_oci::LocalFetchOpt::IfPossible, + ..Default::default() + }; + let cstor_result = + composefs_oci::pull(&cstor_repo, &cstor_image_ref, None, cstor_opts).await?; + + // Import via skopeo (tar streaming path) - copy to OCI directory first + let oci_dir = TempDir::new()?; + let oci_path = oci_dir.path().join("image"); + + // Use skopeo to copy from containers-storage to oci directory + // Strip sha256: prefix for skopeo compatibility + let image_id_for_skopeo = test_image.strip_prefix("sha256:").unwrap_or(&test_image); + let cstor_ref = format!("containers-storage:{}", image_id_for_skopeo); + let oci_ref = format!("oci:{}:test", oci_path.display()); + println!("Copying to OCI dir via skopeo..."); + cmd!(sh, "skopeo copy {cstor_ref} {oci_ref}").run()?; + + // Import from the OCI directory via skopeo/tar path + let skopeo_image_ref = format!("oci:{}:test", oci_path.display()); + println!("Importing via skopeo/OCI: {}", skopeo_image_ref); + let (skopeo_pull_result, _skopeo_stats) = + composefs_oci::pull_image(&skopeo_repo, &skopeo_image_ref, None, None).await?; + let 
(skopeo_config_digest, skopeo_config_verity) = skopeo_pull_result.into_config(); + + // Get layer maps from both configs + let cstor_oc = composefs_oci::open_config( + &cstor_repo, + &cstor_result.config_digest, + Some(&cstor_result.config_verity), + )?; + let skopeo_oc = composefs_oci::open_config( + &skopeo_repo, + &skopeo_config_digest, + Some(&skopeo_config_verity), + )?; + + // Compare results + println!("CSTOR config digest: {}", cstor_result.config_digest); + println!("SKOPEO config digest: {}", skopeo_config_digest); + assert_eq!( + cstor_result.config_digest, skopeo_config_digest, + "config digests must match" + ); + + println!("CSTOR layers: {:?}", cstor_oc.layer_refs); + println!("SKOPEO layers: {:?}", skopeo_oc.layer_refs); + assert_eq!( + cstor_oc.layer_refs, skopeo_oc.layer_refs, + "layer verity IDs must match" + ); + + println!("CSTOR config verity: {:?}", cstor_result.config_verity); + println!("SKOPEO config verity: {:?}", skopeo_config_verity); + + // NOTE: Config verity IDs may differ due to layer ref ordering. + // The skopeo path sorts layers by size for parallel fetching, then adds + // named refs in that order. The cstor path adds refs in config order. + // Both produce valid splitstreams with correct content, but different verity. + // TODO: Fix the ordering discrepancy in one of the implementations. + if cstor_result.config_verity != skopeo_config_verity { + println!( + "WARNING: Config verity IDs differ due to layer ref ordering. \ + Content is equivalent but splitstream structure differs." + ); + } + + println!("SUCCESS: Both import paths produced equivalent content"); + println!(" Config digest: {}", cstor_result.config_digest); + println!(" Layers: {}", cstor_oc.layer_refs.len()); + + Ok(()) + }) +} +integration_test!(privileged_test_cstor_vs_skopeo_equivalence); + +/// Test that importing the same image twice produces identical results (idempotency). 
+/// +/// The second import should return the same verity IDs, and import stats should +/// reflect that layers came from cache. +/// +/// Requires user namespace support (podman unshare), so runs only in privileged/VM tests. +fn privileged_test_cstor_idempotent_import() -> Result<()> { + if require_userns("privileged_test_cstor_idempotent_import")?.is_some() { + return Ok(()); + } + let rt = tokio::runtime::Runtime::new()?; + rt.block_on(async { + println!("Building test image..."); + let test_image = build_test_image()?; + println!("Built test image: {}", test_image); + + let repo_dir = TempDir::new()?; + let repo = create_test_repository(&repo_dir)?; + + let cstor_image_ref = format!("containers-storage:{}", test_image); + + // First import + println!("First import via containers-storage..."); + let cstor_opts = composefs_oci::PullOptions { + local_fetch: composefs_oci::LocalFetchOpt::IfPossible, + ..Default::default() + }; + let first_result = composefs_oci::pull(&repo, &cstor_image_ref, None, cstor_opts).await?; + + // Second import of the same image + println!("Second import via containers-storage (should use cache)..."); + let cstor_opts = composefs_oci::PullOptions { + local_fetch: composefs_oci::LocalFetchOpt::IfPossible, + ..Default::default() + }; + let second_result = composefs_oci::pull(&repo, &cstor_image_ref, None, cstor_opts).await?; + + // Verify idempotency: both imports should produce identical results + assert_eq!( + first_result.config_digest, second_result.config_digest, + "config digests must match between imports" + ); + assert_eq!( + first_result.config_verity, second_result.config_verity, + "config verity IDs must match between imports" + ); + + // Verify layer verity IDs match + let first_oc = composefs_oci::open_config( + &repo, + &first_result.config_digest, + Some(&first_result.config_verity), + )?; + let second_oc = composefs_oci::open_config( + &repo, + &second_result.config_digest, + Some(&second_result.config_verity), + )?; + 
assert_eq!( + first_oc.layer_refs, second_oc.layer_refs, + "layer verity IDs must match between imports" + ); + + // Check import stats: second import should find objects already present + let first_stats = &first_result.stats; + let second_stats = &second_result.stats; + println!("First import stats: {:?}", first_stats); + println!("Second import stats: {:?}", second_stats); + + // The first import should have copied some objects + assert!( + first_stats.objects_copied > 0, + "first import should copy objects" + ); + + // The second import should find everything already present + assert_eq!( + second_stats.objects_copied, 0, + "second import should not copy any new objects" + ); + + println!("SUCCESS: Idempotent import produced identical results"); + println!(" Config digest: {}", first_result.config_digest); + println!(" Layers: {}", first_oc.layer_refs.len()); + println!( + " Second import: {} objects already present", + second_stats.objects_already_present + ); + + Ok(()) + }) +} +integration_test!(privileged_test_cstor_idempotent_import); + +/// Test that importing with a reference parameter creates an OCI manifest tag. +/// +/// The cstor import path now creates full OCI structure (manifest + config + +/// layers), so the reference becomes an OCI-style manifest tag visible via +/// `list_refs()`. +/// +/// Requires user namespace support (podman unshare), so runs only in privileged/VM tests. 
+fn privileged_test_cstor_import_with_reference() -> Result<()> { + if require_userns("privileged_test_cstor_import_with_reference")?.is_some() { + return Ok(()); + } + let rt = tokio::runtime::Runtime::new()?; + rt.block_on(async { + println!("Building test image..."); + let test_image = build_test_image()?; + println!("Built test image: {}", test_image); + + let repo_dir = TempDir::new()?; + let repo = create_test_repository(&repo_dir)?; + + let cstor_image_ref = format!("containers-storage:{}", test_image); + let reference_name = "test-ref"; + + // Import with a reference name + println!("Importing with reference: {}", reference_name); + let cstor_opts = composefs_oci::PullOptions { + local_fetch: composefs_oci::LocalFetchOpt::IfPossible, + ..Default::default() + }; + let result = + composefs_oci::pull(&repo, &cstor_image_ref, Some(reference_name), cstor_opts).await?; + + println!("Import complete. Config digest: {}", result.config_digest); + println!( + "Import complete. Manifest digest: {}", + result.manifest_digest + ); + + // Verify the OCI tag was created — it should be visible via list_refs() + let refs = composefs_oci::oci_image::list_refs(&repo)?; + let found = refs.iter().any(|(name, _)| name == reference_name); + assert!( + found, + "reference '{}' should appear in OCI refs, got: {:?}", + reference_name, + refs.iter().map(|(n, _)| n).collect::>() + ); + + // The OCI ref should resolve to the manifest we imported + let ref_path = repo_dir + .path() + .join("streams/refs/oci") + .join(reference_name); + assert!( + ref_path.is_symlink(), + "OCI reference '{}' should exist as symlink at {:?}", + reference_name, + ref_path + ); + + let target = std::fs::read_link(&ref_path)?; + let target_str = target.to_string_lossy(); + assert!( + target_str.contains("oci-manifest-"), + "reference should point to oci-manifest stream, got: {}", + target_str + ); + + println!("SUCCESS: Import with reference created OCI manifest tag"); + println!(" Reference: {}", 
reference_name); + println!(" Manifest digest: {}", result.manifest_digest); + + Ok(()) + }) +} +integration_test!(privileged_test_cstor_import_with_reference); + +/// Test that importing from an additional image store works. +/// +/// This exercises the `STORAGE_OPTS=additionalimagestore=` mechanism +/// used by bcvk to expose the host's containers-storage inside a VM. +/// +/// The test: +/// 1. Builds an image in the default store +/// 2. Copies it to a separate overlay store in a temp directory +/// 3. Removes it from the default store +/// 4. Runs `cfsctl oci pull` with `STORAGE_OPTS` in the command environment +/// (avoiding process-global env mutation) +fn privileged_test_cstor_additional_image_store() -> Result<()> { + if require_userns("privileged_test_cstor_additional_image_store")?.is_some() { + return Ok(()); + } + let sh = Shell::new()?; + let cfsctl = crate::cfsctl()?; + + println!("Building test image..."); + let test_image = build_test_image()?; + println!("Built test image: {}", test_image); + + // Copy the image to a separate store + let separate_store = TempDir::new()?; + println!( + "Copying image to separate store at {}...", + separate_store.path().display() + ); + copy_image_to_separate_store(&sh, &test_image, separate_store.path())?; + + // Remove from the default store so the only way to find it is via + // the additional image store + cleanup_test_image(&test_image); + + // Set STORAGE_OPTS in the command environment (not process-global) + let additional_store = separate_store.path().join("storage"); + let storage_opts = format!("additionalimagestore={}", additional_store.display()); + println!("Running cfsctl with STORAGE_OPTS={}", storage_opts); + + let repo_dir = TempDir::new()?; + let repo = repo_dir.path(); + let cstor_image_ref = format!("containers-storage:{}", test_image); + + // Initialize the repository first + cmd!(sh, "{cfsctl} --insecure --repo {repo} init").run()?; + + // Run cfsctl as an external process with STORAGE_OPTS in 
its environment + let output = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci pull --local-fetch auto {cstor_image_ref}" + ) + .env("STORAGE_OPTS", &storage_opts) + .read()?; + + println!("SUCCESS: Imported from additional image store"); + println!(" cfsctl output: {}", output); + + // Verify the pull succeeded by checking the output contains a config digest + assert!( + output.contains("sha256:"), + "expected sha256 digest in pull output, got: {output}" + ); + + Ok(()) +} +integration_test!(privileged_test_cstor_additional_image_store); + +/// Test that `--bootable` works for containers-storage imports. +/// +/// This verifies the full end-to-end flow: +/// 1. Build a test image (non-bootable) in containers-storage +/// 2. Pull it via `cfsctl oci pull --bootable` using the cstor path +/// 3. Verify the CLI succeeds and the output includes the boot image verity +/// +/// Even though the test image doesn't contain actual boot content (no UKI, +/// no kernel), `--bootable` should still succeed — `transform_for_boot` +/// gracefully handles images without boot entries by applying SELinux +/// relabeling and emptying /boot and /sysroot. 
+fn privileged_test_cstor_bootable() -> Result<()> { + if require_userns("privileged_test_cstor_bootable")?.is_some() { + return Ok(()); + } + let sh = Shell::new()?; + let cfsctl = crate::cfsctl()?; + + println!("Building test image..."); + let test_image = build_test_image()?; + println!("Built test image: {}", test_image); + + let repo_dir = TempDir::new()?; + let repo = repo_dir.path(); + let cstor_image_ref = format!("containers-storage:{}", test_image); + + // Initialize the repository first + cmd!(sh, "{cfsctl} --insecure --repo {repo} init").run()?; + + // Pull with --bootable via the cstor path + let output = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci pull --local-fetch auto --bootable {cstor_image_ref}" + ) + .read()?; + + println!("cfsctl output:\n{}", output); + + // Verify pull succeeded with expected output + assert!( + output.contains("manifest"), + "expected 'manifest' in output, got: {output}" + ); + assert!( + output.contains("Boot image:"), + "expected 'Boot image:' in output (--bootable should produce a boot EROFS), got: {output}" + ); + + // Verify the image is visible via oci images (OCI manifest tag was created) + let ls_output = cmd!(sh, "{cfsctl} --insecure --repo {repo} oci images --json").read()?; + let images: serde_json::Value = serde_json::from_str(&ls_output)?; + let images_arr = images.as_array().expect("oci ls should return array"); + assert!( + !images_arr.is_empty(), + "oci ls should show at least one image after cstor pull" + ); + + // Verify the boot EROFS ref is visible via inspect + let tag_name = &cstor_image_ref; + let inspect_output = cmd!( + sh, + "{cfsctl} --insecure --repo {repo} oci inspect {tag_name}" + ) + .read()?; + let inspect: serde_json::Value = serde_json::from_str(&inspect_output)?; + assert!( + inspect.get("composefs_erofs").is_some(), + "inspect should show composefs_erofs field" + ); + assert!( + inspect.get("composefs_boot_erofs").is_some(), + "inspect should show composefs_boot_erofs field after 
--bootable pull" + ); + + println!("SUCCESS: --bootable works for containers-storage imports"); + + Ok(()) +} +integration_test!(privileged_test_cstor_bootable); diff --git a/crates/integration-tests/src/tests/mod.rs b/crates/integration-tests/src/tests/mod.rs index f94b78b8..0e970a9c 100644 --- a/crates/integration-tests/src/tests/mod.rs +++ b/crates/integration-tests/src/tests/mod.rs @@ -1,5 +1,6 @@ //! Integration test modules, organized by execution environment. pub mod cli; +pub mod cstor; pub mod digest_stability; pub mod privileged; diff --git a/crates/integration-tests/src/tests/privileged.rs b/crates/integration-tests/src/tests/privileged.rs index 4c218bf1..de15ba63 100644 --- a/crates/integration-tests/src/tests/privileged.rs +++ b/crates/integration-tests/src/tests/privileged.rs @@ -9,13 +9,17 @@ //! recursion — see [`require_privileged`]. use std::path::{Path, PathBuf}; +use std::sync::Arc; -use anyhow::{Result, bail, ensure}; +use anyhow::{Context, Result, bail, ensure}; use xshell::{Shell, cmd}; +use composefs_oci::composefs::fsverity::{FsVerityHashValue, Sha256HashValue, Sha512HashValue}; +use composefs_oci::composefs::repository::Repository; + use crate::{cfsctl, integration_test}; -/// Ensure we're running as root, or re-exec this test inside a VM. +/// Ensure we're running in a privileged environment, or re-exec this test inside a VM. /// /// If already root (e.g. inside a bcvk VM), returns `Ok(None)` and the /// test proceeds normally. @@ -26,7 +30,10 @@ use crate::{cfsctl, integration_test}; /// the test already ran in the VM. /// /// If not root and no test image is configured, returns an error. -fn require_privileged(test_name: &str) -> Result> { +/// +/// This is also used by cstor tests which need user namespace support +/// (via `podman unshare`) that may not be available on GHA runners. 
+pub fn require_privileged(test_name: &str) -> Result> { if rustix::process::getuid().is_root() { return Ok(None); } @@ -53,6 +60,58 @@ fn require_privileged(test_name: &str) -> Result> { Ok(Some(())) } +/// Check if user namespaces work (needed for podman unshare). +fn userns_works() -> bool { + std::process::Command::new("podman") + .args(["unshare", "true"]) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .map(|s| s.success()) + .unwrap_or(false) +} + +/// Ensure user namespace support is available, or re-exec this test inside a VM. +/// +/// Unlike `require_privileged`, this doesn't require root — it just needs +/// working user namespaces (for `podman unshare`). If user namespaces work, +/// the test proceeds normally. Otherwise, it dispatches to a VM. +/// +/// Returns `Ok(None)` if the test should proceed, `Ok(Some(()))` if it was +/// dispatched to a VM and the caller should return immediately. +pub fn require_userns(test_name: &str) -> Result> { + // If we're root (e.g. in VM), userns works + if rustix::process::getuid().is_root() { + return Ok(None); + } + + // Check if userns works on this host + if userns_works() { + return Ok(None); + } + + // userns doesn't work — delegate to a VM + if std::env::var_os("COMPOSEFS_IN_VM").is_some() { + bail!("COMPOSEFS_IN_VM is set but userns doesn't work — VM setup is broken"); + } + + let image = std::env::var("COMPOSEFS_TEST_IMAGE").map_err(|_| { + anyhow::anyhow!( + "user namespaces not available and COMPOSEFS_TEST_IMAGE not set; \ + run `just build-test-image` or use `just test-integration-vm`" + ) + })?; + + let sh = Shell::new()?; + let bcvk = std::env::var("BCVK_PATH").unwrap_or_else(|_| "bcvk".into()); + cmd!( + sh, + "{bcvk} ephemeral run-ssh {image} -- cfsctl-integration-tests --exact {test_name}" + ) + .run()?; + Ok(Some(())) +} + /// A temporary directory backed by a loopback ext4 filesystem with verity support. 
/// /// tmpfs doesn't support fs-verity, so privileged tests that need verity @@ -431,3 +490,339 @@ fn privileged_pull_readonly_repo() -> Result<()> { Ok(()) } integration_test!(privileged_pull_readonly_repo); + +// ============================================================================ +// Filesystem-specific reflink / hardlink tests +// ============================================================================ + +/// A temporary directory backed by a loop-mounted filesystem. +/// +/// Supports ext4 (with verity) and XFS (with reflinks). The backing sparse +/// file is 512 MB — large enough for a synthetic OCI image in +/// containers-storage plus a composefs repo. +struct LoopTempDir { + mountpoint: PathBuf, + _backing: tempfile::TempDir, +} + +impl LoopTempDir { + /// Create a loop-mounted ext4 filesystem with verity support. + fn ext4_verity() -> Result { + Self::create("mkfs.ext4", &["-q", "-O", "verity", "-b", "4096"]) + } + + /// Create a loop-mounted XFS filesystem with reflink support. + fn xfs_reflink() -> Result { + Self::create("mkfs.xfs", &["-q", "-m", "reflink=1"]) + } + + fn create(mkfs: &str, args: &[&str]) -> Result { + let backing = tempfile::tempdir()?; + let img = backing.path().join("fs.img"); + let mountpoint = backing.path().join("mnt"); + std::fs::create_dir(&mountpoint)?; + + let sh = Shell::new()?; + cmd!(sh, "truncate -s 512M {img}").run()?; + cmd!(sh, "{mkfs} {args...} {img}").run()?; + cmd!(sh, "mount -o loop {img} {mountpoint}").run()?; + + Ok(Self { + mountpoint, + _backing: backing, + }) + } + + fn path(&self) -> &Path { + &self.mountpoint + } +} + +impl Drop for LoopTempDir { + fn drop(&mut self) { + let _ = std::process::Command::new("umount") + .arg(&self.mountpoint) + .status(); + } +} + +/// Create a minimal OCI directory image with files large enough to exercise +/// the `ensure_object_from_file` path (> 64 bytes, the inline threshold). +/// +/// Returns the path to the OCI directory. 
+pub(super) fn create_oci_layout_with_large_files(parent: &Path) -> Result { + use cap_std_ext::cap_std; + use ocidir::oci_spec::image::{ + ConfigBuilder, ImageConfigurationBuilder, Platform, PlatformBuilder, RootFsBuilder, + }; + + let oci_dir = parent.join("oci-image"); + std::fs::create_dir_all(&oci_dir)?; + + let dir = cap_std::fs::Dir::open_ambient_dir(&oci_dir, cap_std::ambient_authority())?; + let ocidir = ocidir::OciDir::ensure(dir)?; + + let mut manifest = ocidir.new_empty_manifest()?.build()?; + + let runtime_config = ConfigBuilder::default().build()?; + let rootfs = RootFsBuilder::default() + .typ("layers") + .diff_ids(Vec::::new()) + .build()?; + let mut config = ImageConfigurationBuilder::default() + .architecture("amd64") + .os("linux") + .rootfs(rootfs) + .config(runtime_config) + .build()?; + + // Create a layer with several files > INLINE_CONTENT_MAX_V0 (64 bytes) + // so they go through the ensure_object_from_file path during cstor import. + // The image must have /usr (required by transform_for_oci). 
+ let mut layer_builder = ocidir.create_layer(None)?; + + // Add /usr directory (required by composefs OCI transformations) + let mut usr_hdr = tar::Header::new_gnu(); + usr_hdr.set_entry_type(tar::EntryType::Directory); + usr_hdr.set_size(0); + usr_hdr.set_mode(0o755); + usr_hdr.set_uid(0); + usr_hdr.set_gid(0); + usr_hdr.set_mtime(1234567890); + usr_hdr.set_cksum(); + layer_builder.append_data(&mut usr_hdr, "usr/", &[] as &[u8])?; + + for i in 0..5u8 { + let data = vec![i.wrapping_mul(0x37); 4096]; + let name = format!("usr/file_{i}.bin"); + let mut header = tar::Header::new_gnu(); + header.set_size(data.len() as u64); + header.set_mode(0o644); + header.set_uid(0); + header.set_gid(0); + header.set_mtime(1234567890); + header.set_cksum(); + layer_builder.append_data(&mut header, &name, &data[..])?; + } + let layer = layer_builder.into_inner()?.complete()?; + + ocidir.push_layer(&mut manifest, &mut config, layer, "test layer", None); + + let platform: Platform = PlatformBuilder::default() + .architecture("amd64") + .os("linux") + .build()?; + ocidir.insert_manifest_and_config(manifest, config, None, platform)?; + + Ok(oci_dir) +} + +/// Copy an OCI directory image into containers-storage on a specific filesystem. +/// +/// Uses `skopeo copy` with the `[overlay@root+runroot]` syntax to target +/// a containers-storage instance at the given mount point. +/// +/// Returns the storage root path. +pub(super) fn copy_oci_to_cstor(sh: &Shell, oci_dir: &Path, mount: &Path) -> Result { + let storage_root = mount.join("storage"); + let run_root = mount.join("run"); + std::fs::create_dir_all(&storage_root)?; + std::fs::create_dir_all(&run_root)?; + + let oci_ref = format!("oci:{}", oci_dir.display()); + let cstor_ref = format!( + "containers-storage:[overlay@{}+{}]test:latest", + storage_root.display(), + run_root.display() + ); + + // Run in a private mount namespace so that any bind mounts the overlay + // driver creates (e.g. 
on storage/overlay) don't leak into our namespace. + // This ensures the diff files we later open are on the raw filesystem, + // not behind a bind mount that would cause EXDEV on hardlinks. + cmd!(sh, "unshare -m skopeo copy {oci_ref} {cstor_ref}").run()?; + + Ok(storage_root) +} + +/// Open a repository at `path`, initializing it first. Uses insecure mode +/// so the tests work on filesystems without verity (XFS). +fn init_insecure_repo_at( + path: &Path, + algorithm: composefs_oci::composefs::fsverity::Algorithm, +) -> Result>> { + std::fs::create_dir_all(path)?; + let fd = rustix::fs::open( + path, + rustix::fs::OFlags::CLOEXEC | rustix::fs::OFlags::RDONLY, + 0.into(), + )?; + let (mut repo, _created) = Repository::::init_path(&fd, ".", algorithm, false)?; + repo.set_insecure(); + Ok(Arc::new(repo)) +} + +/// Pull a containers-storage image into a composefs repo with an explicit +/// storage root and local-fetch mode, and return the import stats. +fn cstor_pull( + repo: &Arc>, + storage_root: &Path, + local_fetch: composefs_oci::LocalFetchOpt, +) -> Result { + let opts = composefs_oci::PullOptions { + local_fetch, + storage_root: Some(storage_root), + ..Default::default() + }; + + let rt = tokio::runtime::Runtime::new()?; + let pull_result = rt + .block_on(async { + composefs_oci::pull(repo, "containers-storage:test:latest", None, opts).await + }) + .context("containers-storage pull failed")?; + Ok(pull_result.stats) +} + +/// Dispatch a cstor pull with the given algorithm and local-fetch mode. +/// +/// This creates a fresh repo at `repo_path` with the appropriate hash type +/// and pulls the `test:latest` image from containers-storage. +fn cstor_pull_with_algorithm( + storage_root: &Path, + repo_path: &Path, + algorithm: composefs_oci::composefs::fsverity::Algorithm, + local_fetch: composefs_oci::LocalFetchOpt, +) -> Result { + match algorithm { + composefs_oci::composefs::fsverity::Algorithm::Sha256 { .. 
} => { + let repo = init_insecure_repo_at::(repo_path, algorithm)?; + cstor_pull(&repo, storage_root, local_fetch) + } + composefs_oci::composefs::fsverity::Algorithm::Sha512 { .. } => { + let repo = init_insecure_repo_at::(repo_path, algorithm)?; + cstor_pull(&repo, storage_root, local_fetch) + } + } +} + +/// On ext4 (no reflink support), the import should skip FICLONE after the +/// first probe fails and use hardlinks instead (zero-copy). The `skopeo +/// copy` step runs in `unshare -m` to prevent the overlay driver's bind +/// mount from interfering with hardlinks. +/// +/// Covers all combinations of `{sha256, sha512} × {auto, zerocopy}`. +fn privileged_cstor_import_ext4_hardlink() -> Result<()> { + use composefs_oci::LocalFetchOpt; + use composefs_oci::composefs::fsverity::Algorithm; + + if require_privileged("privileged_cstor_import_ext4_hardlink")?.is_some() { + return Ok(()); + } + + let algorithms = [(Algorithm::SHA256, "sha256"), (Algorithm::SHA512, "sha512")]; + let modes = [ + (LocalFetchOpt::IfPossible, "auto"), + (LocalFetchOpt::ZeroCopy, "zerocopy"), + ]; + + let sh = Shell::new()?; + + for (algorithm, alg_name) in &algorithms { + // One LoopTempDir per algorithm: enabling verity is irreversible and + // algorithm-specific, so we cannot reuse across algorithms. 
+ let fs = LoopTempDir::ext4_verity()?; + + let oci_dir = create_oci_layout_with_large_files(fs.path())?; + let storage_root = copy_oci_to_cstor(&sh, &oci_dir, fs.path())?; + + for (mode, mode_name) in &modes { + let repo_dir = fs.path().join(format!("repo-{alg_name}-{mode_name}")); + let stats = cstor_pull_with_algorithm(&storage_root, &repo_dir, *algorithm, *mode)?; + + println!("ext4 [{alg_name}/{mode_name}] import stats: {stats:?}"); + ensure!( + stats.objects_reflinked == 0, + "ext4 [{alg_name}/{mode_name}] should not reflink any objects, got {} reflinked", + stats.objects_reflinked, + ); + ensure!( + stats.objects_hardlinked > 0, + "ext4 [{alg_name}/{mode_name}] should hardlink objects, got 0 hardlinked (copied={})", + stats.objects_copied, + ); + ensure!( + stats.objects_copied == 0, + "ext4 [{alg_name}/{mode_name}] should not need copies, got {} copied", + stats.objects_copied, + ); + println!( + "ext4 [{alg_name}/{mode_name}]: {} hardlinked, {} already present", + stats.objects_hardlinked, stats.objects_already_present, + ); + } + } + + Ok(()) +} +integration_test!(privileged_cstor_import_ext4_hardlink); + +/// On XFS with reflink support, importing from containers-storage on the same +/// filesystem should use reflinks (zero-copy), and the import stats should +/// show `objects_reflinked > 0`. +/// +/// Covers all combinations of `{sha256, sha512} × {auto, zerocopy}`. +fn privileged_cstor_import_xfs_reflink() -> Result<()> { + use composefs_oci::LocalFetchOpt; + use composefs_oci::composefs::fsverity::Algorithm; + + if require_privileged("privileged_cstor_import_xfs_reflink")?.is_some() { + return Ok(()); + } + + // Skip if mkfs.xfs is not available (e.g. Debian bootc images). 
+ if !Path::new("/usr/sbin/mkfs.xfs").exists() && !Path::new("/sbin/mkfs.xfs").exists() { + println!("SKIP: mkfs.xfs not available"); + return Ok(()); + } + + let algorithms = [(Algorithm::SHA256, "sha256"), (Algorithm::SHA512, "sha512")]; + let modes = [ + (LocalFetchOpt::IfPossible, "auto"), + (LocalFetchOpt::ZeroCopy, "zerocopy"), + ]; + + let sh = Shell::new()?; + + for (algorithm, alg_name) in &algorithms { + let fs = LoopTempDir::xfs_reflink()?; + + let oci_dir = create_oci_layout_with_large_files(fs.path())?; + let storage_root = copy_oci_to_cstor(&sh, &oci_dir, fs.path())?; + + for (mode, mode_name) in &modes { + let repo_dir = fs.path().join(format!("repo-{alg_name}-{mode_name}")); + let stats = cstor_pull_with_algorithm(&storage_root, &repo_dir, *algorithm, *mode)?; + + println!("XFS [{alg_name}/{mode_name}] import stats: {stats:?}"); + ensure!( + stats.objects_reflinked > 0, + "XFS [{alg_name}/{mode_name}] should reflink objects, got 0 reflinked (hardlinked={}, copied={})", + stats.objects_hardlinked, + stats.objects_copied, + ); + ensure!( + stats.objects_copied == 0, + "XFS [{alg_name}/{mode_name}] should not need copies, got {} copied", + stats.objects_copied, + ); + println!( + "XFS [{alg_name}/{mode_name}]: {} reflinked, {} already present", + stats.objects_reflinked, stats.objects_already_present, + ); + } + } + + Ok(()) +} +integration_test!(privileged_cstor_import_xfs_reflink);