From dd6ed36828a7d0dc272dae652171542069206a48 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Sun, 3 May 2026 10:55:28 -0400 Subject: [PATCH 1/8] fs,repository: Add write concurrency control and semaphore override Add set_write_concurrency() to Repository for overriding the default parallelism. Add read_filesystem_with_semaphore() as a public entry point that accepts an explicit Semaphore, and refactor the internal read_filesystem_impl() to centralize semaphore selection. Prep for wiring up --threads in mkcomposefs. Assisted-by: OpenCode (Claude Sonnet 4.6) Signed-off-by: Colin Walters --- crates/composefs/src/fs.rs | 42 +++++++++++++++++++++++++----- crates/composefs/src/repository.rs | 34 ++++++++++++++++++++++-- 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/crates/composefs/src/fs.rs b/crates/composefs/src/fs.rs index 76ac2f3a..f4380f93 100644 --- a/crates/composefs/src/fs.rs +++ b/crates/composefs/src/fs.rs @@ -533,18 +533,46 @@ pub fn read_file( /// /// If `repo` is `Some`, file objects are stored in the repository. /// If `None`, fsverity digests are computed without writing to disk. +/// +/// This function takes no semaphore argument: the concurrency limit is always +/// derived from the repository's write semaphore (if any) or from +/// [`available_parallelism`]. pub async fn read_filesystem( dirfd: OwnedFd, path: PathBuf, repo: Option>>, ) -> Result> { - let semaphore = repo - .as_ref() - .map(|r| r.write_semaphore()) - .unwrap_or_else(|| { - let n = available_parallelism().map(|n| n.get()).unwrap_or(4); - Arc::new(Semaphore::new(n)) - }); + read_filesystem_impl(dirfd, path, repo, None).await +} + +/// Like [`read_filesystem`] but with an explicit concurrency limit. +/// +/// The `semaphore`, if provided, overrides the default parallelism derived from +/// the repository or [`available_parallelism`]. This is the recommended way to +/// honour a user-supplied `--threads` argument when no repository is present. 
+pub async fn read_filesystem_with_semaphore( + dirfd: OwnedFd, + path: PathBuf, + repo: Option>>, + semaphore: Arc, +) -> Result> { + read_filesystem_impl(dirfd, path, repo, Some(semaphore)).await +} + +async fn read_filesystem_impl( + dirfd: OwnedFd, + path: PathBuf, + repo: Option>>, + semaphore_override: Option>, +) -> Result> { + let semaphore = semaphore_override.unwrap_or_else(|| { + repo.as_ref() + .map(|r| r.write_semaphore()) + .unwrap_or_else(|| { + let n = available_parallelism().map(|n| n.get()).unwrap_or(4); + Arc::new(Semaphore::new(n)) + }) + }); // Channel for streaming work items from the scan thread to the // async runtime. The scan sends (key, fd, size) as files are diff --git a/crates/composefs/src/repository.rs b/crates/composefs/src/repository.rs index 05ef92f4..8b50c241 100644 --- a/crates/composefs/src/repository.rs +++ b/crates/composefs/src/repository.rs @@ -743,6 +743,10 @@ pub struct Repository { repository: OwnedFd, objects: OnceCell, write_semaphore: OnceCell>, + /// Optional override for the number of concurrent object writes. + /// Set via [`set_write_concurrency`](Self::set_write_concurrency) before the semaphore + /// is first used; if `None`, defaults to [`available_parallelism`]. + write_concurrency: Option, insecure: bool, metadata: RepoMetadata, /// When true, SplitStreamWriter::done() writes old-format (pre-repr(C)) @@ -1051,15 +1055,40 @@ impl Repository { .get_or_try_init(|| ensure_dir_and_openat(&self.repository, "objects", OFlags::PATH)) } + /// Override the maximum number of concurrent object writes. + /// + /// Must be called before the first use of [`write_semaphore`](Self::write_semaphore); + /// has no effect if the semaphore has already been initialized. + pub fn set_write_concurrency(&mut self, n: usize) { + // Guard: the semaphore is lazily initialized on first use. If it's + // already been initialized, this call has no effect. Callers must + // set concurrency before any write operations begin. 
+ debug_assert!( + self.write_semaphore.get().is_none(), + "set_write_concurrency called after write_semaphore was already initialized; \ + call this before any write operations" + ); + if self.write_semaphore.get().is_some() { + log::warn!( + "set_write_concurrency called after semaphore was already initialized; ignoring" + ); + return; + } + self.write_concurrency = Some(n); + } + /// Return a shared semaphore for limiting concurrent object writes. /// - /// This semaphore is lazily initialized with `available_parallelism()` permits, + /// This semaphore is lazily initialized with `available_parallelism()` permits + /// (or the value set via [`set_write_concurrency`](Self::set_write_concurrency)), /// and shared across all operations on this repository. Use this to limit /// concurrent I/O when processing multiple files or layers in parallel. pub fn write_semaphore(&self) -> Arc { self.write_semaphore .get_or_init(|| { - let max_concurrent = available_parallelism().map(|n| n.get()).unwrap_or(4); + let max_concurrent = self + .write_concurrency + .unwrap_or_else(|| available_parallelism().map(|n| n.get()).unwrap_or(4)); Arc::new(Semaphore::new(max_concurrent)) }) .clone() @@ -1175,6 +1204,7 @@ impl Repository { repository, objects: OnceCell::new(), write_semaphore: OnceCell::new(), + write_concurrency: None, insecure: !has_verity, metadata, #[cfg(any(test, feature = "test"))] From 9cbb55b7ae0274e6da214679e603a6b112f7da82 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Sun, 17 May 2026 18:08:45 -0400 Subject: [PATCH 2/8] just: Fix bootc::patch recipe crate path The patch recipe referenced crates/cfsctl which was never a valid path; the crate has always been named composefs-ctl. Also relax the clean-tree check to allow untracked files (only committed changes need to match the pinned revision). 
Assisted-by: OpenCode (Claude Sonnet 4.6) Signed-off-by: Colin Walters --- bootc/Justfile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bootc/Justfile b/bootc/Justfile index ad8b4223..b450b058 100644 --- a/bootc/Justfile +++ b/bootc/Justfile @@ -51,10 +51,12 @@ patch: clone #!/bin/bash set -euo pipefail - # Require a clean composefs-rs working tree so we test a real commit + # Require a clean composefs-rs working tree so we test a real commit. + # Only tracked files matter; untracked files are allowed. + # git diff HEAD already excludes untracked files. if ! git -C "$_COMPOSEFS_SRC" diff --quiet HEAD 2>/dev/null; then echo "error: composefs-rs has uncommitted changes — commit or stash first" >&2 - git -C "$_COMPOSEFS_SRC" status --short >&2 + git -C "$_COMPOSEFS_SRC" diff --stat HEAD >&2 exit 1 fi From a0a1d6d2e3781459f87f63bbd0a31bbe1dfe1cdd Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Sun, 17 May 2026 09:30:11 -0400 Subject: [PATCH 3/8] oci: Check repo writability before opening OCI layout source import_oci_layout() was opening the layout directory before calling ensure_writable(), so pulling into a read-only repo produced a misleading ENOENT error instead of a clear 'not writable' message. Move the write check to the top of the function, matching the existing skopeo pull path. Fixes privileged_pull_readonly_repo integration test. 
Signed-off-by: Colin Walters --- crates/composefs-oci/src/oci_layout.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crates/composefs-oci/src/oci_layout.rs b/crates/composefs-oci/src/oci_layout.rs index 2c831cf0..a340f126 100644 --- a/crates/composefs-oci/src/oci_layout.rs +++ b/crates/composefs-oci/src/oci_layout.rs @@ -82,6 +82,10 @@ pub async fn import_oci_layout( layout_tag: Option<&str>, reporter: SharedReporter, ) -> Result<(PullResult, ImportStats)> { + // Check writability before touching the source, so a read-only repo gives + // a clear "not writable" error rather than a misleading source-open error. + repo.ensure_writable()?; + // Open the OCI layout directory let dir = cap_std::fs::Dir::open_ambient_dir(layout_path, cap_std::ambient_authority()) .with_context(|| format!("Opening OCI layout directory {}", layout_path.display()))?; From e50ab2fa24ffc7cf659bef8d548e545a6c891a1b Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Sun, 3 May 2026 10:55:47 -0400 Subject: [PATCH 4/8] fs: Add ObjectStore trait and FlatDigestStore For compatibility with the C composefs, we need to support writing directly to a flat XX/DIGEST path, without a leading `objects/`. 
Assisted-by: OpenCode (Claude Sonnet 4.6) Signed-off-by: Colin Walters --- crates/composefs/src/fs.rs | 311 +++++++++++++++++++++++++++++++++-- crates/composefs/src/util.rs | 17 ++ 2 files changed, 312 insertions(+), 16 deletions(-) diff --git a/crates/composefs/src/fs.rs b/crates/composefs/src/fs.rs index f4380f93..0c33be72 100644 --- a/crates/composefs/src/fs.rs +++ b/crates/composefs/src/fs.rs @@ -39,9 +39,201 @@ use crate::{ repository::Repository, shared_internals::IO_BUF_CAPACITY, tree::{Directory, FileSystem, Inode, Leaf, LeafContent, RegularFile, Stat}, - util::proc_self_fd, + util::{create_tmpfile_in, proc_self_fd, reopen_tmpfile_ro}, }; +// --------------------------------------------------------------------------- +// ObjectStore trait +// --------------------------------------------------------------------------- + +/// An abstraction over content-addressed storage for file objects. +/// +/// Both [`Repository`] and the C-compatible [`FlatDigestStore`] implement +/// this trait so that [`read_filesystem`] can write file content to either +/// layout without duplicating the scanning logic. +pub trait ObjectStore: Send + Sync { + /// Store `fd` as an object, returning its verity digest. + /// + /// If an object with the same digest already exists, this is a no-op + /// and the existing digest is returned. + fn ensure_object_from_fd(&self, fd: OwnedFd, size: u64) -> Result; + + /// Return a semaphore that gates concurrent object writes. + fn write_semaphore(&self) -> Arc; +} + +impl ObjectStore for Repository { + fn ensure_object_from_fd(&self, fd: OwnedFd, size: u64) -> Result { + self.ensure_object_from_fd(fd, size) + } + + fn write_semaphore(&self) -> Arc { + self.write_semaphore() + } +} + +/// C-compatible flat digest store (`<store>/XX/DIGEST`). +/// +/// This mirrors the layout written by `mkcomposefs --digest-store` from the C +/// implementation, where file objects live at `<store>/<XX>/<rest-of-digest>` 
This is distinct from the composefs-rs +/// [`Repository`] layout which nests objects under an `objects/` subdirectory. +/// +/// The flat layout makes the digest store interchangeable with the C tooling. +#[derive(Debug)] +pub struct FlatDigestStore { + /// Open directory fd for the store root. + root: Arc, + semaphore: Arc, + /// If true, fall back to userspace hashing when kernel fs-verity is + /// unavailable (e.g. tmpfs, overlayfs). Matches `Repository::insecure`. + insecure: bool, +} + +impl FlatDigestStore { + /// Open or create a flat digest store at `path`. + /// + /// `concurrency` controls how many concurrent object writes are permitted. + /// `insecure` enables userspace-hashing fallback when fs-verity is unavailable + /// (e.g. on tmpfs or overlayfs). Set to `true` for CLI use where the filesystem + /// may not support verity; set to `false` for strict security requirements. + pub fn open(path: &Path, concurrency: usize, insecure: bool) -> Result { + use rustix::fs::{Mode, mkdirat}; + + match mkdirat(CWD, path, Mode::from_raw_mode(0o755)) { + Ok(()) | Err(rustix::io::Errno::EXIST) => {} + Err(e) => { + return Err(e) + .with_context(|| format!("Failed to create flat digest store: {path:?}")); + } + } + + let root = openat( + CWD, + path, + OFlags::RDONLY | OFlags::DIRECTORY | OFlags::CLOEXEC, + Mode::empty(), + ) + .with_context(|| format!("Failed to open flat digest store: {path:?}"))?; + + Ok(Self { + root: Arc::new(root), + semaphore: Arc::new(Semaphore::new(concurrency)), + insecure, + }) + } +} + +impl ObjectStore for FlatDigestStore { + fn ensure_object_from_fd(&self, fd: OwnedFd, size: u64) -> Result { + use crate::fsverity::{EnableVerityError, enable_verity_maybe_copy, measure_verity}; + use std::io::BufRead as _; + + // 1. Create an anonymous O_TMPFILE in the store root. + // No name collision possible; invisible until linked. 
+ let tmpfile_fd = create_tmpfile_in(self.root.as_fd()) + .context("Creating O_TMPFILE in flat digest store")?; + + // 2. Stream from source fd into tmpfile (no in-memory buffering). + let mut src = std::io::BufReader::with_capacity(IO_BUF_CAPACITY, File::from(fd)); + let mut dst = File::from(tmpfile_fd.try_clone().context("Cloning tmpfile fd")?); + let copied = std::io::copy(&mut src, &mut dst).context("Copying object data to tmpfile")?; + ensure!( + copied == size, + "object size mismatch: expected {size}, copied {copied}" + ); + drop(dst); + + // 3. Reopen as read-only (kernel requires no writable fds to enable verity). + let ro_fd = + reopen_tmpfile_ro(File::from(tmpfile_fd)).context("Reopening tmpfile as read-only")?; + + // 4. Enable kernel fs-verity (kernel reads and hashes the file for us). + let (ro_fd, verity_enabled) = + match enable_verity_maybe_copy::(self.root.as_fd(), ro_fd.as_fd()) { + Ok(None) => (ro_fd, true), + Ok(Some(new_fd)) => (new_fd, true), + Err(EnableVerityError::AlreadyEnabled) => (ro_fd, true), + Err(EnableVerityError::FilesystemNotSupported) if self.insecure => (ro_fd, false), + Err(e) => { + return Err(anyhow::anyhow!(e)).context("Enabling verity on object tmpfile"); + } + }; + + // 5. Get the digest — from the kernel (fast) or userspace fallback. + let id: ObjectID = if verity_enabled { + measure_verity(&ro_fd).context("Measuring verity digest after enable")? + } else { + // Insecure fallback: re-read the tmpfile to compute the digest. + let mut reader = std::io::BufReader::with_capacity( + IO_BUF_CAPACITY, + File::from(ro_fd.try_clone().context("Cloning ro_fd for digest")?), + ); + let mut hasher = FsVerityHasher::::new(); + loop { + let buf = reader.fill_buf().context("Reading tmpfile for digest")?; + if buf.is_empty() { + break; + } + let chunk = &buf[..buf.len().min(FsVerityHasher::::BLOCK_SIZE)]; + hasher.add_block(chunk); + let n = chunk.len(); + reader.consume(n); + } + hasher.digest() + }; + + // 6. 
Derive flat path: XX/rest-of-hex (C-compatible layout). + let obj_path = id.to_object_pathname(); + let slash = obj_path + .find('/') + .expect("to_object_pathname always has '/'"); + let dir_name = &obj_path[..slash]; + let file_name = &obj_path[slash + 1..]; + + // 7. Create XX/ subdirectory if needed. + match mkdirat(self.root.as_fd(), dir_name, Mode::from_raw_mode(0o755)) { + Ok(()) | Err(rustix::io::Errno::EXIST) => {} + Err(e) => { + return Err(e) + .with_context(|| format!("Creating digest store subdirectory {dir_name:?}")); + } + } + + // 8. Open the XX/ subdirectory for use as linkat target. + let subdir = openat( + self.root.as_fd(), + dir_name, + OFlags::RDONLY | OFlags::DIRECTORY | OFlags::CLOEXEC, + Mode::empty(), + ) + .with_context(|| format!("Opening digest store subdirectory {dir_name:?}"))?; + + // 9. Atomically link the tmpfile into its final content-addressed path. + // EEXIST means another writer already stored the same object — fine. + match linkat( + CWD, + proc_self_fd(&ro_fd), + &subdir, + file_name, + AtFlags::SYMLINK_FOLLOW, + ) { + Ok(()) | Err(rustix::io::Errno::EXIST) => {} + Err(e) => { + return Err(e).with_context(|| { + format!("Linking object into flat digest store: {obj_path:?}") + }); + } + } + + Ok(id) + } + + fn write_semaphore(&self) -> Arc { + self.semaphore.clone() + } +} + /// Attempt to use O_TMPFILE + rename to atomically set file contents. /// Will fall back to a non-atomic write if the target doesn't support O_TMPFILE. #[context("Setting file contents for {}", name.to_string_lossy())] @@ -542,32 +734,63 @@ pub async fn read_filesystem( path: PathBuf, repo: Option>>, ) -> Result> { - read_filesystem_impl(dirfd, path, repo, None).await + let store: Option>> = + repo.map(|r| r as Arc>); + read_filesystem_impl(dirfd, path, store, None).await +} + +/// Options for [`read_filesystem_with_opts`]. +pub struct ReadFilesystemOpts { + /// Object store to use for storing file content. 
When `None`, fsverity + /// digests are computed without writing to disk. + pub store: Option>>, + /// Override the default concurrency limit. When `None`, the semaphore is + /// derived from the store (if any) or from [`available_parallelism`]. + pub semaphore: Option>, } -/// Like [`read_filesystem`] but with an explicit concurrency limit. +impl std::fmt::Debug for ReadFilesystemOpts { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ReadFilesystemOpts") + .field("store", &self.store.as_ref().map(|_| "")) + .field("semaphore", &self.semaphore) + .finish() + } +} + +impl Default for ReadFilesystemOpts { + fn default() -> Self { + Self { + store: None, + semaphore: None, + } + } +} + +/// Like [`read_filesystem`] but with full control over store and concurrency. /// -/// The `semaphore`, if provided, overrides the default parallelism derived from -/// the repository or [`available_parallelism`]. This is the recommended way to -/// honour a user-supplied `--threads` argument when no repository is present. -pub async fn read_filesystem_with_semaphore( +/// This is the preferred entry point when you need to supply a custom +/// [`ObjectStore`] (e.g. [`FlatDigestStore`] for C-compatible `--digest-store` +/// behaviour) and/or override the thread concurrency limit (e.g. to honour a +/// user-supplied `--threads` argument). 
+pub async fn read_filesystem_with_opts( dirfd: OwnedFd, path: PathBuf, - repo: Option>>, - semaphore: Arc, + opts: ReadFilesystemOpts, ) -> Result> { - read_filesystem_impl(dirfd, path, repo, Some(semaphore)).await + read_filesystem_impl(dirfd, path, opts.store, opts.semaphore).await } async fn read_filesystem_impl( dirfd: OwnedFd, path: PathBuf, - repo: Option>>, + store: Option>>, semaphore_override: Option>, ) -> Result> { let semaphore = semaphore_override.unwrap_or_else(|| { - repo.as_ref() - .map(|r| r.write_semaphore()) + store + .as_ref() + .map(|s| s.write_semaphore()) .unwrap_or_else(|| { let n = available_parallelism().map(|n| n.get()).unwrap_or(4); Arc::new(Semaphore::new(n)) @@ -626,11 +849,11 @@ async fn read_filesystem_impl( item = items.next(), if items_open => { match item { Some(((key, fd, size), permit)) => { - let repo = repo.clone(); + let store = store.clone(); tasks.spawn_blocking(move || { let _permit = permit; - let id = if let Some(repo) = repo { - repo.ensure_object_from_fd(fd, size)? + let id = if let Some(store) = store { + store.ensure_object_from_fd(fd, size)? } else { compute_verity_from_fd::(fd)? }; @@ -724,4 +947,60 @@ mod tests { assert_eq!(std::fs::read(testpath)?, b"new contents"); Ok(()) } + + /// Verify that `FlatDigestStore` stores objects in the C-compatible `XX/DIGEST` layout. + #[test] + fn test_flat_digest_store_layout() -> Result<()> { + use crate::fsverity::Sha256HashValue; + + let td = tempfile::tempdir()?; + let store_path = td.path().join("store"); + let store = FlatDigestStore::open(&store_path, 1, true)?; + + // Store a small piece of content. 
+ let content = b"hello, flat digest store!"; + let src_dir = tempfile::tempdir()?; + let src_path = src_dir.path().join("file"); + std::fs::write(&src_path, content)?; + let src_fd = openat( + CWD, + &src_path, + OFlags::RDONLY | OFlags::CLOEXEC, + Mode::from_raw_mode(0), + )?; + + let id = >::ensure_object_from_fd( + &store, + src_fd, + content.len() as u64, + )?; + + // Verify the layout: store/XX/rest-of-digest + let expected_path = id.to_object_pathname(); // e.g. "ab/cdef0123..." + let full_path = store_path.join(&expected_path); + assert!( + full_path.exists(), + "Expected object at flat path {full_path:?}" + ); + + // Verify content is intact. + let stored = std::fs::read(&full_path)?; + assert_eq!(stored, content); + + // Idempotent: storing the same object again should succeed. + let src_fd2 = openat( + CWD, + &src_path, + OFlags::RDONLY | OFlags::CLOEXEC, + Mode::from_raw_mode(0), + )?; + let id2 = >::ensure_object_from_fd( + &store, + src_fd2, + content.len() as u64, + )?; + assert_eq!(id, id2); + + Ok(()) + } } diff --git a/crates/composefs/src/util.rs b/crates/composefs/src/util.rs index 7a5ac23e..bdc43485 100644 --- a/crates/composefs/src/util.rs +++ b/crates/composefs/src/util.rs @@ -72,6 +72,23 @@ pub(crate) fn reopen_tmpfile_ro(file: std::fs::File) -> std::io::Result rustix::io::Result { + rustix::fs::openat( + dirfd, + ".", + rustix::fs::OFlags::RDWR | rustix::fs::OFlags::TMPFILE | rustix::fs::OFlags::CLOEXEC, + rustix::fs::Mode::from_raw_mode(0o644), + ) +} + /// This function reads the exact amount of bytes required to fill the buffer, possibly performing /// multiple reads to do so (and also retrying if required to deal with EINTR). /// From e3df24dffe66c456ba61cf94778d816924174fed Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Wed, 20 May 2026 10:15:47 -0400 Subject: [PATCH 5/8] examples: fix-verity: probe OVMF firmware paths, add q35 machine type The script hardcoded /usr/share/edk2/ovmf/OVMF_CODE.fd which is only present on Fedora. 
Probe a list of common paths (Ubuntu's ovmf package uses /usr/share/ovmf/OVMF.fd, Arch uses /usr/share/edk2/x64/OVMF.4m.fd) so the script works across distros without manual adjustment. Also add -machine q35, required on newer QEMU builds (e.g. RHEL10/CentOS Stream 10) where the default pc-i440fx machine type doesn't pair well with OVMF for EFI boot. Assisted-by: OpenCode (claude-sonnet-4-6@default) Signed-off-by: Colin Walters --- examples/common/fix-verity/fix-verity | 21 ++++++++++++++++- examples/testthing.py | 33 ++++++++++++++++++++------- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/examples/common/fix-verity/fix-verity b/examples/common/fix-verity/fix-verity index 788c9796..e9672bd7 100755 --- a/examples/common/fix-verity/fix-verity +++ b/examples/common/fix-verity/fix-verity @@ -18,10 +18,29 @@ if [ ! -f ${fix_verity_efi} ]; then mv "${fix_verity_efi}.tmp" "${fix_verity_efi}" fi +ovmf_code="" +ovmf_vars="" +for d in /usr/share/edk2/ovmf /usr/share/OVMF /usr/share/ovmf /usr/share/edk2/x64; do + if [ -f "$d/OVMF_CODE.fd" ] && [ -f "$d/OVMF_VARS.fd" ]; then + ovmf_code="$d/OVMF_CODE.fd" + ovmf_vars="$d/OVMF_VARS.fd" + break + fi +done + +ovmf_vars_tmp="" +if [ -n "$ovmf_code" ]; then + ovmf_vars_tmp="$(mktemp --suffix=.fd)" + cp "$ovmf_vars" "$ovmf_vars_tmp" + trap 'rm -f "$ovmf_vars_tmp"' EXIT +fi + qemu-system-x86_64 \ -nographic \ -m 4096 \ -enable-kvm \ - -bios /usr/share/edk2/ovmf/OVMF_CODE.fd \ + -machine q35 \ + ${ovmf_code:+-drive if=pflash,format=raw,readonly=on,file="$ovmf_code"} \ + ${ovmf_vars_tmp:+-drive if=pflash,format=raw,file="$ovmf_vars_tmp"} \ -drive file="$1",format=raw,if=virtio,media=disk \ -kernel "${fix_verity_efi}" diff --git a/examples/testthing.py b/examples/testthing.py index 9ec191d0..81e11c90 100644 --- a/examples/testthing.py +++ b/examples/testthing.py @@ -184,19 +184,36 @@ def _find_qemu() -> Path: raise FileNotFoundError("Unable to find qemu-kvm") -def _find_ovmf() -> tuple[str, Path]: - candidates = [ - # 
path for Fedora/RHEL (our tasks container) - "/usr/share/OVMF/OVMF_CODE.fd", +def _find_ovmf() -> tuple[str | tuple[str, str], ...]: + # Prefer split CODE+VARS pflash files (required on RHEL10/CentOS10 QEMU + # where -bios with the combined file hangs). Fall back to -bios with a + # combined image for Ubuntu CI and Arch. + split_candidates = [ + ("/usr/share/edk2/ovmf/OVMF_CODE.fd", "/usr/share/edk2/ovmf/OVMF_VARS.fd"), + ("/usr/share/OVMF/OVMF_CODE.fd", "/usr/share/OVMF/OVMF_VARS.fd"), + ] + for code, varst in split_candidates: + if Path(code).exists() and Path(varst).exists(): + # Copy VARS so UEFI can write to it without modifying the original. + import tempfile, shutil, atexit + tmp = tempfile.NamedTemporaryFile(suffix=".fd", delete=False) + shutil.copy2(varst, tmp.name) + atexit.register(lambda p=tmp.name: Path(p).unlink(missing_ok=True)) + return ( + ("-machine", "q35"), + ("-drive", f"if=pflash,format=raw,readonly=on,file={code}"), + ("-drive", f"if=pflash,format=raw,file={tmp.name}"), + ) + + bios_candidates = [ # path for Ubuntu (GitHub Actions runners) "/usr/share/ovmf/OVMF.fd", # path for Arch "/usr/share/edk2/x64/OVMF.4m.fd", ] - - for path in map(Path, candidates): + for path in map(Path, bios_candidates): if path.exists(): - return "-bios", path + return (("-bios", str(path)),) raise FileNotFoundError("Unable to find OVMF UEFI BIOS") @@ -618,7 +635,7 @@ async def _qemu( args = ( _find_qemu(), "-nodefaults", - _find_ovmf(), + *_find_ovmf(), ("-cpu", "host"), ("-smp", f"{self._cpus}"), ("-m", f"{self._memory}"), From 71a036db9ba4df68ef0a09ce812451ceaf665ae3 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Wed, 20 May 2026 11:02:41 -0400 Subject: [PATCH 6/8] examples: fix-verity: enable fs-verity on meta.json composefs-setup-root validates that the repo's meta.json has fs-verity enabled before trusting the repo. The dracut hook was only enabling verity on the content objects, so setup-root would see the repo as insecure and refuse to proceed. 
Switch the working directory to /sysroot/composefs (instead of the objects subdirectory) so we can enable verity on meta.json in addition to all the content objects. Also quote the loop variable and use the full relative path for clarity. Assisted-by: OpenCode (claude-sonnet-4-6@default) Signed-off-by: Colin Walters --- examples/common/fix-verity/dracut-hook.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/common/fix-verity/dracut-hook.sh b/examples/common/fix-verity/dracut-hook.sh index 44d01532..3110198b 100644 --- a/examples/common/fix-verity/dracut-hook.sh +++ b/examples/common/fix-verity/dracut-hook.sh @@ -1,11 +1,13 @@ # dracut hook for fixing fs-verity on composefs sysroot mount -o remount,rw /sysroot ( - cd /sysroot/composefs/objects + cd /sysroot/composefs echo >&2 'Enabling fsverity on composefs objects' - for i in */*; do - fsverity enable $i; + for i in objects/*/*; do + fsverity enable "$i" done + echo >&2 'Enabling fsverity on meta.json' + fsverity enable meta.json echo >&2 'done!' ) umount /sysroot From 7e8958a59b040b87966cc7a1d725be0fa3efe6be Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Wed, 20 May 2026 11:03:16 -0400 Subject: [PATCH 7/8] examples: increase default VM startup timeout from 30s to 60s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 30s default is tight on slower hardware (e.g. CentOS Stream 10 with OVMF pflash init overhead) — the VM boots successfully but just barely misses the window. 60s gives enough headroom while still being short enough to catch genuinely broken VMs. CI on Ubuntu with KVM acceleration boots well under 30s so the extra budget costs nothing. 
Assisted-by: OpenCode (claude-sonnet-4-6@default) Signed-off-by: Colin Walters --- examples/testthing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/testthing.py b/examples/testthing.py index 81e11c90..2631c4e9 100644 --- a/examples/testthing.py +++ b/examples/testthing.py @@ -408,7 +408,7 @@ def __init__( sit: bool = False, snapshot: bool = True, status_messages: bool = False, - timeout: float = 30.0, + timeout: float = 60.0, verbose: bool = False, ) -> None: """Construct a VM. @@ -1077,7 +1077,7 @@ def _main() -> None: "--ssh-key", "-i", type=Path, help="Path to SSH private key (default: generate)" ) parser.add_argument( - "--timeout", type=float, help="For startup, in seconds, or 'inf' (default: 30)" + "--timeout", type=float, help="For startup, in seconds, or 'inf' (default: 60)" ) parser.add_argument("image", type=Path, help="The path to a qcow2 VM image to run") args = parser.parse_args() From 234f87fbdb7226f9b66035326ac798b2de732121 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Thu, 21 May 2026 18:27:42 -0400 Subject: [PATCH 8/8] composefs: Add V1 EROFS format with compat mkcomposefs and composefs-info CLI Add support for generating V1 EROFS images compatible with the C composefs tools (mkcomposefs/composefs-info 1.0.8+). V1 uses compact inodes, BFS layout, and a simpler on-disk structure. Adds --erofs-version flag to cfsctl, new mkcomposefs and composefs-info compatibility subcommands, and RepositoryConfig for cleaner repo initialization. Note: this commit does not compile with --features oci (the default) until the following commit migrates OCI crate callers. Assisted-by: OpenCode (Claude Sonnet 4.5) Signed-off-by: Colin Walters ci: Point bootc revdep CI at temporary branch Update the bootc reverse dependency CI to use the cgwalters/bootc adapt-composefs-rs-api branch which adapts bootc to the composefs-rs API changes (RepositoryConfig, FormatVersion, st_mtim_nsec). 
Assisted-by: OpenCode (Claude Sonnet 4) Signed-off-by: Colin Walters --- .github/workflows/bootc-revdep.yml | 5 + .gitignore | 3 + Justfile | 15 + crates/composefs-boot/src/lib.rs | 6 + crates/composefs-boot/src/selabel.rs | 2 + crates/composefs-ctl/src/composefs_info.rs | 318 +++ crates/composefs-ctl/src/lib.rs | 111 +- crates/composefs-ctl/src/main.rs | 80 +- crates/composefs-ctl/src/mkcomposefs.rs | 446 +++++ crates/composefs-ctl/src/varlink.rs | 19 +- crates/composefs-integration-tests/Cargo.toml | 1 + crates/composefs-integration-tests/src/lib.rs | 12 +- .../src/tests/cli.rs | 196 +- .../src/tests/privileged.rs | 9 +- crates/composefs-oci/src/boot.rs | 9 +- crates/composefs-oci/src/image.rs | 6 +- crates/composefs-oci/src/lib.rs | 18 +- crates/composefs-oci/src/oci_layout.rs | 3 +- crates/composefs-oci/src/tar.rs | 8 +- crates/composefs-oci/src/test_util.rs | 14 +- crates/composefs/Cargo.toml | 1 + crates/composefs/fuzz/Cargo.lock | 12 + crates/composefs/fuzz/generate_corpus.rs | 176 +- crates/composefs/src/dumpfile.rs | 83 +- crates/composefs/src/dumpfile_parse.rs | 11 +- crates/composefs/src/erofs/composefs.rs | 15 +- crates/composefs/src/erofs/debug.rs | 123 +- crates/composefs/src/erofs/format.rs | 252 ++- crates/composefs/src/erofs/reader.rs | 1766 ++++++++++++++++- crates/composefs/src/erofs/writer.rs | 1660 ++++++++++++++-- crates/composefs/src/filesystem_ops.rs | 75 +- crates/composefs/src/fs.rs | 69 +- crates/composefs/src/fsverity/mod.rs | 31 +- crates/composefs/src/generic_tree.rs | 185 +- crates/composefs/src/repository.rs | 866 +++++++- crates/composefs/src/splitstream.rs | 7 +- crates/composefs/src/test.rs | 433 +++- crates/composefs/src/tree.rs | 2 + crates/composefs/tests/mkfs.rs | 192 +- crates/composefs/tests/special_v1.dump | 7 + doc/booting.md | 77 + doc/erofs.md | 82 + doc/repository.md | 31 + 43 files changed, 6746 insertions(+), 691 deletions(-) create mode 100644 crates/composefs-ctl/src/composefs_info.rs create mode 100644 
crates/composefs-ctl/src/mkcomposefs.rs create mode 100644 crates/composefs/tests/special_v1.dump create mode 100644 doc/booting.md create mode 100644 doc/erofs.md diff --git a/.github/workflows/bootc-revdep.yml b/.github/workflows/bootc-revdep.yml index b6ce4e26..1ca8cec7 100644 --- a/.github/workflows/bootc-revdep.yml +++ b/.github/workflows/bootc-revdep.yml @@ -44,3 +44,8 @@ jobs: - name: Build and test bootc with local composefs-rs run: just bootc/test + env: + # Use bootc branch adapted to composefs-rs API changes + # TODO: revert to main once bootc-dev/bootc merges these adaptations + COMPOSEFS_BOOTC_REPO: https://github.com/cgwalters/bootc + COMPOSEFS_BOOTC_REF: adapt-composefs-rs-api diff --git a/.gitignore b/.gitignore index 69660095..494f0447 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,6 @@ **/fuzz/target/ **/fuzz/corpus/ **/fuzz/artifacts/ +.worktrees +*.rpm +crates/composefs/fuzz/fuzz-0.log diff --git a/Justfile b/Justfile index 7a7beebf..90991eeb 100644 --- a/Justfile +++ b/Justfile @@ -93,6 +93,21 @@ test-integration-vm *ARGS: build _integration-container-build install-nextest: @which cargo-nextest > /dev/null 2>&1 || cargo install cargo-nextest --locked +# Build and run a bls example locally. +# Usage: just test-example-local bls arch +# just test-example-local bls arch ext4 none +# 'fsfmt' and 'verity' are positional; 'fsfmt' defaults to ext4, 'verity' defaults to none (no fs-verity enforcement). +# Requires: qemu-kvm, OVMF, skopeo, mtools, fsverity, mkfs.erofs, systemd-repart, podman. 
+test-example-local example os fsfmt="ext4" verity="none": build + #!/usr/bin/env bash + set -euo pipefail + export FS_FORMAT={{ fsfmt }} + export FS_VERITY_MODE={{ verity }} + export CFSCTL_PATH=$(pwd)/target/debug/cfsctl + cd examples + {{ example }}/build {{ os }} + TEST_IMAGE="{{ example }}/{{ os }}-{{ example }}-efi.qcow2" pytest test -v + # Run everything: checks + full integration tests including VM ci: check test-integration-vm diff --git a/crates/composefs-boot/src/lib.rs b/crates/composefs-boot/src/lib.rs index 40ff0335..11e5cc33 100644 --- a/crates/composefs-boot/src/lib.rs +++ b/crates/composefs-boot/src/lib.rs @@ -101,6 +101,10 @@ impl BootOps for FileSystem { ) -> Result>> { let boot_entries = get_boot_resources(self, repo)?; empty_toplevel_dirs(self)?; + // Compact the leaves table after clearing directories, so that leaves + // which were only referenced by /boot or /sysroot are removed and + // don't appear as orphans when the filesystem is validated. + self.compact(); selabel::selabel(self, repo)?; Ok(boot_entries) @@ -108,6 +112,8 @@ impl BootOps for FileSystem { fn transform_for_boot_from_dir(&mut self, rootfs: impl AsFd) -> Result<()> { empty_toplevel_dirs(self)?; + // Same as above: compact to remove leaves orphaned by clearing dirs. 
+ self.compact(); selabel::selabel_from_dir(self, rootfs)?; Ok(()) } diff --git a/crates/composefs-boot/src/selabel.rs b/crates/composefs-boot/src/selabel.rs index 58842ecd..3400bca7 100644 --- a/crates/composefs-boot/src/selabel.rs +++ b/crates/composefs-boot/src/selabel.rs @@ -581,6 +581,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: Default::default(), }; @@ -595,6 +596,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: Default::default(), }, LeafContent::Regular(RegularFile::Inline(data.to_vec().into_boxed_slice())), diff --git a/crates/composefs-ctl/src/composefs_info.rs b/crates/composefs-ctl/src/composefs_info.rs new file mode 100644 index 00000000..e2872460 --- /dev/null +++ b/crates/composefs-ctl/src/composefs_info.rs @@ -0,0 +1,318 @@ +//! composefs-info - Query information from composefs images. +//! +//! This is a Rust reimplementation of the C composefs-info tool, providing +//! commands to inspect EROFS images, list objects, and compute fs-verity digests. +//! +//! ## Compatibility status +//! +//! Implemented subcommands: +//! - `ls` — lists files with type suffixes, skips whiteout entries +//! - `dump` — outputs composefs-dump(5) text format (image → tree → dumpfile) +//! - `objects` — lists all backing file object paths (XX/XXXX...) +//! - `missing-objects` — lists objects not present in `--basedir` +//! - `measure-file` — computes fs-verity digest of files +//! +//! Compatibility notes: +//! - `measure-file` tries `FS_IOC_MEASURE_VERITY` first; falls back to +//! in-process computation when the kernel reports verity is absent or the +//! filesystem doesn't support it, matching the C `lcfs_fd_get_fsverity()` +//! behaviour. 
+ +use std::collections::HashSet; +use std::io::Write; +use std::path::Path; +use std::{fs::File, io::Read, path::PathBuf}; + +use anyhow::{Context, Result}; +use clap::{Parser, Subcommand}; + +use composefs::{ + dumpfile::write_dumpfile, + erofs::reader::erofs_to_filesystem, + fsverity::{FsVerityHashValue, Sha256HashValue, measure_verity_with_fallback}, + generic_tree::{Inode, LeafContent, LeafId}, + tree::{FileSystem, RegularFile}, +}; + +/// Query information from composefs images. +#[derive(Parser, Debug)] +#[command( + name = "composefs-info", + version, + about = "Query information from composefs images" +)] +struct Cli { + /// The subcommand to run. + #[command(subcommand)] + command: Command, +} + +/// Available subcommands. +#[derive(Subcommand, Debug)] +enum Command { + /// Simple listing of files and directories in the image. + Ls { + /// Filter entries at the root level by name (can be specified multiple times). + #[arg(long = "filter", action = clap::ArgAction::Append)] + filter: Vec, + /// Composefs image files to inspect. + images: Vec, + }, + + /// Full dump in composefs-dump(5) format. + Dump { + /// Filter entries at the root level by name (can be specified multiple times). + #[arg(long = "filter", action = clap::ArgAction::Append)] + filter: Vec, + /// Composefs image files to dump. + images: Vec, + }, + + /// List all backing file object paths. + Objects { + /// Composefs image files to inspect. + images: Vec, + }, + + /// List backing files not present in basedir. + MissingObjects { + /// Base directory for object lookups. + #[arg(long = "basedir", required = true)] + basedir: PathBuf, + /// Composefs image files to inspect. + images: Vec, + }, + + /// Print the fs-verity digest of files. + MeasureFile { + /// Files to measure. + files: Vec, + }, +} + +/// Entry point for the composefs-info multi-call mode. 
+pub fn run() -> Result<()> { + let cli = Cli::parse(); + run_with_cli(cli) +} + +/// Entry point when invoked as a hidden `cfsctl composefs-info` subcommand. +/// +/// `extra_args` contains everything after the `composefs-info` token as +/// captured by clap's trailing-var-arg mechanism. A synthetic `argv[0]` is +/// prepended so that `Cli::parse_from` produces the same help text / error +/// messages as the standalone binary. +pub fn run_from_args(extra_args: Vec) -> Result<()> { + let mut argv = vec![std::ffi::OsString::from("composefs-info")]; + argv.extend(extra_args); + let cli = Cli::parse_from(argv); + run_with_cli(cli) +} + +fn run_with_cli(cli: Cli) -> Result<()> { + match &cli.command { + Command::Ls { filter, images } => cmd_ls(filter, images), + Command::Dump { filter, images } => cmd_dump(filter, images), + Command::Objects { images } => cmd_objects(images), + Command::MissingObjects { basedir, images } => cmd_missing_objects(basedir, images), + Command::MeasureFile { files } => cmd_measure_file(files), + } +} + +/// Print escaped path (matches C implementation behavior). +fn print_escaped(out: &mut W, s: &[u8]) -> std::io::Result<()> { + for &c in s { + match c { + b'\\' => write!(out, "\\\\")?, + b'\n' => write!(out, "\\n")?, + b'\r' => write!(out, "\\r")?, + b'\t' => write!(out, "\\t")?, + // Non-printable or non-ASCII characters are hex-escaped + c if !c.is_ascii_graphic() && c != b' ' => write!(out, "\\x{c:02x}")?, + c => out.write_all(&[c])?, + } + } + Ok(()) +} + +/// Walk and print entries: directory line first, then recurse into children. 
+fn ls_print( + out: &mut W, + fs: &FileSystem, + dir: &composefs::tree::Directory, + path: &[u8], + seen_leaf_ids: &mut HashSet, + filter: Option<&[String]>, +) -> Result<()> { + for (name, child) in dir.sorted_entries() { + let name_bytes = name.as_encoded_bytes(); + + if let Some(filter) = filter + && !filter.is_empty() + { + let name_str = name.to_string_lossy(); + if !filter.iter().any(|f| f == name_str.as_ref()) { + continue; + } + } + + let mut child_path = path.to_vec(); + child_path.push(b'/'); + child_path.extend_from_slice(name_bytes); + + match child { + Inode::Directory(child_dir) => { + // Print the directory entry with trailing slash. + print_escaped(out, &child_path)?; + write!(out, "/\t")?; + writeln!(out)?; + // Recurse into the directory. + ls_print(out, fs, child_dir, &child_path, seen_leaf_ids, None)?; + } + Inode::Leaf(leaf_id, _) => { + let leaf = fs.leaf(*leaf_id); + + print_escaped(out, &child_path)?; + + match &leaf.content { + LeafContent::Regular(regular) => { + let is_hardlink = !seen_leaf_ids.insert(*leaf_id); + if !is_hardlink && let RegularFile::External(id, _) = regular { + write!(out, "\t@ ")?; + print_escaped(out, id.to_object_pathname().as_bytes())?; + } + } + LeafContent::Symlink(target) => { + write!(out, "\t-> ")?; + print_escaped(out, target.as_encoded_bytes())?; + } + _ => {} + } + + writeln!(out)?; + } + } + } + Ok(()) +} + +/// List files and directories in the image. +fn cmd_ls(filter: &[String], images: &[PathBuf]) -> Result<()> { + let stdout = std::io::stdout(); + let mut out = stdout.lock(); + + for image_path in images { + let image_data = read_image(image_path)?; + let fs = erofs_to_filesystem::(&image_data) + .with_context(|| format!("Failed to parse image: {image_path:?}"))?; + + let mut seen_leaf_ids = HashSet::new(); + ls_print( + &mut out, + &fs, + &fs.root, + b"", + &mut seen_leaf_ids, + Some(filter), + )?; + } + + Ok(()) +} + +/// Dump the image in composefs-dump(5) text format. 
+/// +/// This matches the C composefs-info dump output: the EROFS image is parsed +/// back into a filesystem tree which is then serialized as a dumpfile. +fn cmd_dump(_filter: &[String], images: &[PathBuf]) -> Result<()> { + let stdout = std::io::stdout(); + let mut out = stdout.lock(); + + for image_path in images { + let image_data = read_image(image_path)?; + let fs = erofs_to_filesystem::(&image_data) + .with_context(|| format!("Failed to parse image: {image_path:?}"))?; + write_dumpfile(&mut out, &fs) + .with_context(|| format!("Failed to dump image: {image_path:?}"))?; + } + + Ok(()) +} + +/// Collect all external object IDs from a parsed filesystem. +/// +/// Iterates the leaves table directly — each `RegularFile::External` entry +/// is a unique content-addressed object. Because `erofs_to_filesystem` +/// deduplicates hard-linked inodes into a single leaf, each object appears +/// exactly once even if it is referenced by multiple paths. +fn collect_objects_from_fs(fs: &FileSystem) -> HashSet { + fs.leaves + .iter() + .filter_map(|leaf| match &leaf.content { + LeafContent::Regular(RegularFile::External(id, _)) => Some(id.clone()), + _ => None, + }) + .collect() +} + +/// List all object paths from the images. +fn cmd_objects(images: &[PathBuf]) -> Result<()> { + for image_path in images { + let image_data = read_image(image_path)?; + let fs = erofs_to_filesystem::(&image_data) + .with_context(|| format!("Failed to parse image: {image_path:?}"))?; + + let mut objects: Vec = collect_objects_from_fs(&fs).into_iter().collect(); + objects.sort_by_key(|id| id.to_hex()); + + for obj in objects { + println!("{}", obj.to_object_pathname()); + } + } + Ok(()) +} + +/// List objects not present in basedir. 
+fn cmd_missing_objects(basedir: &Path, images: &[PathBuf]) -> Result<()> { + let mut all_objects: HashSet = HashSet::new(); + + for image_path in images { + let image_data = read_image(image_path)?; + let fs = erofs_to_filesystem::(&image_data) + .with_context(|| format!("Failed to parse image: {image_path:?}"))?; + all_objects.extend(collect_objects_from_fs(&fs)); + } + + let mut missing: Vec = all_objects + .into_iter() + .filter(|obj| !basedir.join(obj.to_object_pathname()).exists()) + .collect(); + + missing.sort_by_key(|a| a.to_hex()); + + for obj in missing { + println!("{}", obj.to_object_pathname()); + } + + Ok(()) +} + +/// Compute and print the fs-verity digest of each file. +fn cmd_measure_file(files: &[PathBuf]) -> Result<()> { + for path in files { + let file = File::open(path).with_context(|| format!("Failed to open file: {path:?}"))?; + let digest = measure_verity_with_fallback::(file) + .with_context(|| format!("Failed to measure verity for {path:?}"))?; + println!("{}", digest.to_hex()); + } + Ok(()) +} + +/// Read an entire image file into memory. +fn read_image(path: &PathBuf) -> Result> { + let mut file = File::open(path).with_context(|| format!("Failed to open image: {path:?}"))?; + let mut data = Vec::new(); + file.read_to_end(&mut data) + .with_context(|| format!("Failed to read image: {path:?}"))?; + Ok(data) +} diff --git a/crates/composefs-ctl/src/lib.rs b/crates/composefs-ctl/src/lib.rs index c5744f9b..90583419 100644 --- a/crates/composefs-ctl/src/lib.rs +++ b/crates/composefs-ctl/src/lib.rs @@ -22,6 +22,8 @@ pub use composefs_http; #[cfg(feature = "oci")] pub use composefs_oci; +pub mod composefs_info; +pub mod mkcomposefs; /// Varlink RPC service exposing repository operations over a Unix socket. 
pub mod varlink; @@ -61,7 +63,10 @@ use composefs::{ erofs::reader::erofs_to_filesystem, fsverity::{Algorithm, FsVerityHashValue, Sha256HashValue, Sha512HashValue}, generic_tree::{FileSystem, Inode}, - repository::{REPO_METADATA_FILENAME, Repository, read_repo_algorithm, system_path, user_path}, + repository::{ + REPO_METADATA_FILENAME, Repository, RepositoryConfig, read_repo_algorithm, system_path, + user_path, + }, tree::RegularFile, }; @@ -172,6 +177,11 @@ pub struct App { #[clap(long, value_enum)] pub hash: Option, + /// The EROFS format version to use when generating images. + /// If omitted, the library default (V2) is used. + #[clap(long, value_enum)] + pub erofs_version: Option, + /// Deprecated: security mode is now auto-detected from meta.json. /// Use `cfsctl init --insecure` to create a repo without verity. /// Kept for backward compatibility. @@ -206,6 +216,30 @@ pub enum HashType { Sha512, } +/// The EROFS format version used when generating images. +#[derive(Debug, Copy, Clone, PartialEq, Eq, ValueEnum)] +pub enum ErofsVersion { + /// Format V0: compact inodes, BFS, C-compatible (composefs_version auto-detects 0 or 1). + #[clap(name = "0")] + V0, + /// Format V1: same layout as V0, composefs_version always 1. + #[clap(name = "1")] + V1, + /// Format V2: extended inodes, DFS (composefs_version=2). + #[clap(name = "2")] + V2, +} + +impl From for composefs::erofs::format::FormatVersion { + fn from(v: ErofsVersion) -> Self { + match v { + ErofsVersion::V0 => Self::V0, + ErofsVersion::V1 => Self::V1, + ErofsVersion::V2 => Self::V2, + } + } +} + /// A reference to an OCI image: either a content digest or a named ref. /// /// Digests are prefixed with `@` (e.g. `@sha256:abc123…`), while bare @@ -493,6 +527,11 @@ enum Command { /// re-imported after migration. #[clap(long)] reset_metadata: bool, + /// Default EROFS format version for images in this repository. + /// V1 is compatible with C `mkcomposefs` 1.0.8; V2 is the native format. 
+ /// If omitted, falls back to the global `--erofs-version` flag, then defaults to V2. + #[clap(long)] + erofs_version: Option, }, /// Take a transaction lock on the repository. /// This prevents garbage collection from occurring. @@ -592,6 +631,22 @@ enum Command { #[clap(long)] address: Option, }, + + /// Run mkcomposefs (C-compatible image builder); hidden, also available via argv0 dispatch. + #[clap(hide = true, name = "mkcomposefs")] + Mkcomposefs { + /// Arguments forwarded verbatim to mkcomposefs + #[clap(trailing_var_arg = true, allow_hyphen_values = true)] + args: Vec, + }, + + /// Run composefs-info (C-compatible image inspector); hidden, also available via argv0 dispatch. + #[clap(hide = true, name = "composefs-info")] + ComposefsInfo { + /// Arguments forwarded verbatim to composefs-info + #[clap(trailing_var_arg = true, allow_hyphen_values = true)] + args: Vec, + }, } /// Acts as a proxy for the `cfsctl` CLI by executing the CLI logic programmatically @@ -751,19 +806,34 @@ pub async fn run_if_socket_activated() -> Result { /// Top-level dispatch: handle init specially, otherwise open repo and run. pub async fn run_app(args: App) -> Result<()> { + // Hidden compat subcommands: forward all trailing args to the respective tool. + if let Command::Mkcomposefs { args: extra } = args.cmd { + return mkcomposefs::run_from_args(extra); + } + if let Command::ComposefsInfo { args: extra } = args.cmd { + return composefs_info::run_from_args(extra); + } + // Init is handled before opening a repo since it creates one if let Command::Init { ref algorithm, ref path, insecure, reset_metadata, + erofs_version: ref init_erofs_version, } = args.cmd { + // Prefer the subcommand-level --erofs-version; fall back to global flag; default V2. 
+ let erofs_version = init_erofs_version + .or(args.erofs_version) + .map(composefs::erofs::format::FormatVersion::from) + .unwrap_or(composefs::erofs::format::FormatVersion::V2); return run_init( algorithm, path.as_deref(), insecure || args.insecure, reset_metadata, + erofs_version, &args, ); } @@ -830,6 +900,7 @@ fn run_init( path: Option<&Path>, insecure: bool, reset_metadata: bool, + erofs_version: composefs::erofs::format::FormatVersion, args: &App, ) -> Result<()> { let repo_path = if let Some(p) = path { @@ -850,12 +921,17 @@ fn run_init( // init_path handles idempotency: same algorithm is a no-op, // different algorithm is an error. + let config = { + let mut c = RepositoryConfig::new(*algorithm); + c.erofs_formats = composefs::erofs::format::FormatConfig::single(erofs_version); + if insecure { c.set_insecure() } else { c } + }; let created = match algorithm { Algorithm::Sha256 { .. } => { - Repository::::init_path(CWD, &repo_path, *algorithm, !insecure)?.1 + Repository::::init_path(CWD, &repo_path, config)?.1 } Algorithm::Sha512 { .. } => { - Repository::::init_path(CWD, &repo_path, *algorithm, !insecure)?.1 + Repository::::init_path(CWD, &repo_path, config)?.1 } }; @@ -916,7 +992,13 @@ where ObjectID: FsVerityHashValue, { let path = resolve_repo_path(args)?; - open_repo_at(&path, args.insecure, args.require_verity, args.no_upgrade) + let mut repo = open_repo_at(&path, args.insecure, args.require_verity, args.no_upgrade)?; + // If the user explicitly passed --erofs-version, override the stored + // repo setting for this invocation only (does not rewrite meta.json). + if let Some(version) = args.erofs_version { + repo.set_erofs_version(version.into()); + } + Ok(repo) } /// Resolve an [`OciReference`] to an [`OciImage`]. @@ -1065,10 +1147,19 @@ fn dump_file_impl( /// Run commands that don't require a repository. 
pub async fn run_cmd_without_repo(args: App) -> Result<()> { + let erofs_version = args + .erofs_version + .map(composefs::erofs::format::FormatVersion::from); match args.cmd { Command::ComputeId { fs_opts } => { let fs = load_filesystem_from_ondisk_fs::(&fs_opts, None).await?; - let id = fs.compute_image_id(); + let version = erofs_version.unwrap_or_default(); + let id = composefs::fsverity::compute_verity::( + &composefs::erofs::writer::mkfs_erofs_versioned( + &mut composefs::erofs::writer::ValidatedFileSystem::new(fs)?, + version, + ), + ); println!("{}", id.to_hex()); } Command::CreateDumpfile { fs_opts } => { @@ -1152,8 +1243,8 @@ where repo.mount_at(&erofs_id.to_hex(), mountpoint.as_str())?; } OciCommand::ComputeId { config_opts } => { - let fs = load_filesystem_from_oci_image(&repo, config_opts)?; - let id = fs.compute_image_id(); + let mut fs = load_filesystem_from_oci_image(&repo, config_opts)?; + let id = fs.compute_image_id(repo.erofs_version()); println!("{}", id.to_hex()); } OciCommand::Pull { @@ -1362,7 +1453,7 @@ where fs_opts, ref image_name, } => { - let fs = load_filesystem_from_ondisk_fs(&fs_opts, Some(Arc::clone(&repo))).await?; + let mut fs = load_filesystem_from_ondisk_fs(&fs_opts, Some(Arc::clone(&repo))).await?; let id = fs.commit_image(&repo, image_name.as_deref())?; println!("{}", id.to_id()); } @@ -1455,6 +1546,10 @@ where println!("content {digest}"); println!("verity {}", verity.to_hex()); } + Command::Mkcomposefs { .. } | Command::ComposefsInfo { .. } => { + // Dispatched in run_app before a repository is opened + unreachable!("mkcomposefs/composefs-info are dispatched before opening a repository"); + } } Ok(()) } diff --git a/crates/composefs-ctl/src/main.rs b/crates/composefs-ctl/src/main.rs index a6b3e5f6..96cdbf91 100644 --- a/crates/composefs-ctl/src/main.rs +++ b/crates/composefs-ctl/src/main.rs @@ -1,28 +1,78 @@ //! Command-line control utility for composefs repositories and images. //! -//! 
`cfsctl` provides a comprehensive interface for managing composefs repositories, -//! creating and mounting filesystem images, handling OCI containers, and performing -//! repository maintenance operations like garbage collection. +//! `cfsctl` is a multi-call binary: when invoked as `mkcomposefs` or +//! `composefs-info` (via symlink or hardlink), it dispatches to the +//! corresponding tool. Otherwise it runs the normal `cfsctl` interface. +//! +//! ## C composefs compatibility roadmap +//! +//! This work aims to provide a Rust implementation that is a drop-in for the +//! C composefs tools and library. See: +//! +//! +//! Status: +//! 1. **CLI interfaces** (`mkcomposefs`, `composefs-info`): Substantially +//! implemented. V1 EROFS output is byte-for-byte identical to C mkcomposefs. +//! See individual module docs for remaining gaps. +//! 2. **EROFS output format**: V1 (C-compatible) writer with compact inodes, +//! BFS ordering, whiteout table, and overlay xattr escaping is complete and +//! tested. V2 (Rust-native) is the default for the composefs-rs repository. +//! 3. **C shared library (`libcomposefs`)**: TODO(compat): Not yet started. +//! This is the next major milestone — providing a C-ABI compatible shared +//! library so that existing C consumers (e.g. ostree, bootc) can link +//! against the Rust implementation. Will require `#[no_mangle]` exports, +//! a `cdylib` crate, and C header generation (e.g. via cbindgen). -use composefs_ctl::App; +use std::ffi::OsStr; +use std::path::Path; use anyhow::Result; -use clap::Parser; + +/// Extract the binary name from argv[0], stripping any directory prefix. +fn binary_name() -> Option { + std::env::args_os().next().and_then(|arg0| { + Path::new(&arg0) + .file_name() + .map(|f| f.to_string_lossy().into_owned()) + }) +} + +/// Collect all arguments after the first (i.e. argv[2..]) as OsStrings. 
+fn rest_of_args() -> Vec { + std::env::args_os().skip(2).collect() +} fn main() -> Result<()> { - // If we were spawned as a userns helper process, handle that and exit. - // This MUST be called before the tokio runtime is created. - #[cfg(feature = "containers-storage")] - cstorage::init_if_helper(); - - // Now we can create the tokio runtime for the main application - tokio::runtime::Builder::new_multi_thread() - .enable_all() - .build()? - .block_on(async_main()) + match binary_name().as_deref() { + Some("mkcomposefs") => composefs_ctl::mkcomposefs::run(), + Some("composefs-info") => composefs_ctl::composefs_info::run(), + // When called as `cfsctl mkcomposefs ...` or `cfsctl composefs-info ...`, + // intercept before clap so that --help and all flags go to the real tool. + _ if std::env::args_os().nth(1).as_deref() == Some(OsStr::new("mkcomposefs")) => { + composefs_ctl::mkcomposefs::run_from_args(rest_of_args()) + } + _ if std::env::args_os().nth(1).as_deref() == Some(OsStr::new("composefs-info")) => { + composefs_ctl::composefs_info::run_from_args(rest_of_args()) + } + _ => { + // If we were spawned as a userns helper process, handle that and exit. + // This MUST be called before the tokio runtime is created. + #[cfg(feature = "containers-storage")] + cstorage::init_if_helper(); + + // Now we can create the tokio runtime for the main application + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()? + .block_on(async_main()) + } + } } async fn async_main() -> Result<()> { + use clap::Parser; + use composefs_ctl::App; + env_logger::init(); // If we were launched via systemd socket activation (e.g. `varlinkctl diff --git a/crates/composefs-ctl/src/mkcomposefs.rs b/crates/composefs-ctl/src/mkcomposefs.rs new file mode 100644 index 00000000..72278c97 --- /dev/null +++ b/crates/composefs-ctl/src/mkcomposefs.rs @@ -0,0 +1,446 @@ +//! mkcomposefs - Create composefs images from directories or dumpfiles. +//! +//! 
This is a Rust reimplementation of the C mkcomposefs tool, providing +//! compatible command-line interface and output format. +//! +//! ## Compatibility status +//! +//! See for context. +//! +//! Implemented and tested (byte-for-byte match with C mkcomposefs): +//! - `--from-file`, `--print-digest`, `--print-digest-only` +//! - `--skip-devices`, `--skip-xattrs`, `--user-xattrs` +//! - `--min-version` / `--max-version` (V1 compact inodes, BFS ordering, whiteout table) +//! - `--digest-store` (C-compatible flat `XX/digest` layout via [`FlatDigestStore`]) +//! - `--threads` (controls tokio worker threads and verity-computation concurrency) +//! - Source from directory or dumpfile, output to file or stdout +//! +//! Bit-for-bit compatible with the C mkcomposefs by default; use `--hardlinks` +//! to enable host hard-link deduplication (shared EROFS inodes, i_nlink > 1). + +use std::{ + ffi::OsString, + fs::File, + io::{self, BufReader, IsTerminal, Read, Write}, + path::{Path, PathBuf}, + sync::Arc, + thread::available_parallelism, +}; + +use anyhow::{Context, Result, bail}; +use clap::Parser; +use rustix::fs::CWD; +use tokio::sync::Semaphore; + +use composefs::{ + dumpfile::dumpfile_to_filesystem, + erofs::{ + format::FormatVersion, + writer::{ValidatedFileSystem, mkfs_erofs_versioned}, + }, + fs::{ + FlatDigestStore, HardlinkBehavior, ObjectStore, ReadFilesystemOpts, + read_filesystem_with_opts, + }, + fsverity::{FsVerityHashValue, Sha256HashValue, compute_verity}, + tree::FileSystem, +}; + +/// Create a composefs image from a source directory or dumpfile. +/// +/// Composefs uses EROFS image files for metadata and separate content-addressed +/// backing directories for regular file data. +#[derive(Parser, Debug)] +#[command(name = "mkcomposefs", version, about)] +struct Args { + /// Treat SOURCE as a dumpfile in composefs-dump(5) format. + /// + /// If SOURCE is `-`, reads from stdin. 
+ #[arg(long)] + from_file: bool, + + /// Print the fsverity digest of the image after writing. + #[arg(long)] + print_digest: bool, + + /// Print the fsverity digest without writing the image. + /// + /// When set, IMAGE must be omitted. + #[arg(long)] + print_digest_only: bool, + + /// Set modification time to zero (Unix epoch) for all files. + #[arg(long)] + use_epoch: bool, + + /// Exclude device nodes from the image. + #[arg(long)] + skip_devices: bool, + + /// Exclude all extended attributes. + #[arg(long)] + skip_xattrs: bool, + + /// Only include xattrs with the `user.` prefix. + #[arg(long)] + user_xattrs: bool, + + /// Minimum image format version to use (0 or 1). + #[arg(long, default_value = "0")] + min_version: u32, + + /// Maximum image format version (for auto-upgrade). + #[arg(long, default_value = "1")] + max_version: u32, + + /// Copy regular file content to the given object store directory. + /// + /// Files are stored by their fsverity digest using the same flat layout + /// as C mkcomposefs: `XX/DIGEST` where XX is the first byte of the digest. + /// The directory is created if it doesn't exist. The layout is compatible + /// with digest stores written by the C mkcomposefs tool. + #[arg(long)] + digest_store: Option, + + /// Deduplicate hard-linked files on the source filesystem into shared + /// EROFS inodes. Without this flag the output is bit-for-bit compatible + /// with the C mkcomposefs tool. + #[arg(long)] + hardlinks: bool, + + /// Number of threads to use for digest calculation and file copying. + #[arg(long)] + threads: Option, + + /// The source directory or dumpfile. + source: PathBuf, + + /// The output image path (use `-` for stdout). + /// + /// Must be omitted when using --print-digest-only. + image: Option, +} + +/// Entry point for the mkcomposefs multi-call mode. +pub fn run() -> Result<()> { + let args = Args::parse(); + run_with_args(args) +} + +/// Entry point when invoked as a hidden `cfsctl mkcomposefs` subcommand. 
+/// +/// `extra_args` contains everything after the `mkcomposefs` token as captured +/// by clap's trailing-var-arg mechanism. A synthetic `argv[0]` is prepended +/// so that `Args::parse_from` produces the same help text / error messages as +/// the standalone binary. +pub fn run_from_args(extra_args: Vec) -> Result<()> { + let mut argv = vec![std::ffi::OsString::from("mkcomposefs")]; + argv.extend(extra_args); + let args = Args::parse_from(argv); + run_with_args(args) +} + +fn run_with_args(args: Args) -> Result<()> { + // Validate arguments + if args.print_digest_only && args.image.is_some() { + bail!("IMAGE must be omitted when using --print-digest-only"); + } + + if !args.print_digest_only && args.image.is_none() { + bail!("IMAGE is required (or use --print-digest-only)"); + } + + if args.min_version > args.max_version { + bail!( + "Invalid version range: --min-version ({}) must not exceed --max-version ({})", + args.min_version, + args.max_version + ); + } + + // The C composefs tool supports only versions 0 and 1 (LCFS_VERSION_MAX=1). + // Both use the same C-compatible on-disk format (compact inodes, BFS, whiteout + // table). The difference is in the `composefs_version` EROFS header field: + // - version 0 (default): starts at 0, auto-upgrades to 1 if a user-visible + // whiteout device (chr 0,0) is encountered. + // - version 1: always writes composefs_version=1 regardless of content. + // Versions > 1 are not defined by the C tool and are rejected below. + if args.min_version > 1 { + bail!( + "--min-version {} is not supported; the maximum C-compatible version is 1", + args.min_version + ); + } + // Map C --min-version to our FormatVersion: 0 → V0 (auto-detect), 1 → V1 (always 1). + let format_version = if args.min_version >= 1 { + FormatVersion::V1 + } else { + FormatVersion::V0 + }; + + // Open or create the digest store if specified. 
+ // Always uses the C-compatible flat layout (XX/DIGEST) so that the store + // is interchangeable with the one written by C mkcomposefs. + let store: Option>> = + if let Some(store_path) = &args.digest_store { + let n = args + .threads + .unwrap_or_else(|| available_parallelism().map(|n| n.get()).unwrap_or(4)); + Some(Arc::new(FlatDigestStore::open(store_path, n, true)?)) + } else { + None + }; + + // Warn if --digest-store is combined with --from-file (store is unused in that case) + if args.from_file && args.digest_store.is_some() { + eprintln!("warning: --digest-store is ignored when --from-file is specified"); + } + + // Read input + let mut fs = if args.from_file { + read_dumpfile(&args)? + } else { + read_directory( + &args.source, + store, + args.threads, + if args.hardlinks { + HardlinkBehavior::Tracked + } else { + HardlinkBehavior::Break + }, + )? + }; + + // Apply transformations based on flags + apply_transformations(&mut fs, &args)?; + + // Generate EROFS image + let image = mkfs_erofs_versioned(&mut ValidatedFileSystem::new(fs)?, format_version); + + // Write image (skipped when only the digest is requested) + if !args.print_digest_only { + let image_path = args.image.as_ref().unwrap(); + write_image(image_path, &image)?; + } + + // Print digest when requested either way + if args.print_digest || args.print_digest_only { + let digest = compute_fsverity_digest(&image); + println!("{digest}"); + } + + Ok(()) +} + +/// Read and parse a dumpfile from the given source. 
+fn read_dumpfile(args: &Args) -> Result> { + let content = if args.source.as_os_str() == "-" { + // Read from stdin + let stdin = io::stdin(); + let mut content = String::new(); + stdin.lock().read_to_string(&mut content)?; + content + } else { + // Read from file + let file = File::open(&args.source) + .with_context(|| format!("Failed to open dumpfile: {:?}", args.source))?; + let mut reader = BufReader::new(file); + let mut content = String::new(); + reader.read_to_string(&mut content)?; + content + }; + + dumpfile_to_filesystem(&content).context("Failed to parse dumpfile") +} + +/// Read a filesystem tree from a directory path. +/// +/// If a store is provided, large file contents are copied there and +/// referenced by digest. The store must implement [`ObjectStore`]. +/// +/// The `threads` argument controls both the tokio worker thread count and the +/// semaphore used to limit concurrent verity computations. `Some(1)` uses a +/// single-threaded runtime; `None` or `Some(n > 1)` uses the multi-threaded +/// scheduler. +fn read_directory( + path: &Path, + store: Option>>, + threads: Option, + hardlinks: HardlinkBehavior, +) -> Result> { + use rustix::fs::{Mode, OFlags}; + + // Verify the path exists and is a directory + let metadata = std::fs::metadata(path) + .with_context(|| format!("Failed to access source directory: {path:?}"))?; + + if !metadata.is_dir() { + bail!("Source path is not a directory: {path:?}"); + } + + // Open a dirfd for the current directory (required by the async API) + let dirfd = rustix::fs::openat( + CWD, + ".", + OFlags::RDONLY | OFlags::DIRECTORY | OFlags::CLOEXEC, + Mode::empty(), + ) + .context("Failed to open current directory")?; + + // Build a tokio runtime appropriate for the requested thread count. + // --threads 1 → current_thread (no extra OS threads, minimal overhead). + // --threads N → multi_thread with exactly N worker threads. + // (default) → multi_thread with the tokio default (one per logical CPU). 
+ let rt = match threads { + Some(1) => tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .context("Failed to create single-threaded tokio runtime")?, + Some(n) => tokio::runtime::Builder::new_multi_thread() + .worker_threads(n) + .enable_all() + .build() + .context("Failed to create multi-threaded tokio runtime")?, + None => tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .context("Failed to create multi-threaded tokio runtime")?, + }; + + let path = path.to_path_buf(); + + // When a store is present its semaphore is already configured; when there + // is no store we build the semaphore ourselves so the requested thread + // count is honoured. + let semaphore = if store.is_none() { + let n = threads.unwrap_or_else(|| available_parallelism().map(|n| n.get()).unwrap_or(4)); + Some(Arc::new(Semaphore::new(n))) + } else { + None + }; + rt.block_on(read_filesystem_with_opts( + dirfd, + path, + ReadFilesystemOpts { + store, + semaphore, + hardlinks, + }, + )) + .context("Failed to read directory tree") +} + +/// Write the image to the specified path (or stdout if `-`). +fn write_image(path: &PathBuf, image: &[u8]) -> Result<()> { + if path.as_os_str() == "-" { + let stdout = io::stdout(); + if stdout.is_terminal() { + bail!( + "Refusing to write binary image to terminal. Redirect stdout or use a file path." + ); + } + stdout.lock().write_all(image)?; + } else { + let mut file = + File::create(path).with_context(|| format!("Failed to create image file: {path:?}"))?; + file.write_all(image)?; + } + Ok(()) +} + +/// Compute the fsverity digest of the image. +fn compute_fsverity_digest(image: &[u8]) -> String { + let digest: Sha256HashValue = compute_verity(image); + digest.to_hex() +} + +/// Apply filesystem transformations based on command-line flags. 
+fn apply_transformations(fs: &mut FileSystem, args: &Args) -> Result<()> { + // Handle xattr filtering + if args.skip_xattrs { + // Remove all xattrs + fs.filter_xattrs(|_| false); + } else if args.user_xattrs { + // Keep only user.* xattrs + fs.filter_xattrs(|name| name.as_encoded_bytes().starts_with(b"user.")); + } + + // Handle --use-epoch (set all mtimes to 0) + if args.use_epoch { + set_all_mtimes_to_epoch(fs); + } + + // Handle --skip-devices (remove device nodes) + if args.skip_devices { + remove_device_nodes(fs); + } + + Ok(()) +} + +/// Set all modification times in the filesystem to Unix epoch (0). +fn set_all_mtimes_to_epoch(fs: &mut FileSystem) { + fs.for_each_stat_mut(|stat| { + stat.st_mtim_sec = 0; + stat.st_mtim_nsec = 0; + }); +} + +/// Remove all device nodes (block and character devices) from the filesystem. +fn remove_device_nodes(fs: &mut FileSystem) { + use composefs::generic_tree::{Inode, LeafContent}; + + type Leaf = composefs::generic_tree::Leaf>; + type Dir = composefs::generic_tree::Directory>; + + fn process_dir(dir: &mut Dir, leaves: &[Leaf]) { + // First, collect names of subdirectories to process + let subdir_names: Vec = dir + .entries() + .filter_map(|(name, inode)| { + if matches!(inode, Inode::Directory(_)) { + Some(name.to_os_string()) + } else { + None + } + }) + .collect(); + + // Recursively process subdirectories + for name in subdir_names { + if let Ok(subdir) = dir.get_directory_mut(&name) { + process_dir(subdir, leaves); + } + } + + // Collect names of device nodes to remove + let devices_to_remove: Vec = dir + .entries() + .filter_map(|(name, inode)| { + if let Inode::Leaf(leaf_id, _) = inode + && matches!( + leaves[leaf_id.0].content, + LeafContent::BlockDevice(_) | LeafContent::CharacterDevice(_) + ) + { + return Some(name.to_os_string()); + } + None + }) + .collect(); + + // Remove device nodes + for name in devices_to_remove { + dir.remove(&name); + } + } + + // Split struct field borrows: Rust allows borrowing 
different fields simultaneously. + let FileSystem { root, leaves, .. } = fs; + process_dir(root, leaves); + + // Compact the leaves table to remove entries now unreferenced after + // device-node removal. Without this, fs.fsck() would report orphaned leaves. + fs.compact(); +} diff --git a/crates/composefs-ctl/src/varlink.rs b/crates/composefs-ctl/src/varlink.rs index 7946d6e9..bbe9488d 100644 --- a/crates/composefs-ctl/src/varlink.rs +++ b/crates/composefs-ctl/src/varlink.rs @@ -23,7 +23,7 @@ use std::sync::Arc; use anyhow::{Context as _, Result}; use composefs::fsverity::{Algorithm, FsVerityHashValue, Sha256HashValue, Sha512HashValue}; -use composefs::repository::{FsckResult, Repository, system_path, user_path}; +use composefs::repository::{FsckResult, Repository, RepositoryConfig, system_path, user_path}; use rustix::fs::CWD; use serde::{Deserialize, Serialize}; @@ -425,17 +425,26 @@ fn run_init_repository( })?; } - let enable_verity = !insecure; let created = match algorithm { Algorithm::Sha256 { .. } => { - Repository::::init_path(CWD, path, algorithm, enable_verity) + let config = if insecure { + RepositoryConfig::new(algorithm).set_insecure() + } else { + RepositoryConfig::new(algorithm) + }; + Repository::::init_path(CWD, path, config) .map_err(|e| RepositoryError::InternalError { message: format!("{e:#}"), })? .1 } Algorithm::Sha512 { .. } => { - Repository::::init_path(CWD, path, algorithm, enable_verity) + let config = if insecure { + RepositoryConfig::new(algorithm).set_insecure() + } else { + RepositoryConfig::new(algorithm) + }; + Repository::::init_path(CWD, path, config) .map_err(|e| RepositoryError::InternalError { message: format!("{e:#}"), })? 
@@ -587,7 +596,7 @@ async fn run_compute_id( message: format!("{e:#}"), })?; } - let id = fs.compute_image_id(); + let id = fs.compute_image_id(repo.erofs_version()); Ok(oci::OciComputeIdReply { image_id: id.to_hex(), }) diff --git a/crates/composefs-integration-tests/Cargo.toml b/crates/composefs-integration-tests/Cargo.toml index 2b3b0613..b7bfaa75 100644 --- a/crates/composefs-integration-tests/Cargo.toml +++ b/crates/composefs-integration-tests/Cargo.toml @@ -48,6 +48,7 @@ paste = "1" rustix = { version = "1", features = ["fs", "process"] } serde = { version = "1", features = ["derive"] } serde_json = "1" +similar-asserts = "1" tar = "0.4" tempfile = "3" tokio = { version = "1", features = ["rt-multi-thread", "macros"] } diff --git a/crates/composefs-integration-tests/src/lib.rs b/crates/composefs-integration-tests/src/lib.rs index c86ebfb1..b9decb88 100644 --- a/crates/composefs-integration-tests/src/lib.rs +++ b/crates/composefs-integration-tests/src/lib.rs @@ -11,8 +11,8 @@ use std::process::Command; use std::sync::Arc; use anyhow::Result; -use composefs_oci::composefs::fsverity::{Algorithm, Sha256HashValue}; -use composefs_oci::composefs::repository::Repository; +use composefs_oci::composefs::fsverity::Sha256HashValue; +use composefs_oci::composefs::repository::{Repository, RepositoryConfig}; use tempfile::TempDir; /// A test function that returns a Result. 
@@ -110,9 +110,11 @@ pub fn create_test_repository(tempdir: &TempDir) -> Result::init_path(&fd, ".", Algorithm::SHA256, false)?; - repo.set_insecure(); + let (repo, _created) = Repository::::init_path( + &fd, + ".", + RepositoryConfig::default().set_insecure(), + )?; Ok(Arc::new(repo)) } diff --git a/crates/composefs-integration-tests/src/tests/cli.rs b/crates/composefs-integration-tests/src/tests/cli.rs index bd292599..08288162 100644 --- a/crates/composefs-integration-tests/src/tests/cli.rs +++ b/crates/composefs-integration-tests/src/tests/cli.rs @@ -12,20 +12,28 @@ use xshell::{Shell, cmd}; use crate::{cfsctl, create_test_rootfs, integration_test}; // Pinned composefs image ID for the deterministic OCI layout built by -// create_oci_layout() (single layer with usr/ dir + hello.txt, mtime=1234567890). -const OCI_LAYOUT_COMPOSEFS_ID: &str = "f26c6eb439749b82f0d1520e83455bb21766572fb2b5cfe009dd7749a61caf74e0c42c56f1a2cbd9d\ - 359e7d172c8e2c65641666c9a18cc484a8b0f6e4e6d47ab"; +// create_oci_layout() (single layer with usr/ dir + hello.txt + hello-link.txt hardlink, mtime=1234567890). +const OCI_LAYOUT_COMPOSEFS_ID: &str = "f7684c21050615c02cec250e0c7d4118fb0ff6340af5039e06f2f372a7614fff\ + 25dfd38cf7f28ad67d8b86e86371b7deafa9ae4bf543630322aee6196417de92"; /// Create a fresh initialized insecure repository in a tempdir. /// /// Returns the tempdir (for lifetime) and the path to the repo. +/// +/// Creates a V2 (legacy) EROFS repo explicitly so that tests which compare +/// against pinned V2 digests (e.g. `OCI_LAYOUT_COMPOSEFS_ID`) continue to +/// work correctly now that `cfsctl init` defaults to V1. 
pub(crate) fn init_insecure_repo( sh: &Shell, cfsctl: &std::path::Path, ) -> Result { let repo_dir = tempfile::tempdir()?; let repo = repo_dir.path(); - cmd!(sh, "{cfsctl} --repo {repo} init --insecure").read()?; + cmd!( + sh, + "{cfsctl} --repo {repo} init --insecure --erofs-version 2" + ) + .read()?; Ok(repo_dir) } @@ -272,6 +280,16 @@ pub(crate) fn create_oci_layout(parent: &std::path::Path) -> Result Result<()> { assert!(info["size"].as_u64().unwrap() > 0, "expected non-zero size"); assert_eq!( info["entryCount"].as_u64().unwrap(), - 2, - "expected exactly 2 entries (usr/ + hello.txt)" + 3, + "expected exactly 3 entries (usr/ + hello.txt + hello-link.txt)" ); // Check splitstream metadata let splitstream = info @@ -1597,11 +1615,11 @@ fn test_layer_tar_roundtrip() -> Result<()> { entries.push((path, content)); } - // Verify we got the expected files (usr/ directory and hello.txt) + // Verify we got the expected files (usr/ directory, hello.txt, and hello-link.txt) assert_eq!( entries.len(), - 2, - "expected 2 entries in layer (usr/ and hello.txt)" + 3, + "expected 3 entries in layer (usr/, hello.txt, and hello-link.txt)" ); // Find hello.txt and verify content @@ -1699,3 +1717,163 @@ fn test_compute_image_id() -> Result<()> { Ok(()) } integration_test!(test_compute_image_id); + +/// Composefs header magic at offset 0 (little-endian 0xd078629a). +const COMPOSEFS_MAGIC: [u8; 4] = [0x9a, 0x62, 0x78, 0xd0]; +/// EROFS superblock magic at offset 0x400 (little-endian 0xE0F5E1E2). +const EROFS_MAGIC: [u8; 4] = [0xe2, 0xe1, 0xf5, 0xe0]; +/// Byte offset of the EROFS superblock within a composefs image. 
+const EROFS_SUPERBLOCK_OFFSET: usize = 0x400; + +fn test_mkcomposefs_basic() -> Result<()> { + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + let fixture_dir = tempfile::tempdir()?; + let rootfs = create_test_rootfs(fixture_dir.path())?; + let out_dir = tempfile::tempdir()?; + let cfs = out_dir.path().join("out.cfs"); + + cmd!(sh, "{cfsctl} mkcomposefs --use-epoch {rootfs} {cfs}").run()?; + + let bytes = std::fs::read(&cfs)?; + assert!(!bytes.is_empty(), "output .cfs file should not be empty"); + // The composefs-specific header starts at offset 0 + assert_eq!( + &bytes[..4], + &COMPOSEFS_MAGIC, + "expected composefs magic bytes at offset 0" + ); + // The EROFS superblock starts at offset 0x400 + assert!( + bytes.len() > EROFS_SUPERBLOCK_OFFSET + 4, + "file too small to contain EROFS superblock" + ); + assert_eq!( + &bytes[EROFS_SUPERBLOCK_OFFSET..EROFS_SUPERBLOCK_OFFSET + 4], + &EROFS_MAGIC, + "expected EROFS magic at offset 0x400" + ); + Ok(()) +} +integration_test!(test_mkcomposefs_basic); + +fn test_mkcomposefs_print_digest_only() -> Result<()> { + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + let fixture_dir = tempfile::tempdir()?; + let rootfs = create_test_rootfs(fixture_dir.path())?; + + let output = cmd!( + sh, + "{cfsctl} mkcomposefs --use-epoch --print-digest-only {rootfs}" + ) + .read()?; + let digest = output.trim(); + + assert!(!digest.is_empty(), "digest output should not be empty"); + assert!( + digest.chars().all(|c| c.is_ascii_hexdigit()), + "digest should be a hex string, got: {digest:?}" + ); + Ok(()) +} +integration_test!(test_mkcomposefs_print_digest_only); + +fn test_mkcomposefs_deterministic() -> Result<()> { + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + let fixture_dir = tempfile::tempdir()?; + let rootfs = create_test_rootfs(fixture_dir.path())?; + let out_dir = tempfile::tempdir()?; + let cfs1 = out_dir.path().join("out1.cfs"); + let cfs2 = out_dir.path().join("out2.cfs"); + + cmd!(sh, "{cfsctl} mkcomposefs --use-epoch 
{rootfs} {cfs1}").run()?; + cmd!(sh, "{cfsctl} mkcomposefs --use-epoch {rootfs} {cfs2}").run()?; + + let bytes1 = std::fs::read(&cfs1)?; + let bytes2 = std::fs::read(&cfs2)?; + assert_eq!( + bytes1, bytes2, + "two mkcomposefs runs with --use-epoch should produce identical images" + ); + Ok(()) +} +integration_test!(test_mkcomposefs_deterministic); + +/// Verify that `--hardlinks` produces a different image than the default when +/// the source directory contains host hard links, and that the default is stable. +fn test_mkcomposefs_hardlinks_dedup() -> Result<()> { + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + + // Build a minimal dir with two names for the same inode. + let src_dir = tempfile::tempdir()?; + let dir = src_dir.path().join("dir"); + std::fs::create_dir_all(&dir)?; + std::fs::write(dir.join("a"), "hello hardlink\n")?; + std::fs::hard_link(dir.join("a"), dir.join("b"))?; + + let out_dir = tempfile::tempdir()?; + let no_hl = out_dir.path().join("no_hardlinks.cfs"); + let with_hl = out_dir.path().join("with_hardlinks.cfs"); + let no_hl_2 = out_dir.path().join("no_hardlinks_2.cfs"); + + cmd!(sh, "{cfsctl} mkcomposefs --use-epoch {dir} {no_hl}").run()?; + cmd!( + sh, + "{cfsctl} mkcomposefs --use-epoch --hardlinks {dir} {with_hl}" + ) + .run()?; + // A second run without --hardlinks must be identical to the first. + cmd!(sh, "{cfsctl} mkcomposefs --use-epoch {dir} {no_hl_2}").run()?; + + let bytes_no_hl = std::fs::read(&no_hl)?; + let bytes_with_hl = std::fs::read(&with_hl)?; + let bytes_no_hl_2 = std::fs::read(&no_hl_2)?; + + assert_ne!( + bytes_no_hl, bytes_with_hl, + "--hardlinks should produce a different image than the default (no dedup)" + ); + assert_eq!( + bytes_no_hl, bytes_no_hl_2, + "default (no --hardlinks) should be deterministic across runs" + ); + Ok(()) +} +integration_test!(test_mkcomposefs_hardlinks_dedup); + +/// Verify byte-for-byte identity with the C `mkcomposefs` tool. 
+/// +/// Skipped automatically when `mkcomposefs` is not found in PATH. +fn test_mkcomposefs_vs_c_mkcomposefs() -> Result<()> { + use std::process::Command; + + // Skip if the C mkcomposefs tool is not available. + if Command::new("mkcomposefs") + .arg("--version") + .output() + .is_err() + { + return Ok(()); + } + + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + let fixture_dir = tempfile::tempdir()?; + // create_test_rootfs has no host hard links, so both tools should agree. + let rootfs = create_test_rootfs(fixture_dir.path())?; + let out_dir = tempfile::tempdir()?; + let rust_cfs = out_dir.path().join("rust.cfs"); + let c_cfs = out_dir.path().join("c.cfs"); + + cmd!(sh, "{cfsctl} mkcomposefs --use-epoch {rootfs} {rust_cfs}").run()?; + cmd!(sh, "mkcomposefs --use-epoch {rootfs} {c_cfs}").run()?; + + let rust_bytes = std::fs::read(&rust_cfs)?; + let c_bytes = std::fs::read(&c_cfs)?; + similar_asserts::assert_eq!(c_bytes, rust_bytes); + Ok(()) +} +integration_test!(test_mkcomposefs_vs_c_mkcomposefs); diff --git a/crates/composefs-integration-tests/src/tests/privileged.rs b/crates/composefs-integration-tests/src/tests/privileged.rs index 048430ee..85fe92af 100644 --- a/crates/composefs-integration-tests/src/tests/privileged.rs +++ b/crates/composefs-integration-tests/src/tests/privileged.rs @@ -15,7 +15,7 @@ use anyhow::{Context, Result, bail, ensure}; use xshell::{Shell, cmd}; use composefs_oci::composefs::fsverity::{FsVerityHashValue, Sha256HashValue, Sha512HashValue}; -use composefs_oci::composefs::repository::Repository; +use composefs_oci::composefs::repository::{Repository, RepositoryConfig}; use crate::{cfsctl, integration_test}; @@ -665,8 +665,11 @@ fn init_insecure_repo_at( rustix::fs::OFlags::CLOEXEC | rustix::fs::OFlags::RDONLY, 0.into(), )?; - let (mut repo, _created) = Repository::::init_path(&fd, ".", algorithm, false)?; - repo.set_insecure(); + let (repo, _created) = Repository::::init_path( + &fd, + ".", + 
RepositoryConfig::new(algorithm).set_insecure(), + )?; Ok(Arc::new(repo)) } diff --git a/crates/composefs-oci/src/boot.rs b/crates/composefs-oci/src/boot.rs index 726ded80..ad34eeeb 100644 --- a/crates/composefs-oci/src/boot.rs +++ b/crates/composefs-oci/src/boot.rs @@ -86,6 +86,7 @@ pub fn remove_boot_image( #[cfg(all(test, feature = "boot"))] mod test { use super::*; + use composefs::erofs::format::FormatVersion; use composefs::fsverity::Sha256HashValue; use composefs::test::TestRepo; use composefs_boot::bootloader::get_boot_resources; @@ -120,8 +121,9 @@ mod test { let oci = OciImage::open_ref(repo, "myapp:v1").unwrap(); assert_eq!(oci.boot_image_ref(), Some(&image_verity)); - let plain_image = crate::image::create_filesystem(repo, &img.config_digest, None).unwrap(); - let plain_verity = plain_image.compute_image_id(); + let mut plain_image = + crate::image::create_filesystem(repo, &img.config_digest, None).unwrap(); + let plain_verity = plain_image.compute_image_id(FormatVersion::V2); assert_ne!( image_verity, plain_verity, "boot-transformed image should differ from non-transformed image" @@ -267,7 +269,8 @@ mod test { "tag={tag}: expected Type2 entry" ); - let plain_fs = crate::image::create_filesystem(repo, &img.config_digest, None).unwrap(); + let mut plain_fs = + crate::image::create_filesystem(repo, &img.config_digest, None).unwrap(); let plain_verity = plain_fs.commit_image(repo, None).unwrap(); assert_ne!(boot_verity, plain_verity, "tag={tag}"); } diff --git a/crates/composefs-oci/src/image.rs b/crates/composefs-oci/src/image.rs index 14a8ae2f..44c05189 100644 --- a/crates/composefs-oci/src/image.rs +++ b/crates/composefs-oci/src/image.rs @@ -157,6 +157,7 @@ mod test { use composefs::{ dumpfile::write_dumpfile, fsverity::Sha256HashValue, + repository::RepositoryConfig, tree::{LeafContent, RegularFile, Stat}, }; use std::{collections::BTreeMap, io::BufRead, path::PathBuf}; @@ -171,6 +172,7 @@ mod test { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + 
st_mtim_nsec: 0, xattrs: BTreeMap::new(), }, item: TarItem::Leaf(LeafContent::Regular(RegularFile::Inline([].into()))), @@ -185,6 +187,7 @@ mod test { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), }, item: TarItem::Directory, @@ -344,8 +347,7 @@ mod test { let (repo, _) = Repository::::init_path( CWD, &repo_path, - composefs::fsverity::Algorithm::SHA256, - false, + RepositoryConfig::default().set_insecure(), )?; let repo = Arc::new(repo); let (verity, _stats) = diff --git a/crates/composefs-oci/src/lib.rs b/crates/composefs-oci/src/lib.rs index 65c08048..9f3f4b15 100644 --- a/crates/composefs-oci/src/lib.rs +++ b/crates/composefs-oci/src/lib.rs @@ -716,7 +716,7 @@ fn ensure_oci_composefs_erofs( } // Build the composefs filesystem from all layers - let fs = image::create_filesystem(repo, img.config_digest(), Some(img.config_verity()))?; + let mut fs = image::create_filesystem(repo, img.config_digest(), Some(img.config_verity()))?; // Commit as EROFS image (no name — the GC link comes from the config ref) let erofs_id = fs.commit_image(repo, None)?; @@ -822,7 +822,11 @@ mod test { use rustix::fs::CWD; - use composefs::{fsverity::Sha256HashValue, repository::Repository, test::tempdir}; + use composefs::{ + fsverity::Sha256HashValue, + repository::{Repository, RepositoryConfig}, + test::tempdir, + }; use super::*; @@ -859,13 +863,9 @@ mod test { fn create_test_repo() -> (tempfile::TempDir, Arc>) { let dir = tempdir(); let repo_path = dir.path().join("repo"); - let (repo, _) = Repository::init_path( - CWD, - &repo_path, - composefs::fsverity::Algorithm::SHA256, - false, - ) - .expect("initializing test repo"); + let (repo, _) = + Repository::init_path(CWD, &repo_path, RepositoryConfig::default().set_insecure()) + .expect("initializing test repo"); (dir, Arc::new(repo)) } diff --git a/crates/composefs-oci/src/oci_layout.rs b/crates/composefs-oci/src/oci_layout.rs index a340f126..e4365cf7 100644 --- 
a/crates/composefs-oci/src/oci_layout.rs +++ b/crates/composefs-oci/src/oci_layout.rs @@ -476,8 +476,7 @@ mod tests { let (repo, _) = composefs::repository::Repository::::init_path( rustix::fs::CWD, &repo_path, - composefs::fsverity::Algorithm::SHA256, - false, + composefs::repository::RepositoryConfig::default().set_insecure(), ) .unwrap(); let repo = std::sync::Arc::new(repo); diff --git a/crates/composefs-oci/src/tar.rs b/crates/composefs-oci/src/tar.rs index 1fde0246..1e2662cc 100644 --- a/crates/composefs-oci/src/tar.rs +++ b/crates/composefs-oci/src/tar.rs @@ -456,6 +456,7 @@ pub fn get_entry( st_gid: entry.gid as u32, st_mode: entry.mode, st_mtim_sec: entry.mtime as i64, + st_mtim_nsec: 0, xattrs, }, item, @@ -475,7 +476,9 @@ mod tests { use super::*; use composefs::{ - fsverity::Sha256HashValue, generic_tree::LeafContent, repository::Repository, + fsverity::Sha256HashValue, + generic_tree::LeafContent, + repository::{Repository, RepositoryConfig}, splitstream::SplitStreamReader, }; use std::{io::Read, path::Path, sync::Arc}; @@ -493,8 +496,7 @@ mod tests { let (repo, _) = Repository::init_path( rustix::fs::CWD, &repo_path, - composefs::fsverity::Algorithm::SHA256, - false, + RepositoryConfig::default().set_insecure(), )?; // Store tempdir in static to keep it alive diff --git a/crates/composefs-oci/src/test_util.rs b/crates/composefs-oci/src/test_util.rs index 73829c4a..2df99a5c 100644 --- a/crates/composefs-oci/src/test_util.rs +++ b/crates/composefs-oci/src/test_util.rs @@ -24,7 +24,7 @@ use crate::oci_image::write_manifest; use crate::skopeo::OCI_CONFIG_CONTENT_TYPE; use composefs::dumpfile_parse::{Entry, Item}; use composefs::fsverity::Sha256HashValue; -use composefs::repository::Repository; +use composefs::repository::{Repository, RepositoryConfig}; use containers_image_proxy::oci_spec::image::{ ConfigBuilder, DescriptorBuilder, Digest as OciDigest, ImageConfigurationBuilder, ImageManifestBuilder, MediaType, RootFsBuilder, @@ -639,13 +639,11 @@ pub 
async fn create_bootable_image( /// paths rather than `Repository` handles. Opens the repo, creates the /// image with `create_base_image`, generates the EROFS, and returns. pub fn create_test_oci_image(repo_path: &std::path::Path, tag: &str) -> anyhow::Result<()> { - let (mut repo, _) = Repository::::init_path( + let (repo, _) = Repository::::init_path( rustix::fs::CWD, repo_path, - composefs::fsverity::Algorithm::SHA256, - false, + RepositoryConfig::default().set_insecure(), )?; - repo.set_insecure(); let repo = Arc::new(repo); let rt = tokio::runtime::Runtime::new()?; rt.block_on(create_base_image(&repo, Some(tag))); @@ -663,13 +661,11 @@ pub fn create_test_bootable_oci_image( repo_path: &std::path::Path, tag: &str, ) -> anyhow::Result<()> { - let (mut repo, _) = Repository::::init_path( + let (repo, _) = Repository::::init_path( rustix::fs::CWD, repo_path, - composefs::fsverity::Algorithm::SHA256, - false, + RepositoryConfig::default().set_insecure(), )?; - repo.set_insecure(); let repo = Arc::new(repo); let rt = tokio::runtime::Runtime::new()?; let img = rt.block_on(create_bootable_image(&repo, Some(tag), 1)); diff --git a/crates/composefs/Cargo.toml b/crates/composefs/Cargo.toml index 261281f4..a14695eb 100644 --- a/crates/composefs/Cargo.toml +++ b/crates/composefs/Cargo.toml @@ -21,6 +21,7 @@ anyhow = { version = "1.0.87", default-features = false } composefs-ioctls = { workspace = true } zlink-core = { workspace = true, optional = true } serde = { version = "1.0", default-features = false, features = ["derive", "alloc"] } +serde_repr = "0.1" fn-error-context = "0.2" hex = { version = "0.4.0", default-features = false, features = ["std"] } log = { version = "0.4.8", default-features = false } diff --git a/crates/composefs/fuzz/Cargo.lock b/crates/composefs/fuzz/Cargo.lock index 3bbaf91c..98909985 100644 --- a/crates/composefs/fuzz/Cargo.lock +++ b/crates/composefs/fuzz/Cargo.lock @@ -78,6 +78,7 @@ dependencies = [ "rustix", "serde", "serde_json", + 
"serde_repr", "sha2", "thiserror", "tokio", @@ -460,6 +461,17 @@ dependencies = [ "zmij", ] +[[package]] +name = "serde_repr" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "sha2" version = "0.11.0" diff --git a/crates/composefs/fuzz/generate_corpus.rs b/crates/composefs/fuzz/generate_corpus.rs index dc179f76..314d95d4 100644 --- a/crates/composefs/fuzz/generate_corpus.rs +++ b/crates/composefs/fuzz/generate_corpus.rs @@ -12,7 +12,8 @@ use std::ffi::{OsStr, OsString}; use std::fs; use std::path::Path; -use composefs::erofs::writer::mkfs_erofs; +use composefs::erofs::format::FormatVersion; +use composefs::erofs::writer::{ValidatedFileSystem, mkfs_erofs, mkfs_erofs_versioned}; use composefs::fsverity::{FsVerityHashValue, Sha256HashValue}; use composefs::generic_tree::{self, LeafContent, Stat}; use composefs::tree::{self, FileSystem, RegularFile}; @@ -27,6 +28,7 @@ fn stat(mode: u32, uid: u32, gid: u32, mtime: i64) -> Stat { st_uid: uid, st_gid: gid, st_mtim_sec: mtime, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } @@ -55,20 +57,44 @@ fn insert_dir<'a>(parent: &'a mut Dir, name: &str, s: Stat) -> &'a mut Dir { parent.get_directory_mut(OsStr::new(name)).unwrap() } +/// Generate V0, V1, and V2 images for a filesystem, pushing them into seeds. +/// +/// The V2 image uses the name as-is; V1 appends "_v1"; V0 appends "_v0". +/// V0/V1 whiteout stubs are added automatically by the writer. 
+fn push_all_versions( + seeds: &mut Vec<(String, Vec)>, + name: &str, + build_fs: impl Fn() -> FileSystem, +) { + // V2 (Rust-native extended inodes, DFS ordering, no stubs) + let image = mkfs_erofs(&mut ValidatedFileSystem::new(build_fs()).unwrap()); + seeds.push((name.to_string(), image.into())); + + // V1 (compact inodes, always composefs_version=1, stubs injected) + let image = mkfs_erofs_versioned( + &mut ValidatedFileSystem::new(build_fs()).unwrap(), + FormatVersion::V1, + ); + seeds.push((format!("{name}_v1"), image.into())); + + // V0 (compact inodes, composefs_version auto-bumps to 1 only if whiteouts present) + let image = mkfs_erofs_versioned( + &mut ValidatedFileSystem::new(build_fs()).unwrap(), + FormatVersion::V0, + ); + seeds.push((format!("{name}_v0"), image.into())); +} + fn main() { let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); - let mut seeds: Vec<(&str, Vec)> = Vec::new(); + let mut seeds: Vec<(String, Vec)> = Vec::new(); // 1. Empty root - { - let fs = empty_root(); - let image = mkfs_erofs(&fs); - seeds.push(("empty_root", image.into())); - } + push_all_versions(&mut seeds, "empty_root", empty_root); // 2. Single inline file (small content stored in inode) - { + push_all_versions(&mut seeds, "single_inline_file", || { let mut fs = empty_root(); let id = fs.push_leaf( file_stat(), @@ -77,12 +103,11 @@ fn main() { )), ); fs.root.insert(OsStr::new("hello.txt"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("single_inline_file", image.into())); - } + fs + }); // 3. Single external (chunk-based) regular file - { + push_all_versions(&mut seeds, "single_external_file", || { let mut fs = empty_root(); let hash = Sha256HashValue::EMPTY; let id = fs.push_leaf( @@ -90,66 +115,60 @@ fn main() { LeafContent::Regular(RegularFile::External(hash, 65536)), ); fs.root.insert(OsStr::new("data.bin"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("single_external_file", image.into())); - } + fs + }); // 4. 
Symlink - { + push_all_versions(&mut seeds, "symlink", || { let mut fs = empty_root(); let id = fs.push_leaf( stat(0o777, 0, 0, 0), LeafContent::Symlink(OsString::from("/target/path").into_boxed_os_str()), ); fs.root.insert(OsStr::new("link"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("symlink", image.into())); - } + fs + }); // 5. FIFO - { + push_all_versions(&mut seeds, "fifo", || { let mut fs = empty_root(); let id = fs.push_leaf(file_stat(), LeafContent::Fifo); fs.root.insert(OsStr::new("mypipe"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("fifo", image.into())); - } + fs + }); // 6. Character device - { + push_all_versions(&mut seeds, "chardev", || { let mut fs = empty_root(); let id = fs.push_leaf( stat(0o666, 0, 0, 0), LeafContent::CharacterDevice(makedev(1, 3)), ); fs.root.insert(OsStr::new("null"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("chardev", image.into())); - } + fs + }); // 7. Block device - { + push_all_versions(&mut seeds, "blockdev", || { let mut fs = empty_root(); let id = fs.push_leaf( stat(0o660, 0, 6, 0), LeafContent::BlockDevice(makedev(8, 0)), ); fs.root.insert(OsStr::new("sda"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("blockdev", image.into())); - } + fs + }); // 8. Socket - { + push_all_versions(&mut seeds, "socket", || { let mut fs = empty_root(); let id = fs.push_leaf(file_stat(), LeafContent::Socket); fs.root.insert(OsStr::new("mysock"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("socket", image.into())); - } + fs + }); // 9. Nested directories: /a/b/c/file - { + push_all_versions(&mut seeds, "nested_dirs", || { let mut fs = empty_root(); let id = fs.push_leaf( file_stat(), @@ -161,30 +180,26 @@ fn main() { let b = insert_dir(a, "b", dir_stat()); let c = insert_dir(b, "c", dir_stat()); c.insert(OsStr::new("file"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("nested_dirs", image.into())); - } + fs + }); // 10. 
Many entries (20+ files to exercise multi-block directories) - { + push_all_versions(&mut seeds, "many_entries", || { let mut fs = empty_root(); for i in 0..25 { let name = format!("file_{i:03}"); let content = format!("content of file {i}"); let id = fs.push_leaf( file_stat(), - LeafContent::Regular(RegularFile::Inline( - content.into_bytes().into_boxed_slice(), - )), + LeafContent::Regular(RegularFile::Inline(content.into_bytes().into_boxed_slice())), ); fs.root.insert(OsStr::new(&name), Inode::leaf(id)); } - let image = mkfs_erofs(&fs); - seeds.push(("many_entries", image.into())); - } + fs + }); // 11. Extended attributes - { + push_all_versions(&mut seeds, "xattrs", || { let mut fs = empty_root(); let mut xattrs = BTreeMap::new(); xattrs.insert( @@ -206,19 +221,16 @@ fn main() { )), ); fs.root.insert(OsStr::new("xattr_file"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("xattrs", image.into())); - } + fs + }); // 12. Mixed types — one of every file type in a single directory - { + push_all_versions(&mut seeds, "mixed_types", || { let mut fs = empty_root(); let ids = [ fs.push_leaf( file_stat(), - LeafContent::Regular(RegularFile::Inline( - b"data".to_vec().into_boxed_slice(), - )), + LeafContent::Regular(RegularFile::Inline(b"data".to_vec().into_boxed_slice())), ), fs.push_leaf( stat(0o777, 0, 0, 0), @@ -246,12 +258,11 @@ fn main() { LeafContent::Regular(RegularFile::External(hash, 4096)), ); fs.root.insert(OsStr::new("external"), Inode::leaf(ext_id)); - let image = mkfs_erofs(&fs); - seeds.push(("mixed_types", image.into())); - } + fs + }); // 13. 
Hardlink — two entries sharing the same LeafId (nlink > 1) - { + push_all_versions(&mut seeds, "hardlink", || { let mut fs = empty_root(); let shared_id = fs.push_leaf( file_stat(), @@ -263,12 +274,11 @@ fn main() { .insert(OsStr::new("original"), Inode::leaf(shared_id)); fs.root .insert(OsStr::new("hardlink"), Inode::leaf(shared_id)); - let image = mkfs_erofs(&fs); - seeds.push(("hardlink", image.into())); - } + fs + }); // 14. Large inline — file with maximum inline content (just under 4096 bytes) - { + push_all_versions(&mut seeds, "large_inline", || { let mut fs = empty_root(); let content = vec![0xABu8; 4000]; // just under block size let id = fs.push_leaf( @@ -277,18 +287,15 @@ fn main() { ); fs.root .insert(OsStr::new("large_inline.bin"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("large_inline", image.into())); - } + fs + }); // 15. Deep nesting — 8 levels of directories - { + push_all_versions(&mut seeds, "deep_nesting", || { let mut fs = empty_root(); let id = fs.push_leaf( file_stat(), - LeafContent::Regular(RegularFile::Inline( - b"deep".to_vec().into_boxed_slice(), - )), + LeafContent::Regular(RegularFile::Inline(b"deep".to_vec().into_boxed_slice())), ); let names = ["d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8"]; let mut current = &mut fs.root; @@ -296,45 +303,36 @@ fn main() { current = insert_dir(current, name, dir_stat()); } current.insert(OsStr::new("deep_file"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("deep_nesting", image.into())); - } + fs + }); // 16. 
Nonzero mtime - { + push_all_versions(&mut seeds, "nonzero_mtime", || { let mut fs = FileSystem::new(stat(0o755, 0, 0, 1000000)); let id1 = fs.push_leaf( stat(0o644, 0, 0, 500000), - LeafContent::Regular(RegularFile::Inline( - b"old file".to_vec().into_boxed_slice(), - )), + LeafContent::Regular(RegularFile::Inline(b"old file".to_vec().into_boxed_slice())), ); let id2 = fs.push_leaf( stat(0o644, 0, 0, 1700000000), - LeafContent::Regular(RegularFile::Inline( - b"new file".to_vec().into_boxed_slice(), - )), + LeafContent::Regular(RegularFile::Inline(b"new file".to_vec().into_boxed_slice())), ); fs.root.insert(OsStr::new("old"), Inode::leaf(id1)); fs.root.insert(OsStr::new("new"), Inode::leaf(id2)); - let image = mkfs_erofs(&fs); - seeds.push(("nonzero_mtime", image.into())); - } + fs + }); // 17. Large uid/gid — forces extended inodes - { + push_all_versions(&mut seeds, "large_uid_gid", || { let big_id = u16::MAX as u32 + 1; // 65536, won't fit in u16 let mut fs = FileSystem::new(stat(0o755, big_id, big_id, 0)); let id = fs.push_leaf( stat(0o644, big_id, big_id, 0), - LeafContent::Regular(RegularFile::Inline( - b"big ids".to_vec().into_boxed_slice(), - )), + LeafContent::Regular(RegularFile::Inline(b"big ids".to_vec().into_boxed_slice())), ); fs.root.insert(OsStr::new("bigids.txt"), Inode::leaf(id)); - let image = mkfs_erofs(&fs); - seeds.push(("large_uid_gid", image.into())); - } + fs + }); // Write seeds to corpus directories for both fuzz targets let targets = ["read_image", "debug_image"]; diff --git a/crates/composefs/src/dumpfile.rs b/crates/composefs/src/dumpfile.rs index 7143715d..8b3d6253 100644 --- a/crates/composefs/src/dumpfile.rs +++ b/crates/composefs/src/dumpfile.rs @@ -114,11 +114,12 @@ fn write_entry( let uid = stat.st_uid; let gid = stat.st_gid; let mtim_sec = stat.st_mtim_sec; + let mtim_nsec = stat.st_mtim_nsec; write_escaped(writer, path.as_os_str().as_bytes())?; write!( writer, - " {size} {mode:o} {nlink} {uid} {gid} {rdev} {mtim_sec}.0 " + " 
{size} {mode:o} {nlink} {uid} {gid} {rdev} {mtim_sec}.{mtim_nsec} " )?; write_escaped(writer, payload.as_ref().as_bytes())?; write!(writer, " ")?; @@ -422,7 +423,7 @@ pub fn add_entry_to_filesystem( // Handle root directory specially if path == Path::new("/") { - let stat = entry_to_stat(&entry); + let stat = entry_to_stat(&entry)?; fs.set_root_stat(stat); return Ok(()); } @@ -439,7 +440,7 @@ pub fn add_entry_to_filesystem( // Convert the entry to an inode let inode = match entry.item { Item::Directory { .. } => { - let stat = entry_to_stat(&entry); + let stat = entry_to_stat(&entry)?; Inode::Directory(Box::new(Directory::new(stat))) } Item::Hardlink { ref target } => { @@ -450,7 +451,7 @@ pub fn add_entry_to_filesystem( Inode::leaf(existing_id) } Item::RegularInline { ref content, .. } => { - let stat = entry_to_stat(&entry); + let stat = entry_to_stat(&entry)?; let data: Box<[u8]> = match content { std::borrow::Cow::Borrowed(d) => Box::from(*d), std::borrow::Cow::Owned(d) => d.clone().into_boxed_slice(), @@ -464,7 +465,7 @@ pub fn add_entry_to_filesystem( ref fsverity_digest, .. } => { - let stat = entry_to_stat(&entry); + let stat = entry_to_stat(&entry)?; let digest = fsverity_digest .as_ref() .ok_or_else(|| anyhow::anyhow!("External file missing fsverity digest"))?; @@ -473,10 +474,19 @@ pub fn add_entry_to_filesystem( let id = push_leaf(fs, stat, content); Inode::leaf(id) } - Item::Device { rdev, .. } => { - let stat = entry_to_stat(&entry); + Item::Device { rdev, nlink } => { // S_IFMT = 0o170000, S_IFBLK = 0o60000, S_IFCHR = 0o20000 - let content = if entry.mode & 0o170000 == 0o60000 { + let is_chardev = entry.mode & 0o170000 != 0o60000; + // A whiteout is a character device with rdev=0; hardlinked whiteouts + // are invalid because composefs cannot represent them correctly. 
+ if is_chardev && rdev == 0 && nlink > 1 { + anyhow::bail!( + "invalid dumpfile: whiteout entry {:?} has nlink > 1", + entry.path + ); + } + let stat = entry_to_stat(&entry)?; + let content = if !is_chardev { LeafContent::BlockDevice(rdev) } else { LeafContent::CharacterDevice(rdev) @@ -485,7 +495,7 @@ pub fn add_entry_to_filesystem( Inode::leaf(id) } Item::Symlink { ref target, .. } => { - let stat = entry_to_stat(&entry); + let stat = entry_to_stat(&entry)?; let target_os: Box = match target { std::borrow::Cow::Borrowed(t) => Box::from(t.as_os_str()), std::borrow::Cow::Owned(t) => Box::from(t.as_os_str()), @@ -495,11 +505,17 @@ pub fn add_entry_to_filesystem( Inode::leaf(id) } Item::Fifo { .. } => { - let stat = entry_to_stat(&entry); + let stat = entry_to_stat(&entry)?; let content = LeafContent::Fifo; let id = push_leaf(fs, stat, content); Inode::leaf(id) } + Item::Socket { .. } => { + let stat = entry_to_stat(&entry)?; + let content = LeafContent::Socket; + let id = push_leaf(fs, stat, content); + Inode::leaf(id) + } }; // Store LeafIds in the hardlinks map for future hardlink lookups @@ -521,7 +537,7 @@ pub fn add_entry_to_filesystem( } /// Convert a dumpfile Entry's metadata into a tree Stat structure. -fn entry_to_stat(entry: &Entry<'_>) -> Stat { +fn entry_to_stat(entry: &Entry<'_>) -> Result { let mut xattrs = BTreeMap::new(); for xattr in &entry.xattrs { let key: Box = match &xattr.key { @@ -535,13 +551,19 @@ fn entry_to_stat(entry: &Entry<'_>) -> Stat { xattrs.insert(key, value); } - Stat { + let nsec = entry.mtime.nsec; + if nsec >= 1_000_000_000 { + anyhow::bail!("Invalid mtime nanoseconds: {nsec} (must be < 1_000_000_000)"); + } + + Ok(Stat { st_mode: entry.mode & 0o7777, // Keep only permission bits st_uid: entry.uid, st_gid: entry.gid, st_mtim_sec: entry.mtime.sec as i64, + st_mtim_nsec: nsec as u32, xattrs, - } + }) } /// Parse a dumpfile string and build a complete FileSystem. 
@@ -566,7 +588,7 @@ pub fn dumpfile_to_filesystem( "Dumpfile must start with root directory entry, found: {:?}", entry.path ); - break entry_to_stat(&entry); + break entry_to_stat(&entry)?; } None => anyhow::bail!("Dumpfile is empty, expected root directory entry"), } @@ -591,6 +613,19 @@ pub fn dumpfile_to_filesystem( Ok(fs) } +/// Parse a composefs dumpfile string and validate the resulting filesystem +/// for EROFS serialization. +/// +/// Combines [`dumpfile_to_filesystem`] with [`ValidatedFileSystem::new`]. +/// Returns an error if the dumpfile is malformed or if the resulting +/// filesystem violates EROFS invariants (e.g. hardlinked whiteouts). +pub fn dumpfile_to_validated_filesystem( + dumpfile: &str, +) -> anyhow::Result> { + let fs = dumpfile_to_filesystem(dumpfile)?; + crate::erofs::writer::ValidatedFileSystem::new(fs) +} + #[cfg(test)] mod tests { use super::*; @@ -724,6 +759,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), }); let leaf_id = fs.push_leaf( @@ -732,6 +768,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs, }, LeafContent::Regular(RegularFile::Inline(b"test".to_vec().into())), @@ -757,6 +794,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), }; @@ -793,6 +831,23 @@ mod tests { Ok(()) } + /// A whiteout (chardev, rdev=0) with nlink > 1 must be rejected. 
+ #[test] + fn test_hardlinked_whiteout_rejected() { + // /foo 0 20000 2 0 0 0 0.0 - - - + // ^size ^mode ^nlink ^uid ^gid ^rdev ^mtime ^payload ^digest ^xattrs + // mode 20000 = S_IFCHR (character device), rdev=0 → whiteout, nlink=2 + let dumpfile = "/ 0 40755 2 0 0 0 0.0 - - -\n\ + /foo 0 20000 2 0 0 0 0.0 - - -\n"; + let result = dumpfile_to_filesystem::(dumpfile); + let err = result.expect_err("hardlinked whiteout must be rejected"); + let msg = format!("{err:#}"); + assert!( + msg.contains("nlink"), + "error should mention nlink, got: {msg}" + ); + } + /// Helper to escape bytes through write_escaped and return the result. fn escaped(bytes: &[u8]) -> String { let mut out = String::new(); diff --git a/crates/composefs/src/dumpfile_parse.rs b/crates/composefs/src/dumpfile_parse.rs index f8cccdd8..f01a28a3 100644 --- a/crates/composefs/src/dumpfile_parse.rs +++ b/crates/composefs/src/dumpfile_parse.rs @@ -121,6 +121,11 @@ pub enum Item<'p> { /// Number of links nlink: u32, }, + /// A Unix domain socket + Socket { + /// Number of links + nlink: u32, + }, /// A directory Directory { /// Number of links @@ -482,7 +487,10 @@ impl<'p> Entry<'p> { Item::Directory { nlink } } FileType::Socket => { - anyhow::bail!("sockets are not supported"); + Self::check_nonregfile(content, fsverity_digest)?; + Self::check_rdev(rdev)?; + + Item::Socket { nlink } } FileType::Unknown => { anyhow::bail!("Unhandled file type from raw mode: {mode}") @@ -532,6 +540,7 @@ impl Item<'_> { Item::Symlink { nlink, .. } => *nlink, Item::Directory { nlink, .. } => *nlink, Item::Fifo { nlink, .. } => *nlink, + Item::Socket { nlink, .. 
} => *nlink, _ => 0, } } diff --git a/crates/composefs/src/erofs/composefs.rs b/crates/composefs/src/erofs/composefs.rs index 3acf1844..960064fa 100644 --- a/crates/composefs/src/erofs/composefs.rs +++ b/crates/composefs/src/erofs/composefs.rs @@ -7,19 +7,23 @@ use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout}; use crate::fsverity::FsVerityHashValue; -/* From linux/fs/overlayfs/overlayfs.h struct ovl_metacopy */ +/// Overlay metacopy xattr structure for fs-verity digest storage. +/// +/// From linux/fs/overlayfs/overlayfs.h struct ovl_metacopy #[derive(Debug, FromBytes, Immutable, KnownLayout, IntoBytes)] #[repr(C)] -pub(super) struct OverlayMetacopy { +pub struct OverlayMetacopy { version: u8, len: u8, flags: u8, digest_algo: u8, - pub(super) digest: H, + /// The fs-verity digest value. + pub digest: H, } impl OverlayMetacopy { - pub(super) fn new(digest: &H) -> Self { + /// Creates a new overlay metacopy entry with the given digest. + pub fn new(digest: &H) -> Self { Self { version: 0, len: size_of::() as u8, @@ -29,7 +33,8 @@ impl OverlayMetacopy { } } - pub(super) fn valid(&self) -> bool { + /// Checks whether this metacopy entry is valid. + pub fn valid(&self) -> bool { self.version == 0 && self.len == size_of::() as u8 && self.flags == 0 diff --git a/crates/composefs/src/erofs/debug.rs b/crates/composefs/src/erofs/debug.rs index d384d7de..e4905cf7 100644 --- a/crates/composefs/src/erofs/debug.rs +++ b/crates/composefs/src/erofs/debug.rs @@ -341,64 +341,6 @@ impl<'img> ImageVisitor<'img> { } } - fn visit_directory_block(&mut self, block: &DirectoryBlock, path: &Path) -> Result<()> { - for entry in block.entries()? { - let entry = entry?; - if entry.name == b"." || entry.name == b".." 
{ - // TODO: maybe we want to follow those and let deduplication happen - continue; - } - self.visit_inode( - entry.header.inode_offset.get(), - &path.join(OsStr::from_bytes(entry.name)), - )?; - } - Ok(()) - } - - fn visit_inode(&mut self, id: u64, path: &Path) -> Result<()> { - let inode = self.image.inode(id)?; - let segment = match inode { - InodeType::Compact(inode) => SegmentType::CompactInode(inode), - InodeType::Extended(inode) => SegmentType::ExtendedInode(inode), - }; - if self.note(segment, Some(path))? { - // TODO: maybe we want to throw an error if we detect loops - /* already processed */ - return Ok(()); - } - - if let Some(xattrs) = inode.xattrs()? { - for id in xattrs.shared()? { - self.note( - SegmentType::XAttr(self.image.shared_xattr(id.get())?), - Some(path), - )?; - } - } - - if inode.mode().is_dir() { - if let Some(inline) = inode.inline() { - let inline_block = DirectoryBlock::ref_from_bytes(inline) - .map_err(|_| anyhow::anyhow!("invalid inline directory block"))?; - self.visit_directory_block(inline_block, path)?; - } - - for id in self.image.inode_blocks(&inode)? { - let block = self.image.directory_block(id)?; - self.visit_directory_block(block, path)?; - self.note(SegmentType::DirectoryBlock(block), Some(path))?; - } - } else { - for id in self.image.inode_blocks(&inode)? { - let block = self.image.data_block(id)?; - self.note(SegmentType::DataBlock(block), Some(path))?; - } - } - - Ok(()) - } - #[allow(clippy::type_complexity)] fn visit_image( image: &'img Image<'img>, @@ -409,7 +351,70 @@ impl<'img> ImageVisitor<'img> { }; this.note(SegmentType::Header(image.header), None)?; this.note(SegmentType::Superblock(image.sb), None)?; - this.visit_inode(image.sb.root_nid.get() as u64, &PathBuf::from("/"))?; + + // Iterative traversal: push (nid, path) pairs rather than recursing. + // The previous mutual recursion (visit_inode ↔ visit_directory_block) + // had no depth limit and would stack-overflow on deeply nested images. 
+ // Deduplication is by byte offset via note(), so cycles and hardlinks + // are safe: note() returns true on a second visit and we skip children. + let mut stack: Vec<(u64, PathBuf)> = + vec![(image.sb.root_nid.get() as u64, PathBuf::from("/"))]; + + while let Some((id, path)) = stack.pop() { + let inode = this.image.inode(id)?; + let segment = match inode { + InodeType::Compact(inode) => SegmentType::CompactInode(inode), + InodeType::Extended(inode) => SegmentType::ExtendedInode(inode), + }; + if this.note(segment, Some(&path))? { + // Already visited this byte offset — additional path recorded, skip children. + continue; + } + + if let Some(xattrs) = inode.xattrs()? { + for xid in xattrs.shared()? { + this.note( + SegmentType::XAttr(this.image.shared_xattr(xid.get())?), + Some(&path), + )?; + } + } + + if inode.mode().is_dir() { + if let Some(inline) = inode.inline() { + let inline_block = DirectoryBlock::ref_from_bytes(inline) + .map_err(|_| anyhow::anyhow!("invalid inline directory block"))?; + for entry in inline_block.entries()? { + let entry = entry?; + if entry.name != b"." && entry.name != b".." { + stack.push(( + entry.header.inode_offset.get(), + path.join(OsStr::from_bytes(entry.name)), + )); + } + } + } + for blkid in this.image.inode_blocks(&inode)? { + let block = this.image.directory_block(blkid)?; + for entry in block.entries()? { + let entry = entry?; + if entry.name != b"." && entry.name != b".." { + stack.push(( + entry.header.inode_offset.get(), + path.join(OsStr::from_bytes(entry.name)), + )); + } + } + this.note(SegmentType::DirectoryBlock(block), Some(&path))?; + } + } else { + for blkid in this.image.inode_blocks(&inode)? 
{ + let block = this.image.data_block(blkid)?; + this.note(SegmentType::DataBlock(block), Some(&path))?; + } + } + } + Ok(this.visited) } } diff --git a/crates/composefs/src/erofs/format.rs b/crates/composefs/src/erofs/format.rs index cc5a40a2..565bf41a 100644 --- a/crates/composefs/src/erofs/format.rs +++ b/crates/composefs/src/erofs/format.rs @@ -81,7 +81,7 @@ const INODE_DATALAYOUT_FLAT_INLINE: u16 = 4; const INODE_DATALAYOUT_CHUNK_BASED: u16 = 8; /// Data layout method for file content storage -#[derive(Debug)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] #[repr(u16)] pub enum DataLayout { /// File data stored in separate blocks @@ -271,11 +271,150 @@ impl std::ops::BitOr for FileType { /// EROFS format version number pub const VERSION: U32 = U32::new(1); -/// Composefs-specific version number +/// Composefs-specific version number (V2, Rust-native format) pub const COMPOSEFS_VERSION: U32 = U32::new(2); +/// Composefs-specific version number for V0 (C-compatible, no user whiteouts) +pub const COMPOSEFS_VERSION_V0: U32 = U32::new(0); +/// Composefs-specific version number for V1 (C-compatible, composefs_version=1 always) +pub const COMPOSEFS_VERSION_V1: U32 = U32::new(1); /// Magic number identifying composefs images pub const COMPOSEFS_MAGIC: U32 = U32::new(0xd078629a); +/// Format version for composefs images +/// +/// This enum represents the different format versions supported by composefs. +/// The format version affects the on-disk layout, the `composefs_version` header +/// field, and build-time handling. +/// +/// Serialized as an integer for `meta.json`: V0 → `0`, V1 → `1`, V2 → `2`. +#[repr(u32)] +#[derive( + Clone, + Copy, + Debug, + Default, + Hash, + PartialEq, + Eq, + PartialOrd, + Ord, + serde_repr::Serialize_repr, + serde_repr::Deserialize_repr, +)] +pub enum FormatVersion { + /// Format V0: compact inodes, BFS ordering, whiteout table. + /// + /// Byte-for-byte compatible with C `mkcomposefs` (default mode). 
+ /// Build time is set to the minimum mtime across all inodes. + /// The `composefs_version` header field is `0` normally, auto-bumped to `1` + /// when user-visible whiteout char-devices are present in the input tree + /// (matching C `mkcomposefs` behaviour). + V0 = 0, + /// Format V1: same EROFS layout as V0 (compact inodes, BFS, whiteout table), + /// but `composefs_version` is always `1` in the header. + /// + /// Equivalent to C `mkcomposefs --min-version=1`. Recommended default for + /// new repositories: unconditionally signals support for data-only layers. + V1 = 1, + /// Format V2: extended inodes, DFS ordering, no whiteout table, + /// `composefs_version=2`. + /// + /// This is the composefs-rs native format, used by older bootc deployments. + #[default] + V2 = 2, +} + +/// The on-disk layout epoch — collapses the three [`FormatVersion`]s into two +/// structurally distinct layouts. +/// +/// V0 and V1 share the same EROFS layout (compact inodes, BFS ordering, whiteout +/// table); they differ only in the `composefs_version` header value. V2 uses an +/// entirely different layout (extended inodes, DFS ordering, no whiteout table). +/// +/// Use `version.epoch()` in `match` arms that select between the two layouts. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FormatEpoch { + /// Compact-inode layout (V0 and V1). + Epoch1, + /// Extended-inode layout (V2). + Epoch2, +} + +impl FormatVersion { + /// Returns the [`FormatEpoch`] — the structural on-disk layout family. + /// + /// V0 and V1 both map to [`FormatEpoch::Epoch1`] (same EROFS layout, different + /// `composefs_version` header value). V2 maps to [`FormatEpoch::Epoch2`]. + pub fn epoch(self) -> FormatEpoch { + match self { + FormatVersion::V0 | FormatVersion::V1 => FormatEpoch::Epoch1, + FormatVersion::V2 => FormatEpoch::Epoch2, + } + } + + /// Returns the `composefs_version` field value written to the EROFS header. 
+ pub fn composefs_version(self) -> U32 { + match self { + FormatVersion::V0 => COMPOSEFS_VERSION_V0, + FormatVersion::V1 => COMPOSEFS_VERSION_V1, + FormatVersion::V2 => COMPOSEFS_VERSION, + } + } +} + +/// Configuration for which EROFS format versions to generate when committing images. +/// +/// `default` is the primary format — it receives any named ref when committing +/// images and is the version returned by single-format operations. `extra` +/// lists additional format versions to generate alongside it (no named ref). +/// Duplicates of `default` in `extra` are silently ignored. +/// +/// Persisted directly in `meta.json` as `{"default": , "extra": [...]}`. +/// Single-format configs serialize compactly as `{"default": }` because +/// `extra` is skipped when empty. +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct FormatConfig { + /// Primary format version (gets the named ref; used by `commit_image`). + pub default: FormatVersion, + /// Additional format versions to generate alongside `default`. + #[serde(default, skip_serializing_if = "std::collections::BTreeSet::is_empty")] + pub extra: std::collections::BTreeSet, +} + +impl Default for FormatConfig { + /// Returns a single-V2 config, matching the default for newly created repositories. + fn default() -> Self { + Self::single(FormatVersion::V2) + } +} + +impl FormatConfig { + /// Create a config that generates only a single format version. + pub fn single(v: FormatVersion) -> Self { + Self { + default: v, + extra: std::collections::BTreeSet::new(), + } + } + + /// Iterate all format versions in generation order: `default` first, then + /// `extra` in sorted order (excluding any duplicate of `default`). 
+ pub fn versions(&self) -> impl Iterator + '_ { + std::iter::once(self.default).chain( + self.extra + .iter() + .copied() + .filter(move |v| *v != self.default), + ) + } +} + +impl From for FormatConfig { + fn from(v: FormatVersion) -> Self { + Self::single(v) + } +} + /// Flag indicating the presence of ACL data pub const COMPOSEFS_FLAGS_HAS_ACL: U32 = U32::new(1 << 0); @@ -493,7 +632,52 @@ pub struct XAttrHeader { pub value_size: U16, } -/// Standard xattr name prefixes indexed by name_index +/// EROFS xattr prefix index for `system.posix_acl_access` (index 2). +pub const XATTR_INDEX_POSIX_ACL_ACCESS: u8 = 2; +/// EROFS xattr prefix index for `system.posix_acl_default` (index 3). +pub const XATTR_INDEX_POSIX_ACL_DEFAULT: u8 = 3; +/// EROFS xattr prefix index for `lustre.` (index 5). +/// Absent from C mkcomposefs v1.0.8's prefix table; V1 writer skips it. +pub const XATTR_INDEX_LUSTRE: u8 = 5; + +// Overlay xattr keys used by composefs V1 whiteout escaping. +// Named to match the C mkcomposefs OVERLAY_XATTR_* constants. +/// `trusted.overlay.overlay.whiteout` — V1 escaped whiteout marker. +pub const XATTR_OVERLAY_WHITEOUT: &[u8] = b"trusted.overlay.overlay.whiteout"; +/// `user.overlay.whiteout` — userxattr escaped whiteout marker. +pub const XATTR_USERXATTR_WHITEOUT: &[u8] = b"user.overlay.whiteout"; +/// `trusted.overlay.overlay.whiteouts` — escaped whiteouts directory marker. +pub const XATTR_OVERLAY_WHITEOUTS: &[u8] = b"trusted.overlay.overlay.whiteouts"; +/// `user.overlay.whiteouts` — userxattr whiteouts directory marker. +pub const XATTR_USERXATTR_WHITEOUTS: &[u8] = b"user.overlay.whiteouts"; +/// `trusted.overlay.overlay.opaque` — escaped opaque directory marker. +pub const XATTR_OVERLAY_OPAQUE: &[u8] = b"trusted.overlay.overlay.opaque"; +/// `user.overlay.opaque` — userxattr opaque directory marker. +pub const XATTR_USERXATTR_OPAQUE: &[u8] = b"user.overlay.opaque"; +/// `trusted.overlay.opaque` — root opaque marker written by V1 writer. 
+pub const XATTR_OVERLAY_OPAQUE_ROOT: &[u8] = b"trusted.overlay.opaque"; +/// `trusted.overlay.metacopy` — metacopy marker (C adds redirect xattr too). +pub const XATTR_OVERLAY_METACOPY: &[u8] = b"trusted.overlay.metacopy"; +/// `trusted.overlay.redirect` — redirect target xattr. +pub const XATTR_OVERLAY_REDIRECT: &[u8] = b"trusted.overlay.redirect"; +/// `trusted.overlay.` prefix — all xattrs with this prefix are escaped in V1. +pub const XATTR_OVERLAY_PREFIX: &[u8] = b"trusted.overlay."; +/// `trusted.overlay.overlay.` prefix — escaped overlay xattr prefix. +pub const XATTR_OVERLAY_ESCAPED_PREFIX: &[u8] = b"trusted.overlay.overlay."; +/// `security.selinux` — SELinux label, copied to overlay whiteout stubs. +pub const XATTR_SECURITY_SELINUX: &[u8] = b"security.selinux"; + +/// Standard xattr name prefixes indexed by EROFS name_index. +/// +/// Index 0 is the fallback (empty prefix, full name stored as suffix). +/// Indices 1–6 map to the well-known EROFS prefix constants: +/// EROFS_XATTR_INDEX_USER=1, POSIX_ACL_ACCESS=2, POSIX_ACL_DEFAULT=3, +/// EROFS_XATTR_INDEX_TRUSTED=4, EROFS_XATTR_INDEX_LUSTRE=5, EROFS_XATTR_INDEX_SECURITY=6. +/// +/// **V1 compatibility note:** C mkcomposefs v1.0.8 does NOT include `lustre.` (index 5) +/// in its prefix table. Any `lustre.*` xattr is therefore encoded with prefix index 0 +/// (raw fallback) by C. For V1 images the writer must skip index 5 during prefix +/// matching so that `lustre.*` xattrs fall through to the empty-string fallback. 
pub const XATTR_PREFIXES: [&[u8]; 7] = [ b"", b"user.", @@ -519,3 +703,65 @@ pub struct DirectoryEntryHeader { /// Reserved field pub reserved: u8, } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_format_config_single() { + let cfg = FormatConfig::single(FormatVersion::V1); + let versions: Vec<_> = cfg.versions().collect(); + assert_eq!(versions, vec![FormatVersion::V1]); + } + + #[test] + fn test_format_config_with_extra() { + let cfg = FormatConfig { + default: FormatVersion::V1, + extra: [FormatVersion::V2].into(), + }; + let versions: Vec<_> = cfg.versions().collect(); + // default first, then extra in sorted order + assert_eq!(versions, vec![FormatVersion::V1, FormatVersion::V2]); + } + + #[test] + fn test_format_config_dedup() { + // Duplicating default in extra must not yield it twice. + let cfg = FormatConfig { + default: FormatVersion::V1, + extra: [FormatVersion::V1, FormatVersion::V2].into(), + }; + let versions: Vec<_> = cfg.versions().collect(); + assert_eq!(versions, vec![FormatVersion::V1, FormatVersion::V2]); + } + + #[test] + fn test_format_config_from() { + assert_eq!( + FormatConfig::from(FormatVersion::V1), + FormatConfig::single(FormatVersion::V1) + ); + } + + #[test] + fn test_format_version_ordering() { + assert!(FormatVersion::V0 < FormatVersion::V1); + assert!(FormatVersion::V1 < FormatVersion::V2); + } + + #[test] + fn test_format_version_epoch() { + assert_eq!(FormatVersion::V0.epoch(), FormatEpoch::Epoch1); + assert_eq!(FormatVersion::V1.epoch(), FormatEpoch::Epoch1); + assert_eq!(FormatVersion::V2.epoch(), FormatEpoch::Epoch2); + } + + #[test] + fn test_composefs_version_values() { + assert_eq!(FormatVersion::V0.composefs_version().get(), 0); + assert_eq!(FormatVersion::V1.composefs_version().get(), 1); + assert_eq!(FormatVersion::V2.composefs_version().get(), 2); + } +} diff --git a/crates/composefs/src/erofs/reader.rs b/crates/composefs/src/erofs/reader.rs index 06f09932..5d9f405f 100644 --- 
a/crates/composefs/src/erofs/reader.rs +++ b/crates/composefs/src/erofs/reader.rs @@ -17,7 +17,8 @@ use zerocopy::{FromBytes, Immutable, KnownLayout, little_endian::U32}; use super::{ composefs::OverlayMetacopy, format::{ - self, BLOCK_BITS, COMPOSEFS_MAGIC, CompactInodeHeader, ComposefsHeader, DataLayout, + self, BLOCK_BITS, COMPOSEFS_MAGIC, COMPOSEFS_VERSION, COMPOSEFS_VERSION_V0, + COMPOSEFS_VERSION_V1, CompactInodeHeader, ComposefsHeader, DataLayout, DirectoryEntryHeader, ExtendedInodeHeader, InodeXAttrHeader, MAGIC_V1, ModeField, S_IFBLK, S_IFCHR, S_IFIFO, S_IFLNK, S_IFMT, S_IFREG, S_IFSOCK, Superblock, VERSION, XATTR_PREFIXES, XAttrHeader, @@ -494,8 +495,21 @@ impl<'img> Image<'img> { self.header.version.get(), ))); } - // Note: we don't enforce composefs_version here because C mkcomposefs - // writes version 0 while the Rust writer uses version 2. Both are valid. + // Reject unknown composefs versions. + // 0 = V1 (C-compatible, no user whiteouts) + // 1 = V1 (C-compatible, user whiteouts present — C bumps version when it + // encounters a char-device-rdev-0 entry in the input tree) + // 2 = V2 (Rust-native format) + let cv = self.header.composefs_version.get(); + if cv != COMPOSEFS_VERSION.get() + && cv != COMPOSEFS_VERSION_V1.get() + && cv != COMPOSEFS_VERSION_V0.get() + { + return Err(ErofsReaderError::InvalidImage(format!( + "unknown composefs_version {cv} (expected 0, 1, or {})", + COMPOSEFS_VERSION.get(), + ))); + } // Validate EROFS superblock magic if self.sb.magic != MAGIC_V1 { @@ -649,17 +663,29 @@ impl<'img> Image<'img> { } /// Returns a data block by its ID + /// Returns a byte slice of the image at `[offset, offset+len)`, validating + /// that both the offset and the range lie within the image. + /// + /// This is the single choke point for all raw byte accesses derived from + /// image fields (block addresses, xattr offsets, etc.). 
All callers that + /// compute `blkaddr * block_size + delta` should go through here rather + /// than slicing `self.image` directly. + pub fn image_slice(&self, offset: usize, len: usize) -> Result<&[u8], ErofsReaderError> { + let end = offset + .checked_add(len) + .ok_or(ErofsReaderError::OutOfBounds)?; + self.image + .get(offset..end) + .ok_or(ErofsReaderError::OutOfBounds) + } + + /// Returns a block by its ID as a raw byte slice, validated against the image size. pub fn block(&self, id: u64) -> Result<&[u8], ErofsReaderError> { let start = usize::try_from(id) .ok() .and_then(|id| id.checked_mul(self.block_size)) .ok_or(ErofsReaderError::OutOfBounds)?; - let end = start - .checked_add(self.block_size) - .ok_or(ErofsReaderError::OutOfBounds)?; - self.image - .get(start..end) - .ok_or(ErofsReaderError::OutOfBounds) + self.image_slice(start, self.block_size) } /// Returns a data block by its ID as a DataBlock reference @@ -711,6 +737,303 @@ impl<'img> Image<'img> { Ok(range) } + /// Performs a full structural fsck of the image metadata by traversing the + /// entire inode tree. + /// + /// This is separate from [`Self::restrict_to_composefs`], which only checks + /// superblock and header fields without any traversal. Call this when you + /// want a thorough integrity check (e.g. during repository fsck) rather than + /// just the cheap open-time validation. + /// + /// Currently checks: + /// - V1 images: no FlatInline symlink inode has a block-boundary layout that + /// old Linux kernels (< 6.12) would reject with `EFSCORRUPTED` (`EUCLEAN`). + /// - Epoch-invariant rules (see [`Self::validate_epoch_invariants`]). + pub fn fsck_metadata(&self) -> Result<(), ErofsReaderError> { + self.validate_v1_inline_layout()?; + self.validate_epoch_invariants() + } + + /// Validates epoch-invariant structural rules that must hold for every + /// well-formed composefs image depending on its format epoch. 
+ /// + /// **Epoch1** (`composefs_version` 0 or 1 — compact inodes, BFS, whiteout table): + /// 1. The root directory must contain exactly 256 entries with 2-hex-char names + /// (the whiteout stub table slots, 00–ff). Each slot may be occupied by + /// either a native whiteout stub (char-device rdev=0,0), an escaped whiteout + /// (regular file + `trusted.overlay.overlay.whiteout` xattr), or a + /// pre-existing user entry that shadowed the stub during image creation. + /// 2. No native whiteout (char-device rdev=0,0) may appear outside the root + /// stub table. + /// + /// **Epoch2** (`composefs_version` 2 — extended inodes, DFS, no whiteout table): + /// 1. The root directory must contain no entries with a 2-hex-char name that + /// are native whiteouts or escaped whiteouts (i.e., no stub-pattern entries). + /// 2. No escaped whiteout (regular file + `trusted.overlay.overlay.whiteout` + /// xattr) may appear anywhere in the tree. + fn validate_epoch_invariants(&self) -> Result<(), ErofsReaderError> { + let cv = self.header.composefs_version.get(); + let is_epoch1 = + cv == format::COMPOSEFS_VERSION_V0.get() || cv == format::COMPOSEFS_VERSION_V1.get(); + let is_epoch2 = cv == format::COMPOSEFS_VERSION.get(); + + // Only validate images with a known composefs_version; images opened + // without restrict_to_composefs() may have an arbitrary version field. + if !is_epoch1 && !is_epoch2 { + return Ok(()); + } + + let root_nid = self.sb.root_nid.get() as u64; + + // Helper: return true iff a name is exactly two *lowercase* hex digits. + // The stub table uses lowercase names (00..ff) generated by format!("{:02x}"). + // Uppercase hex names (e.g. "AB") are distinct entries and are not stub slots. + let is_lowercase_hex2 = |name: &[u8]| -> bool { + name.len() == 2 + && name + .iter() + .all(|b| b.is_ascii_digit() || matches!(b, b'a'..=b'f')) + }; + + // --- Walk all directories (BFS) --- + // We use an explicit stack to avoid recursion depth issues. 
+ let mut stack = vec![root_nid]; + let mut visited: std::collections::HashSet = std::collections::HashSet::new(); + + // Track the number of hex-named root entries (used for Epoch1 check 1). + // Each of the 256 slots (00..ff) must appear exactly once. + let mut root_hex_slots: std::collections::HashSet<[u8; 2]> = + std::collections::HashSet::new(); + + while let Some(dir_nid) = stack.pop() { + if !visited.insert(dir_nid) { + continue; + } + + let dir_inode = match self.inode(dir_nid) { + Ok(i) => i, + Err(_) => continue, + }; + + if !dir_inode.mode().is_dir() { + continue; + } + + let is_root = dir_nid == root_nid; + + // Collect children from both block-based and inline directory data. + let mut children: Vec<(Vec, u64)> = Vec::new(); + + if let Ok(range) = self.inode_blocks(&dir_inode) { + for blkid in range { + if let Ok(block) = self.directory_block(blkid) + && let Ok(entries) = block.entries() + { + for entry in entries.flatten() { + if entry.name != b"." && entry.name != b".." { + children.push((entry.name.to_vec(), entry.nid())); + } + } + } + } + } + + if let Some(inline) = dir_inode.inline() + && let Ok(block) = DirectoryBlock::ref_from_bytes(inline) + && let Ok(entries) = block.entries() + { + for entry in entries.flatten() { + if entry.name != b"." && entry.name != b".." { + children.push((entry.name.to_vec(), entry.nid())); + } + } + } + + for (name, child_nid) in children { + let child_inode = match self.inode(child_nid) { + Ok(i) => i, + Err(_) => continue, + }; + + // Track hex-named root entries for Epoch1 stub table check. + if is_epoch1 && is_root && is_lowercase_hex2(&name) { + root_hex_slots.insert([name[0], name[1]]); + } + + // Recurse into subdirectories. 
+ if child_inode.mode().is_dir() { + stack.push(child_nid); + continue; + } + + let is_native_whiteout = child_inode.is_whiteout(); + let is_escaped = is_escaped_v1_whiteout(self, &child_inode) + .map_err(|e| ErofsReaderError::InvalidImage(e.to_string()))?; + + if is_epoch1 { + if !is_root && is_native_whiteout { + // Epoch1 must not have native whiteouts outside the root stub table. + return Err(ErofsReaderError::InvalidImage( + "Epoch1 image contains native whiteout outside root stubs".into(), + )); + } + } else { + // is_epoch2: native whiteouts (char-device 0,0) are valid user + // whiteouts regardless of their name or location. The only thing + // that must not appear is an *escaped* whiteout (regular file + + // trusted.overlay.overlay.whiteout xattr), which is a V1-only + // encoding and has no place in a V2 image. + if is_escaped { + return Err(ErofsReaderError::InvalidImage( + "Epoch2 image contains escaped whiteout".into(), + )); + } + } + } + } + + // Epoch1: every hex slot 00..ff must be present in the root. + // The writer fills missing slots with stub entries; slots occupied by + // pre-existing user content are also valid (the stub was skipped). + if is_epoch1 && root_hex_slots.len() != 256 { + return Err(ErofsReaderError::InvalidImage(format!( + "Epoch1 image has {} hex-named root entries, expected 256 (00–ff)", + root_hex_slots.len(), + ))); + } + + Ok(()) + } + + /// Validates that the image does not contain FlatInline inodes with a layout + /// that old Linux kernels (< 6.12) would reject with `EFSCORRUPTED` (`EUCLEAN`). + /// + /// Only V1 (C-compatible, `composefs_version` = 0 or 1) images are expected to be + /// mounted on kernels that may predate the 6.12 fix; V2 images use a different + /// block-boundary strategy that is frozen for digest stability, so this check + /// is deliberately restricted to V1. 
+ /// + /// The kernel's pre-6.12 fast-symlink path checks: + /// ```text + /// (inode_offset % block_size) + inode_and_xattr_size + inline_size > block_size + /// ``` + /// and returns `-EFSCORRUPTED` if true. This method returns an error for any + /// inode where that condition holds. + fn validate_v1_inline_layout(&self) -> Result<(), ErofsReaderError> { + // Only applies to V1 (C-compatible) images: composefs_version 0 (no user + // whiteouts) or 1 (user whiteouts present). V2 images (composefs_version=2) + // use a frozen layout strategy and are never mounted on pre-6.12 kernels. + let cv = self.header.composefs_version.get(); + if cv >= format::COMPOSEFS_VERSION.get() { + return Ok(()); + } + + let block_size = self.block_size as u64; + + // Walk all reachable inodes from the root rather than iterating raw nid slots. + // The inode table is not densely packed — gaps arise from padding — so + // iterating 0..sb.inos by slot can hit mid-inode bytes that accidentally + // parse as valid-looking headers with garbage xattr_icount values. + let mut stack = vec![self.sb.root_nid.get() as u64]; + let mut visited = std::collections::HashSet::new(); + + while let Some(nid) = stack.pop() { + if !visited.insert(nid) { + continue; + } + let inode = match self.inode(nid) { + Ok(i) => i, + Err(_) => continue, + }; + + // Recurse into directories to find all symlink inodes. + if inode.mode().is_dir() { + // Collect child nids from both inline and block directory data. + let mut child_nids: Vec = Vec::new(); + if let Some(inline) = inode.inline() + && let Ok(block) = DirectoryBlock::ref_from_bytes(inline) + && let Ok(entries) = block.entries() + { + for entry in entries.flatten() { + let name = entry.name; + if name == b"." || name == b".." 
{ + continue; + } + child_nids.push(entry.nid()); + } + } + if let Ok(range) = self.inode_blocks(&inode) { + for blkid in range { + if let Ok(block) = self.directory_block(blkid) + && let Ok(entries) = block.entries() + { + for entry in entries.flatten() { + let name = entry.name; + if name == b"." || name == b".." { + continue; + } + child_nids.push(entry.nid()); + } + } + } + } + stack.extend(child_nids); + continue; + } + + // Only the pre-6.12 symlink fast-path checks the block boundary. + let mode = inode.mode().0.get(); + if mode & S_IFMT != S_IFLNK { + continue; + } + + let layout = match inode.data_layout() { + Ok(l) => l, + Err(_) => continue, + }; + if !matches!(layout, DataLayout::FlatInline) { + continue; // symlink stored out-of-band (long target > block_size) + } + + let inline_size = inode.size() % block_size; + if inline_size == 0 { + continue; + } + + // nid * 32 is the byte offset from meta_start (which is 0 for composefs). + let inode_offset = nid + .checked_mul(32) + .ok_or_else(|| ErofsReaderError::InvalidImage("nid overflow".into()))?; + let inode_pos_in_block = inode_offset % block_size; + + let header_size: u64 = match &inode { + InodeType::Compact(_) => size_of::() as u64, + InodeType::Extended(_) => size_of::() as u64, + }; + let xattr_size = inode.xattr_size() as u64; + let inode_and_xattr_size = header_size.checked_add(xattr_size).ok_or_else(|| { + ErofsReaderError::InvalidImage("inode+xattr size overflow".into()) + })?; + + let total = inode_pos_in_block + .checked_add(inode_and_xattr_size) + .and_then(|t| t.checked_add(inline_size)) + .ok_or_else(|| { + ErofsReaderError::InvalidImage("inline layout size overflow".into()) + })?; + if total > block_size { + return Err(ErofsReaderError::InvalidImage(format!( + "inode at nid {nid} (FlatInline symlink, inode_pos_in_block={inode_pos_in_block}, \ + inode_and_xattr_size={inode_and_xattr_size}, inline_size={inline_size}) \ + would trigger EUCLEAN on kernels older than 6.12: \ + 
{inode_pos_in_block} + {inode_and_xattr_size} + {inline_size} = {total} > {block_size}" + ))); + } + } + + Ok(()) + } + /// Finds a child directory entry by name within a directory inode. /// /// Returns the nid (inode number) of the child if found. @@ -743,6 +1066,41 @@ impl<'img> Image<'img> { } } +/// Check if an inode is a V1 escaped whiteout (a regular file carrying the +/// `trusted.overlay.overlay.whiteout` xattr added by the V1 writer). +/// +/// C composefs v1.0.8 converts char-device-rdev-0 entries to regular files +/// on write (whiteout escaping). The reader must reverse this. +fn is_escaped_v1_whiteout(img: &Image, inode: &InodeType) -> anyhow::Result { + // Only relevant for regular files + let mode = inode.mode().0.get(); + if mode & S_IFMT != S_IFREG { + return Ok(false); + } + + let Some(xattrs_section) = inode.xattrs()? else { + return Ok(false); + }; + + // Check shared xattrs + for id in xattrs_section.shared()? { + let xattr = img.shared_xattr(id.get())?; + let full_name = construct_xattr_name(xattr)?; + if full_name == format::XATTR_OVERLAY_WHITEOUT { + return Ok(true); + } + } + // Check local xattrs + for xattr in xattrs_section.local()? { + let xattr = xattr?; + let full_name = construct_xattr_name(xattr)?; + if full_name == format::XATTR_OVERLAY_WHITEOUT { + return Ok(true); + } + } + Ok(false) +} + // TODO: there must be an easier way... #[derive(FromBytes, Immutable, KnownLayout)] #[repr(C)] @@ -1041,6 +1399,7 @@ impl ObjectCollector { /// Returns a set of all referenced object IDs. 
pub fn collect_objects(image: &[u8]) -> ReadResult> { let img = Image::open(image)?.restrict_to_composefs()?; + img.fsck_metadata()?; let mut this = ObjectCollector { visited_nids: HashSet::new(), nids_to_visit: BTreeSet::new(), @@ -1078,21 +1437,23 @@ fn construct_xattr_name(xattr: &XAttr) -> Result, ErofsReaderError> { /// - Strips `trusted.overlay.metacopy` and `trusted.overlay.redirect` /// - Unescapes `trusted.overlay.overlay.X` back to `trusted.overlay.X` fn stat_from_inode_for_tree(img: &Image, inode: &InodeType) -> anyhow::Result { - let (st_mode, st_uid, st_gid, st_mtim_sec) = match inode { + let (st_mode, st_uid, st_gid, st_mtim_sec, st_mtim_nsec) = match inode { InodeType::Compact(inode) => ( inode.header.mode.0.get() as u32 & 0o7777, inode.header.uid.get() as u32, inode.header.gid.get() as u32, - // Compact inodes don't store mtime; the writer uses build_time - // but for round-trip purposes, 0 matches what was written for - // compact headers (the writer always uses ExtendedInodeHeader) - 0i64, + // Compact inodes don't store mtime; use superblock build_time + // (the writer sets build_time = min mtime across all inodes) + img.sb.build_time.get() as i64, + // and build_time_nsec for the nanosecond component + img.sb.build_time_nsec.get(), ), InodeType::Extended(inode) => ( inode.header.mode.0.get() as u32 & 0o7777, inode.header.uid.get(), inode.header.gid.get(), inode.header.mtime.get() as i64, + inode.header.mtime_nsec.get(), ), }; @@ -1120,6 +1481,7 @@ fn stat_from_inode_for_tree(img: &Image, inode: &InodeType) -> anyhow::Result anyhow::Result anyhow::Result, Box<[u8]>)>> { let full_name = construct_xattr_name(xattr)?; - // Skip internal overlay xattrs added by the writer - if full_name == b"trusted.overlay.metacopy" || full_name == b"trusted.overlay.redirect" { + // Skip internal overlay xattrs added by the writer (metacopy/redirect + // are composefs-internal and should not be exposed to readers). 
+ if full_name == format::XATTR_OVERLAY_METACOPY || full_name == format::XATTR_OVERLAY_REDIRECT { + return Ok(None); + } + + // V1 whiteout escaping artifacts: strip these internal xattrs. + // XATTR_OVERLAY_WHITEOUT signals the inode is a whiteout (handled separately). + // The *_WHITEOUTS, *_OPAQUE, and user-namespace variants are parent-dir markers + // added by the V1 writer that are composefs-internal. + // Note: XATTR_OVERLAY_OPAQUE must be listed explicitly here because the general + // unescape handler below would otherwise expose it as trusted.overlay.opaque. + if full_name == format::XATTR_OVERLAY_WHITEOUT + || full_name == format::XATTR_OVERLAY_WHITEOUTS + || full_name == format::XATTR_OVERLAY_OPAQUE + || full_name == format::XATTR_USERXATTR_WHITEOUT + || full_name == format::XATTR_USERXATTR_WHITEOUTS + || full_name == format::XATTR_USERXATTR_OPAQUE + { return Ok(None); } // Unescape: trusted.overlay.overlay.X -> trusted.overlay.X - if let Some(rest) = full_name.strip_prefix(b"trusted.overlay.overlay.") { - let mut unescaped = b"trusted.overlay.".to_vec(); + if let Some(rest) = full_name.strip_prefix(format::XATTR_OVERLAY_ESCAPED_PREFIX) { + let mut unescaped = format::XATTR_OVERLAY_PREFIX.to_vec(); unescaped.extend_from_slice(rest); let name = Box::from(OsStr::from_bytes(&unescaped)); let value = Box::from(xattr.value()?); return Ok(Some((name, value))); } // Skip all other trusted.overlay.* xattrs (internal to composefs) - if full_name.starts_with(b"trusted.overlay.") { + if full_name.starts_with(format::XATTR_OVERLAY_PREFIX) { return Ok(None); } @@ -1393,6 +1772,25 @@ fn populate_directory( let name = OsStr::from_bytes(name_bytes); let child_inode = img.inode(nid)?; + // Skip overlay whiteout entries — but only in the root directory. + // C composefs only skips hex-named (00–ff) chardev(0,0) entries in root + // (lcfs-writer-erofs.c: "Skip real whiteouts (00-ff)"). + // A chardev(0,0) in a subdirectory is a legitimate device node. 
+ // + // In V1 images the writer escapes whiteouts to regular files with + // trusted.overlay.overlay.whiteout xattr, so we must check both + // the native chardev form and the escaped regular-file form. + let is_root_dir = dir_nid == img.sb.root_nid.get() as u64; + let is_escaped_whiteout = is_escaped_v1_whiteout(img, &child_inode)?; + let is_native_whiteout = child_inode.is_whiteout(); + if is_root_dir + && (is_native_whiteout || is_escaped_whiteout) + && name_bytes.len() == 2 + && name_bytes.iter().all(|b| b.is_ascii_hexdigit()) + { + continue; + } + if child_inode.mode().is_dir() { n_subdirs = n_subdirs .checked_add(1) @@ -1427,7 +1825,14 @@ fn populate_directory( let content = match file_type { S_IFREG => { - if let Some(digest) = extract_metacopy_digest::(img, &child_inode)? { + // V1 images escape whiteouts (char dev rdev=0) to regular files. + // The is_escaped_whiteout flag was computed above (before the + // root-dir skip check), so reuse it here. + if is_escaped_whiteout { + tree::LeafContent::CharacterDevice(0) + } else if let Some(digest) = + extract_metacopy_digest::(img, &child_inode)? + { tree::LeafContent::Regular(tree::RegularFile::External( digest, child_inode.size(), @@ -1468,10 +1873,19 @@ fn populate_directory( _ => anyhow::bail!("unknown file type {:#o} for {:?}", file_type, name), }; + // Hardlinked whiteouts are semantically invalid: a whiteout represents the + // absence of a file in an overlay, so nlink > 1 is meaningless. 
+ let on_disk_nlink = child_inode.nlink(); + if matches!(content, tree::LeafContent::CharacterDevice(0)) && on_disk_nlink > 1 { + anyhow::bail!( + "invalid composefs image: whiteout inode {:?} has nlink > 1", + name + ); + } + let leaf_id = builder.push_leaf(stat, content); // Track for hardlink detection if nlink > 1 - let on_disk_nlink = child_inode.nlink(); if on_disk_nlink > 1 { builder.hardlinks.insert(nid, leaf_id); } @@ -1572,7 +1986,7 @@ mod tests { use super::*; use crate::{ dumpfile::{dumpfile_to_filesystem, write_dumpfile}, - erofs::writer::mkfs_erofs, + erofs::writer::{ValidatedFileSystem, mkfs_erofs}, fsverity::Sha256HashValue, }; use std::collections::HashMap; @@ -1653,7 +2067,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); // Root should have . and .. and empty_dir @@ -1698,7 +2112,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); // Find dir1 @@ -1743,7 +2157,7 @@ mod tests { } let fs = dumpfile_to_filesystem::(&dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); // Find bigdir @@ -1793,7 +2207,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); // Navigate through the structure @@ -1831,7 +2245,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); let root_inode = img.root().unwrap(); @@ 
-1877,7 +2291,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); // This should traverse all directories without error let result = collect_objects::(&image); @@ -1953,7 +2367,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); // Verify root entries @@ -2000,7 +2414,7 @@ mod tests { write_dumpfile(&mut orig_output, &fs_orig).unwrap(); let orig_str = String::from_utf8(orig_output).unwrap(); - let image = mkfs_erofs(&fs_orig); + let image = mkfs_erofs(&mut ValidatedFileSystem::new(fs_orig).unwrap()); let fs_rt = erofs_to_filesystem::(&image).unwrap(); let mut rt_output = Vec::new(); @@ -2105,7 +2519,8 @@ mod tests { "#; let fs_orig = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs_orig); + let mut vfs_orig = ValidatedFileSystem::new(fs_orig).unwrap(); + let image = mkfs_erofs(&mut vfs_orig); let fs_rt = erofs_to_filesystem::(&image).unwrap(); // Verify hardlink sharing via LeafId @@ -2120,7 +2535,7 @@ mod tests { // Verify dumpfile round-trips correctly let mut orig_output = Vec::new(); - write_dumpfile(&mut orig_output, &fs_orig).unwrap(); + write_dumpfile(&mut orig_output, &vfs_orig.0).unwrap(); let orig_str = String::from_utf8(orig_output).unwrap(); let mut rt_output = Vec::new(); @@ -2149,7 +2564,7 @@ mod tests { // Build a minimal valid composefs image (just a root directory). let dumpfile = "/ 0 40755 2 0 0 0 1000.0 - - -\n"; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let base_image = mkfs_erofs(&fs); + let base_image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); // Sanity: the unmodified image passes restrict_to_composefs(). 
Image::open(&base_image) @@ -2278,7 +2693,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let base_image = mkfs_erofs(&fs); + let base_image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); // Sanity: unmodified image round-trips fine erofs_to_filesystem::(&base_image) @@ -2335,9 +2750,14 @@ mod tests { for case in &cases { let mut image = base_image.clone(); - let offset = inline_offset + case.entry_byte_offset; + let entry_start = inline_offset + case.entry_byte_offset; // Write a bogus nid (0xDEAD) that doesn't match the directory's own nid - image[offset..offset + 8].copy_from_slice(&0xDEADu64.to_le_bytes()); + // Use zerocopy to get a typed &mut DirectoryEntryHeader instead of raw bytes. + let hdr = DirectoryEntryHeader::mut_from_bytes( + &mut image[entry_start..entry_start + size_of::()], + ) + .expect("entry slice must be a valid DirectoryEntryHeader"); + hdr.inode_offset = zerocopy::little_endian::U64::new(0xDEAD); let result = erofs_to_filesystem::(&image); let err = result.expect_err(&format!("{}: should have been rejected", case.name)); @@ -2369,7 +2789,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let base_image = mkfs_erofs(&fs); + let base_image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); // Sanity check erofs_to_filesystem::(&base_image) @@ -2380,22 +2800,26 @@ mod tests { let root_nid = img.sb.root_nid.get() as u64; let file_nid = img.find_child_nid(root_nid, b"file").unwrap().unwrap(); - // Compute byte offset of the file's inode in the image - let block_size = img.block_size; - let meta_start = img.sb.meta_blkaddr.get() as usize * block_size; - let inode_byte_offset = meta_start + file_nid as usize * 32; - let is_extended = base_image[inode_byte_offset] & 1 != 0; + // Use the typed Image API to locate the inode slot without raw byte arithmetic. 
+ let inode = img.inode(file_nid).unwrap(); + let is_extended = matches!(inode, InodeType::Extended(_)); + let inodes_start = img.image.len() - img.inodes.len(); + let inode_slot_start = inodes_start + file_nid as usize * 32; + drop(inode); drop(img); let mut image = base_image.clone(); + let slot = &mut image[inode_slot_start..]; if is_extended { - // ExtendedInodeHeader.nlink is U32 at byte offset 44 - let nlink_offset = inode_byte_offset + 44; - image[nlink_offset..nlink_offset + 4].copy_from_slice(&5u32.to_le_bytes()); + let hdr = + ExtendedInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid ExtendedInodeHeader"); + hdr.nlink = zerocopy::little_endian::U32::new(5); } else { - // CompactInodeHeader.nlink is U16 at byte offset 6 - let nlink_offset = inode_byte_offset + 6; - image[nlink_offset..nlink_offset + 2].copy_from_slice(&5u16.to_le_bytes()); + let hdr = + CompactInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid CompactInodeHeader"); + hdr.nlink = zerocopy::little_endian::U16::new(5); } let result = erofs_to_filesystem::(&image); @@ -2421,7 +2845,7 @@ mod tests { "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let base_image = mkfs_erofs(&fs); + let base_image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); // Sanity check erofs_to_filesystem::(&base_image) @@ -2432,21 +2856,26 @@ mod tests { let root_nid = img.sb.root_nid.get() as u64; let dir_nid = img.find_child_nid(root_nid, b"dir").unwrap().unwrap(); - let block_size = img.block_size; - let meta_start = img.sb.meta_blkaddr.get() as usize * block_size; - let inode_byte_offset = meta_start + dir_nid as usize * 32; - let is_extended = base_image[inode_byte_offset] & 1 != 0; + // Use the typed Image API to locate the inode slot without raw byte arithmetic. 
+ let inode = img.inode(dir_nid).unwrap(); + let is_extended = matches!(inode, InodeType::Extended(_)); + let inodes_start = img.image.len() - img.inodes.len(); + let inode_slot_start = inodes_start + dir_nid as usize * 32; + drop(inode); drop(img); let mut image = base_image.clone(); + let slot = &mut image[inode_slot_start..]; if is_extended { - // ExtendedInodeHeader.nlink is U32 at byte offset 44 - let nlink_offset = inode_byte_offset + 44; - image[nlink_offset..nlink_offset + 4].copy_from_slice(&99u32.to_le_bytes()); + let hdr = + ExtendedInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid ExtendedInodeHeader"); + hdr.nlink = zerocopy::little_endian::U32::new(99); } else { - // CompactInodeHeader.nlink is U16 at byte offset 6 - let nlink_offset = inode_byte_offset + 6; - image[nlink_offset..nlink_offset + 2].copy_from_slice(&99u16.to_le_bytes()); + let hdr = + CompactInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid CompactInodeHeader"); + hdr.nlink = zerocopy::little_endian::U16::new(99); } let result = erofs_to_filesystem::(&image); @@ -2471,30 +2900,35 @@ mod tests { // stays the same and the inode still parses successfully. let dumpfile = "/ 0 40755 1 0 0 0 0.0 - - -\n"; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let mut image = mkfs_erofs(&fs); + let mut image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); let img = Image::open(&image).unwrap(); - let root_nid = img.sb.root_nid.get() as usize; + let root_nid = img.sb.root_nid.get() as u64; let block_size = img.block_size; - let meta_start = img.sb.meta_blkaddr.get() as usize * block_size; - let inode_offset = meta_start + root_nid * 32; - // Determine inode layout from the first byte - let is_extended = image[inode_offset] & 1 != 0; + + // Use the typed Image API to locate the inode slot without raw byte arithmetic. 
+ let inode = img.inode(root_nid).unwrap(); + let is_extended = matches!(inode, InodeType::Extended(_)); + let inodes_start = img.image.len() - img.inodes.len(); + let inode_slot_start = inodes_start + root_nid as usize * 32; + drop(inode); drop(img); // Use a huge size that is a multiple of block_size (4096) so inline // tail size stays 0 and the inode remains parseable. let huge_size: u64 = (block_size as u64) * 1_000_000_000; + let slot = &mut image[inode_slot_start..]; if is_extended { - // ExtendedInodeHeader.size is a U64 at byte offset 8 - let size_offset = inode_offset + 8; - image[size_offset..size_offset + 8].copy_from_slice(&huge_size.to_le_bytes()); + let hdr = + ExtendedInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid ExtendedInodeHeader"); + hdr.size = zerocopy::little_endian::U64::new(huge_size); } else { - // CompactInodeHeader.size is a U32 at byte offset 8 - let size_offset = inode_offset + 8; - let truncated = huge_size as u32; - image[size_offset..size_offset + 4].copy_from_slice(&truncated.to_le_bytes()); + let hdr = + CompactInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid CompactInodeHeader"); + hdr.size = zerocopy::little_endian::U32::new(huge_size as u32); } let img = Image::open(&image).unwrap(); @@ -2510,43 +2944,552 @@ mod tests { mod proptest_tests { use super::*; + use crate::erofs::{format::FormatVersion, writer::mkfs_erofs_versioned}; use crate::fsverity::Sha512HashValue; - use crate::test::proptest_strategies::{build_filesystem, filesystem_spec}; + use crate::test::proptest_strategies::{ + FsSpec, build_filesystem, build_unusual_filesystem, filesystem_spec, + unusual_filesystem_spec, + }; use proptest::prelude::*; - /// Round-trip a FileSystem through erofs and compare dumpfile output. 
- fn round_trip_filesystem( - fs_orig: &tree::FileSystem, - ) { - let mut orig_output = Vec::new(); - write_dumpfile(&mut orig_output, fs_orig).unwrap(); - - let image = mkfs_erofs(fs_orig); + /// Round-trip a FileSystem through V2 erofs and compare dumpfile output. + /// + /// V2 EROFS does not store mtime nanoseconds: the on-disk `mtime_nsec` + /// field is always zero. Build the expected dumpfile from a copy of the + /// filesystem with `mtime_nsec` zeroed so the comparison reflects what + /// V2 actually stores, not what the in-memory tree carries. + fn round_trip_filesystem(spec: FsSpec) { + // fs_write → source for the EROFS image. + // fs_expected → reference with mtime_nsec=0, matching V2 on-disk format. + let fs_write = build_filesystem::(spec.clone()); + let mut fs_expected = build_filesystem::(spec); + // V2 EROFS does not store mtime nanoseconds; zero them before comparing. + fs_expected.for_each_stat_mut(|s| s.st_mtim_nsec = 0); + + let mut expected_output = Vec::new(); + write_dumpfile(&mut expected_output, &fs_expected).unwrap(); + + let image = mkfs_erofs(&mut ValidatedFileSystem::new(fs_write).unwrap()); let fs_rt = erofs_to_filesystem::(&image).unwrap(); let mut rt_output = Vec::new(); write_dumpfile(&mut rt_output, &fs_rt).unwrap(); similar_asserts::assert_eq!( - String::from_utf8_lossy(&orig_output), + String::from_utf8_lossy(&expected_output), String::from_utf8_lossy(&rt_output) ); } + /// Round-trip a FileSystem through V1 erofs and compare dumpfile output. + /// + /// V1 uses compact inodes (when mtime matches the minimum), BFS ordering, + /// and includes overlay whiteout character device entries in the root. + /// The writer adds `trusted.overlay.opaque` to the root; the reader strips + /// internal overlay xattrs. Whiteout char-device entries (00–ff in root) + /// are also stripped, matching C composefs reader behaviour. 
+ fn round_trip_filesystem_v1(spec: FsSpec) { + // Build two separate filesystems from the same spec so we avoid + // Rc::strong_count issues from sharing leaf Rcs. + let fs_write = build_filesystem::(spec.clone()); + let fs_expected = build_filesystem::(spec); + + // The writer internally adds trusted.overlay.opaque=y to root and + // the 256 V1 whiteout stubs; the reader drops those root stubs and + // strips all trusted.overlay.* xattrs that aren't escaped user + // xattrs. So the expected filesystem should NOT have either of them. + + // Generate the V1 image from the write filesystem. + let image = mkfs_erofs_versioned( + &mut ValidatedFileSystem::new(fs_write).unwrap(), + FormatVersion::V1, + ); + + // Validate the layout invariant: no FlatInline inode should + // trigger EUCLEAN on kernels < 6.12. This catches the + // block-boundary bug even when proptest doesn't generate a + // case large enough to trip it at mount time. + Image::open(&image) + .unwrap() + .fsck_metadata() + .expect("V1 image should have valid inline layout for pre-6.12 kernels"); + + // Read back from the image. + let fs_rt = erofs_to_filesystem::(&image).unwrap(); + + // Compare via dumpfile serialization. + let mut expected_output = Vec::new(); + write_dumpfile(&mut expected_output, &fs_expected).unwrap(); + + let mut rt_output = Vec::new(); + write_dumpfile(&mut rt_output, &fs_rt).unwrap(); + + if expected_output != rt_output { + let expected_str = String::from_utf8_lossy(&expected_output); + let rt_str = String::from_utf8_lossy(&rt_output); + panic!( + "V1 round-trip mismatch:\n--- expected ---\n{expected_str}\n--- got ---\n{rt_str}" + ); + } + } + + /// Verify that C composefs-info can parse an EROFS image we generated, + /// and that its dump output matches our Rust reader's interpretation. + /// + /// This is the critical compatibility test: it proves that EROFS images + /// produced by our writer are consumable by the C implementation.
+ fn verify_c_composefs_info_reads_image(image: &[u8]) { + use std::io::Write; + + // Validate layout invariant before testing C reader compatibility. + Image::open(image) + .unwrap() + .fsck_metadata() + .expect("image should have valid inline layout for pre-6.12 kernels"); + + // Write image to a tempfile + let mut tmp = tempfile::NamedTempFile::new().unwrap(); + tmp.write_all(image).unwrap(); + tmp.flush().unwrap(); + + // Run C composefs-info dump on the image with a timeout. + let child = std::process::Command::new("composefs-info") + .arg("dump") + .arg(tmp.path()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .unwrap(); + + let output = { + let (tx, rx) = std::sync::mpsc::channel(); + std::thread::spawn(move || { + let _ = tx.send(child.wait_with_output()); + }); + rx.recv_timeout(std::time::Duration::from_secs(10)) + .expect("composefs-info timed out after 10 seconds") + .unwrap() + }; + + if !output.status.success() { + panic!( + "C composefs-info dump failed (exit {:?}):\nstderr: {}", + output.status.code(), + String::from_utf8_lossy(&output.stderr), + ); + } + + let c_dump = String::from_utf8(output.stdout).expect("C dump should be valid UTF-8"); + + // Get our Rust reader's interpretation of the same image + let fs_rt = erofs_to_filesystem::(image).unwrap(); + let mut rust_dump_bytes = Vec::new(); + write_dumpfile(&mut rust_dump_bytes, &fs_rt).unwrap(); + let rust_dump = String::from_utf8(rust_dump_bytes).unwrap(); + + // Parse both dumps into structured entries, then normalize and + // compare. This avoids fragile string munging and lets the + // dumpfile parser handle escaping, field splitting, etc. + // + // Apply the C reader empty-xattr workaround to the Rust dump as + // well: we are testing C-reader compatibility here, so we strip + // the same entries C would silently drop. 
Rust-only round-trip + // tests (test_erofs_round_trip_*) compare dumpfiles directly + // without this workaround, catching Rust writer bugs without masking them. + let c_entries = parse_c_dump(&c_dump); + let rust_entries = parse_c_dump(&rust_dump); + + similar_asserts::assert_eq!(c_entries, rust_entries); + } + + /// Parse a dump (C composefs-info or Rust reader output) and normalize for comparison. + /// + /// Applies the empty-xattr workaround for the known C reader bug: the + /// inline-xattr loop uses strict `<` instead of `<=` when checking the + /// end pointer, so it silently skips the last entry whenever it is exactly + /// 4 bytes (header only: name_len=0, value_size=0). This occurs for + /// system.posix_acl_access/default with empty values, where the prefix + /// index encodes the full key leaving a zero-length suffix. + fn parse_c_dump(dump: &str) -> Vec { + normalize_dump(dump, true) + } + + /// Parse each non-empty dump line and return the normalized entries as strings. + /// + /// Always drops chardev(0,0) entries and the `user.overlay.opaque` / + /// `trusted.overlay.opaque` xattrs. When `strip_empty_xattrs` is true, xattrs + /// with an empty value are dropped as well (the C reader workaround). + /// + /// For C compat tests, use [`parse_c_dump`] on both sides so the + /// comparison accounts for the known C reader limitation.
+ + fn normalize_dump(dump: &str, strip_empty_xattrs: bool) -> Vec { + use crate::dumpfile_parse::{Entry, Item}; + use std::os::unix::ffi::OsStrExt; + + dump.lines() + .filter(|line| !line.is_empty()) + .filter_map(|line| { + let mut entry = Entry::parse(line).unwrap_or_else(|e| { + panic!("Failed to parse dump line: {e}\n line: {line}") + }); + + // C composefs-info (lcfs_build_node_from_image) unconditionally + // treats any chardev with rdev=0 as a whiteout and skips it, + // returning ENOTSUP regardless of where in the tree it appears: + // + // if (type == S_IFCHR && node->inode.st_rdev == 0) { + // errno = ENOTSUP; + // return NULL; + // } + // + // Our Rust reader preserves chardev(0,0) entries in subdirectories + // (it only strips the root-level 00–ff overlay whiteout stubs). + // Strip all chardev(0,0) entries from both sides of the comparison + // so the test reflects what C actually outputs. + if let Item::Device { rdev: 0, .. } = entry.item { + if (entry.mode & 0o170000) == 0o20000 { + return None; + } + } + + if strip_empty_xattrs { + entry.xattrs.retain(|x| !x.value.is_empty()); + } + // Strip overlay xattrs that the C reader keeps but our Rust reader + // strips as composefs-internal: + // - user.overlay.opaque: OVERLAY_XATTR_USERXATTR_OPAQUE, kept by C + // - trusted.overlay.opaque: the C reader unescapes + // trusted.overlay.overlay.opaque to this; Rust strips the + // escaped form before unescaping so it never appears in Rust + // output. Normalizing both sides makes the comparison test + // semantic content rather than internal overlay state. + entry.xattrs.retain(|x| { + x.key.as_bytes() != b"user.overlay.opaque" + && x.key.as_bytes() != b"trusted.overlay.opaque" + }); + Some(entry.to_string()) + }) + .collect() + } + proptest! 
{ - #![proptest_config(ProptestConfig::with_cases(64))] + #![proptest_config(ProptestConfig::with_cases(200))] #[test] fn test_erofs_round_trip_sha256(spec in filesystem_spec()) { - let fs = build_filesystem::(spec); - round_trip_filesystem(&fs); + round_trip_filesystem::(spec); } #[test] fn test_erofs_round_trip_sha512(spec in filesystem_spec()) { - let fs = build_filesystem::(spec); - round_trip_filesystem(&fs); + round_trip_filesystem::(spec); + } + + #[test] + fn test_erofs_round_trip_v1_sha256(spec in filesystem_spec()) { + round_trip_filesystem_v1::(spec); + } + + #[test] + fn test_erofs_round_trip_v1_sha512(spec in filesystem_spec()) { + round_trip_filesystem_v1::(spec); + } + + } + + /// Verify C composefs-info can parse random V1 (C-compatible) EROFS + /// images generated by our writer, and that its dump output matches + /// our Rust reader's interpretation. + #[test_with::executable(composefs-info)] + #[test] + fn test_c_composefs_info_reads_v1() { + let mut runner = + proptest::test_runner::TestRunner::new(ProptestConfig::with_cases(200)); + runner + .run(&filesystem_spec(), |spec| { + let fs = build_filesystem::(spec); + let image = mkfs_erofs_versioned( + &mut ValidatedFileSystem::new(fs).unwrap(), + FormatVersion::V1, + ); + verify_c_composefs_info_reads_image(&image); + Ok(()) + }) + .unwrap(); + } + + /// Verify C composefs-info can parse random V2 (Rust-native) EROFS + /// images generated by our writer. 
+ #[test_with::executable(composefs-info)] + #[test] + fn test_c_composefs_info_reads_v2() { + let mut runner = + proptest::test_runner::TestRunner::new(ProptestConfig::with_cases(200)); + runner + .run(&filesystem_spec(), |spec| { + let fs = build_filesystem::(spec); + let image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); + verify_c_composefs_info_reads_image(&image); + Ok(()) + }) + .unwrap(); + } + + /// Verify C composefs-info can parse random V2 EROFS images generated from + /// unusual content (whiteout escaping, ACLs, multiple overlay xattrs, large + /// external files, cross-type hardlinks), and that its dump output matches + /// our Rust reader's interpretation. + /// + /// Mirrors `test_v1_binary_identical_unusual_content` but for V2 images + /// where byte-for-byte C identity is not the goal (V2 is Rust-native); + /// instead we verify semantic equivalence via normalized dump comparison. + #[test_with::executable(composefs-info)] + #[test] + fn test_c_composefs_info_reads_v2_unusual() { + let mut runner = + proptest::test_runner::TestRunner::new(ProptestConfig::with_cases(200)); + runner + .run(&unusual_filesystem_spec(), |spec| { + let fs = build_unusual_filesystem::(spec); + let image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); + verify_c_composefs_info_reads_image(&image); + Ok(()) + }) + .unwrap(); + } + + /// Run `debug_img` on an image and return the structured dump as a String. + fn debug_dump(image: &[u8]) -> String { + use crate::erofs::debug::debug_img; + let mut out = Vec::new(); + debug_img(&mut out, image).expect("debug_img failed"); + String::from_utf8(out).expect("debug_img produced non-UTF8") + } + + /// Diff two debug dumps, returning a unified-diff-style string of the differences. 
+ fn diff_debug_dumps(label_a: &str, a: &str, label_b: &str, b: &str) -> String { + use std::fmt::Write; + let a_lines: Vec<&str> = a.lines().collect(); + let b_lines: Vec<&str> = b.lines().collect(); + let mut out = String::new(); + let max = a_lines.len().max(b_lines.len()); + let mut diffs = 0usize; + for i in 0..max { + let la = a_lines.get(i).copied().unwrap_or(""); + let lb = b_lines.get(i).copied().unwrap_or(""); + if la != lb { + diffs += 1; + if diffs <= 40 { + writeln!(out, "line {i}:").unwrap(); + writeln!(out, " {label_a}: {la}").unwrap(); + writeln!(out, " {label_b}: {lb}").unwrap(); + } + } + } + if diffs > 40 { + writeln!(out, "... and {} more differing lines", diffs - 40).unwrap(); } + if diffs == 0 { + out.push_str("(no differences)"); + } + out + } + + /// Run C `mkcomposefs --from-file -` on a dumpfile string and return the raw image bytes. + fn c_mkcomposefs_from_dumpfile(dumpfile: &str) -> Vec { + use std::io::{Read, Seek, SeekFrom, Write}; + // Write dumpfile to a tempfile + let mut tf = tempfile::tempfile().unwrap(); + tf.write_all(dumpfile.as_bytes()).unwrap(); + tf.seek(SeekFrom::Start(0)).unwrap(); + // Run mkcomposefs --from-file - - + let out_tf = tempfile::tempfile().unwrap(); + let mut child = std::process::Command::new("mkcomposefs") + .args(["--from-file", "-", "-"]) + .stdin(std::process::Stdio::from(tf)) + .stdout(std::process::Stdio::from(out_tf.try_clone().unwrap())) + .stderr(std::process::Stdio::inherit()) + .spawn() + .expect("failed to spawn mkcomposefs"); + let status = child.wait().unwrap(); + assert!(status.success(), "mkcomposefs failed: {status}"); + let mut out_tf = out_tf; + out_tf.seek(SeekFrom::Start(0)).unwrap(); + let mut bytes = Vec::new(); + out_tf.read_to_end(&mut bytes).unwrap(); + bytes + } + + /// Verify that our Rust V1 writer produces byte-for-byte identical EROFS images + /// to C mkcomposefs for the same user-level input. 
+ /// + /// This is a stronger check than `test_c_composefs_info_reads_v1`: instead of + /// comparing parsed dump output (which won't catch wrong binary layout like the + /// EUCLEAN block-boundary bug), we compare raw image bytes. If our V1 writer + /// disagrees with the C reference even on a single padding byte, this fails. + /// + /// The test mirrors the production flow: C receives a dumpfile of the user-level + /// tree (no whiteout stubs) and adds the 256 stubs internally; the Rust V1 writer + /// also adds the stubs automatically during image generation. + /// + /// On failure the structural diff from `debug_img` is printed to make the + /// divergence immediately obvious without a separate manual step. + #[test_with::executable(mkcomposefs)] + #[test] + fn test_v1_binary_identical_to_c_mkcomposefs() { + let mut runner = + proptest::test_runner::TestRunner::new(ProptestConfig::with_cases(200)); + runner + .run(&filesystem_spec(), |spec| { + // Build two independent filesystems from the same spec: + // fs_c — user entries only, serialized as dumpfile and fed to + // C mkcomposefs (which adds the 256 whiteout stubs internally) + // fs_rs — user entries only, fed directly to our Rust V1 writer + // (the writer adds the 256 whiteout stubs automatically) + // + // Using the same spec for both ensures the user-level content matches. 
+ let fs_c = build_filesystem::(spec.clone()); + let fs_rs = build_filesystem::(spec); + + // Serialize the pre-whiteout tree for C (no stubs in dumpfile) + let mut dumpfile_bytes = Vec::new(); + write_dumpfile(&mut dumpfile_bytes, &fs_c).unwrap(); + let dumpfile = String::from_utf8(dumpfile_bytes).unwrap(); + + // Get C mkcomposefs binary output (C adds stubs internally) + let c_image = c_mkcomposefs_from_dumpfile(&dumpfile); + + // Get our Rust V0 writer binary output (stubs added automatically by writer) + let rust_image = mkfs_erofs_versioned( + &mut ValidatedFileSystem::new(fs_rs).unwrap(), + FormatVersion::V0, + ); + + if c_image != rust_image.as_ref() { + let c_debug = debug_dump(&c_image); + let rust_debug = debug_dump(&rust_image); + similar_asserts::assert_eq!( + c_debug, + rust_debug, + "binary mismatch (c={} bytes, rust={} bytes)\ndumpfile:\n{dumpfile}", + c_image.len(), + rust_image.len(), + ); + } + Ok(()) + }) + .unwrap(); + } + + /// Binary-compatibility test using the unusual-content generator. + /// + /// Covers corner cases in the V1 writer that the ordinary random generator almost + /// never exercises: whiteout escaping, multiple trusted.overlay.* xattrs per inode, + /// system.posix_acl_access (HAS_ACL flag), large external file sizes, and + /// cross-type hardlinks (to symlinks, whiteouts, devices, FIFOs). + /// + /// Runs 200 cases against C mkcomposefs byte-for-byte.
+ #[test_with::executable(mkcomposefs)] + #[test] + fn test_v1_binary_identical_unusual_content() { + let mut runner = + proptest::test_runner::TestRunner::new(ProptestConfig::with_cases(200)); + runner + .run(&unusual_filesystem_spec(), |spec| { + let fs_c = build_unusual_filesystem::(spec.clone()); + let fs_rs = build_unusual_filesystem::(spec); + + let mut dumpfile_bytes = Vec::new(); + write_dumpfile(&mut dumpfile_bytes, &fs_c).unwrap(); + let dumpfile = String::from_utf8(dumpfile_bytes).unwrap(); + + let c_image = c_mkcomposefs_from_dumpfile(&dumpfile); + let rust_image = mkfs_erofs_versioned( + &mut ValidatedFileSystem::new(fs_rs).unwrap(), + FormatVersion::V0, + ); + + if c_image != rust_image.as_ref() { + let c_debug = debug_dump(&c_image); + let rust_debug = debug_dump(&rust_image); + similar_asserts::assert_eq!( + c_debug, + rust_debug, + "binary mismatch (c={} bytes, rust={} bytes)\ndumpfile:\n{dumpfile}", + c_image.len(), + rust_image.len(), + ); + } + Ok(()) + }) + .unwrap(); + } + + /// Diagnostic: dump the structural diff between C mkcomposefs and our Rust V1 + /// writer for a known-failing minimal case (large flat directory, no xattrs). + /// + /// This test is `#[ignore]` — run it manually with: + /// cargo test -p composefs --lib -- erofs::reader::tests::proptest_tests::test_v1_binary_diff_diagnostic --ignored --nocapture + /// + /// It uses `debug_img` (our injective EROFS structure dumper) to show exactly + /// which fields diverge between the two images, making it easy to pinpoint + /// the bug in the writer without manually parsing hex dumps. + #[test_with::executable(mkcomposefs)] + #[test] + #[ignore] + fn test_v1_binary_diff_diagnostic() { + // Known-failing proptest case: use the exact dumpfile from a proptest failure. 
+ // The flow matches the proptest exactly: + // - fs_c is built from spec and serialized to dumpfile (no stubs) for C + // - fs_rs is built from the same spec and fed to the Rust V1 writer + // (which adds the 256 whiteout stubs automatically) + let dumpfile = "\ +/ 0 40000 3 0 0 0 0.0 - - -\n\ +/B 0 47123 2 32924 6322 0 334277904.419157028 - - - user.test_3=\\x14\\x11\\xf5\\xbe\\xf0\\x1f\\x15<\\\\\\x84Gu(\\x17T\\xdb\\xca\\xd5\n\ +/B/\\x06\\xc3} 43 102747 1 14780 50024 0 1909128638.32940851 - X\\xb8\\xac\\xf9[\\x8br\\x1a\\x11\\xed\\x96]\\x9c\\xed\\xba\\x8f\\x13\\xcc/i\\x12\\x7fE\\x18\\xf8n\\xaeV_E\\x8bS]x\\x93/g\\x92\\x0f?\\xd8\\xf4\\xf5 - security.capability=r\\x93\\x84\\x18M user.test_3=&+\\xf2\\xee\\x89sz user.test_4=\n\ +/B/\\x1f\\xe3\\x17\\xcb\\xe9\\x81\\x9aT\\xd2\\x13\\x19\\xf2\\xaf\\xee\\x20\\xba\\xb3 43 102274 1 41061 21812 0 446804811.557100600 - <\\x10@Z\\x00\\xc5\\xf9\\xca\\xe1=\\xfc\\xe0\\x81)p\\xa4\\x9f\\xa8\\x18+\\x88\\x0e\\xc3\\xa2\\xdf0\\x82*\\xc2q[x\\x86\\x88\\x80\\xf1]b$\\\\\\x1f]\\xeb - system.posix_acl_access= trusted.test_0=\\x92 trusted.test_2=\\\\\\xec\\x83\\x89\\x85\"\\xf9\\x9b\\xbc\\xa5\\xb0\\xef\\xbcC\\xe8Z\\x88F\\x83\\x17 user.test_1=\\xc4\\xc1\\x08\\xff\\xfa\\xd3\\xed\\xad\\x9bS6f\\tS\\x8d\n\ +/B/#\\xcd\\x17\\xb2\\xf0\\x03g\\xea\\x87iI\\xe3{_\\xe1 7 100554 1 50668 49879 0 1545457558.133147722 - \\xb6\\xa1$?\\xd2:\\xb9 - system.posix_acl_default=\\x97\\xde\\xd1S;,; user.test_4=\\xf7\\x82S\\xa5\\xc3,?\\x98\\x84p\\xbf\\x14&\\x91+\\x8e\\xdb\n\ +/B/3\\xf4\\xf5\\xc2e\\x07\\xb5\\xacC\\xa1 45 106705 1 56683 56444 0 1577642975.579080132 - \\xdf[\\x83j\\x1e\\x99\\xd8\\xc0[\\x8ba\\xc0f\\xec\\xe0\\x8b*\\xee\\x031\\x91\\x0f38\\x0f\\x08\\xc0\\xcd\\xa9\\x1a^\\x90]\\xc9!>\\xa9S*\\x94\\x8c\\x17\\xa8h\\xc3 - security.ima=E\\x04L\\tb@9\\x07!h) trusted.overlay.custom=~\\x16\\x1f-\\xfc\\xa3\\x07\\x17\\xd1\\xa0 trusted.test_2=O\n\ +/B/Eap_z828H.-6-_S 0 14476 1 4557 40071 0 206142614.191638235 - - - security.ima=H\\xfd\\x9e&\\x9a:\\xe5\\x93\\xa4 
system.posix_acl_access=N\\x1c|\\xc7$O3\\x198%\\xb4\\xe8 trusted.overlay.origin=Y+\\xa4\\xd1\\x16r\\xdd|\\xfaG user.test_4=\n\ +/B/Gv7O_..._.faB2-_-22dNscP_eGqkxP35_.0l.w.hfrZXl_v4h.MGEE7___GGF221-V-__WgP-h-6Th_NIB_._j.-U.Qj_2_iA.P_3_-_..9.1oxn4_mM_6XEAJ196_.6Z9iR_YM-Wr0L_.kz.icFqb_EzB27-___AC7bGW_.t_rwee8rtQ4_0rD_t1-J__5iR.r1_8cNUQXai5w4.e2_G-.7j.DyiD__Rfv6Lhgfzn-QFr_-J 44 124140 1 29304 30605 0 620161379.796821778 ____SlN/.yp1zAst_-P/5_RO_-cy7O_Z__310L__d2yo - -\n\ +/B/IP-_jBs 1 126270 1 31623 24545 0 1072774021.893731176 \\xcb - -\n\ +/B/KAS.d8m.y6U 16 125603 1 24529 17343 0 340236667.19836524 9\\x14\\xe2{\\xe9[\\x96q\\x08h;\\xc8\\x83\\xa4\\xb3\\xb9 - - trusted.overlay.origin=b\\xec'\\x8c\\x16\\xea\\xcb\\x10\\xc8\\xbe\\x18\\xf7*\\x0c\\x04\\xb8\\xb1 trusted.overlay.overlay.nested= trusted.test_1=e\\x08#\n\ +/B/Mp 27 106753 1 37244 13252 0 91373000.857571176 - OV\\x8e!\\xfdw9I\\xab\\x8f\\x9a;!\\xb4]f\\n]\\xc8\\x7f\\xa5\\x94\\x07\\xd4%\\x97\\x85 -\n\ +/B/Ze.7.-.9_._Ocl1k2_ 46 107670 1 14097 58513 0 488459452.877162371 - \\xc1\\x17\\x1d\\xa7\\x14S)\\xcd}\\xc9/~\\xa4d\\x1cN\\xbeN\\x184\\x90\\xa9A\\x12\\x8bY/(\\x1a,%\"\\xe3\\xb3\\xf2\\x86\\xec\\x20\\xf6\"Ug;\\x84\\\\A - trusted.overlay.origin=\\xfe\\xda7D\\xbf\\xb0\\xe9\\x9ct0Q user.test_4=-\\xdc\n\ +/B/]\\x05\\x19i\\x97\\xeb\\x8c\\xc4k\\x02\\\\jB`j\\x8f\\xb4\\xb6\\xfbw5\\xef\\xf3\\x0fd 0 23230 1 31997 45657 7135 105859383.867998730 - - - system.posix_acl_default=\\xb1p\\x96\\xe45\\xdcC\\x8bI\\x0e\\xfd#\\x8d\n\ +/B/_tvW.__t_l_-jK.4j 554649 106606 1 29300 51208 0 705049404.750293896 e5/39a0e32972ef85332212be14f7b863409d9e4113f80603285d1cd52a852822 - e539a0e32972ef85332212be14f7b863409d9e4113f80603285d1cd52a852822 user.test_4=\\xbf\\xbbL\\xe9\\xbc\\x92$\\xa3\\xf9\\xc6\\x06.\\x3d^\n\ +/B/q._v.T_.Mba__ 32 122305 1 29088 34366 0 881062039.274688283 _C_Kn1_.r_.IK/TGai6_zqLoTt___w_e - - trusted.overlay.overlay.nested=6\\x03\\xee\\xff\\xdbI\\xdcu(\\\\\\xe1\\x9a\\xee\\xd3e\\x06 user.test_2=\\x9a\\xc4$\\xe1\n\ +/B/u 25 105023 2 
14652 44878 0 294073763.291036424 - \\x84R\\xd6@\\x0e\\x8b\\x04\\xb4(e\\x93\\xe9\\x86\\xdc\\x03\\xc7\\xbf\\xe1,OmC\\xe9U\\xf1 - trusted.overlay.origin=\\xc4mH\\x9a\n\ +/B/\\x81X\\xef\\r\\xce\\x12\\xf4U(p\\xc3\\xb2\\x19\\xe3r\\xd2v9\\x1c\\x02\\xca 46 121141 1 3272 11859 0 1219611767.718731195 jfsk35_Gz__n4tv4xzFFcj_.Z_AV__IJS_k_1I__FuSb.2 - - security.selinux= trusted.overlay.upper=\\x07\\xe8\\xa1%\\xbe\\xb0\\xc8)\\xcf\\xc2\\xf8\\xbah\\x19\\xae_\\xccH\\x9f\\xf0 trusted.test_1=i\\xe6\\xd9\\xd0 user.test_2=\\xc8\\xa0K\\xb2\\xa0V\\xb0\\xb7\\xd1\\xec(\\x95\\xfe\\xbb`\n\ +/B/\\xc4\\xf8\\x92\\xc2}<4\\xc8\\xec\\xd2\\xa5\\xe6\\x9ee\\xf0\\x95\\xf8(dumpfile).unwrap(); + + let c_image = c_mkcomposefs_from_dumpfile(dumpfile); + let rust_image = mkfs_erofs_versioned( + &mut ValidatedFileSystem::new(fs_rs).unwrap(), + FormatVersion::V1, + ); + + let c_debug = debug_dump(&c_image); + let rust_debug = debug_dump(&rust_image); + + println!("=== C mkcomposefs ({} bytes) ===", c_image.len()); + println!("{c_debug}"); + println!("=== Rust V1 writer ({} bytes) ===", rust_image.len()); + println!("{rust_debug}"); + println!("=== Structural diff (c vs rust) ==="); + println!("{}", diff_debug_dumps("c", &c_debug, "rust", &rust_debug)); + + assert_eq!( + c_image, + rust_image.as_ref(), + "images differ — see structural diff above" + ); } } @@ -2561,7 +3504,7 @@ mod tests { /bbb 5 100644 1 0 0 0 1000.0 - world - "#; let fs = dumpfile_to_filesystem::(dumpfile).unwrap(); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); // Sanity: the unmodified image round-trips fine erofs_to_filesystem::(&image).unwrap(); @@ -2582,4 +3525,651 @@ mod tests { "unexpected error: {msg}" ); } + + /// Regression test for the block-boundary EUCLEAN bug (bug.md). 
+ /// + /// Old kernels (< 6.12) return EFSCORRUPTED from erofs_fill_symlink() when: + /// (inode_offset % block_size) + inode_and_xattr_size + symlink_len > block_size + /// + /// The V1 writer previously used the wrong condition (derived from the + /// non-symlink branch of the C reference) and padded the wrong target + /// (inline_start rather than inode_start), silently producing images that + /// would EUCLEAN on CentOS Stream 9 (kernel 5.14) for symlinks with large + /// SELinux xattrs such as those in /etc/pki/ca-trust/extracted/pem/directory-hash/. + /// + /// This test: + /// 1. Builds a V1 image that forces a symlink inode near a block boundary + /// by packing enough filler inodes before it. + /// 2. Asserts the validator passes (writer fixed the layout). + /// 3. Asserts the symlink round-trips correctly. + /// + /// The construction: inode table starts at offset 1152. We add enough + /// compact filler inodes (FIFOs, 32 bytes each with min mtime) to push + /// the subsequent symlink to a position where the old code would have + /// placed it straddling the 4096-byte boundary. + #[test] + fn test_v1_symlink_block_boundary_euclean_regression() { + use crate::erofs::{format::FormatVersion, writer::mkfs_erofs_versioned}; + + // A realistic SELinux label of the kind found on ca-trust symlinks. + // 56 bytes — enough that header(64) + xattr(~140) + symlink(24) > 4096 + // when the inode starts near offset 3968 within a block. + let selinux_label = "system_u:object_r:cert_t:s0\x00".repeat(2); + // Trim to exactly 56 bytes so xattr body is predictable + let selinux_label = &selinux_label[..selinux_label.len().min(56)]; + + // Build the dumpfile: root + many compact filler FIFOs + the victim symlink. + // + // Filler FIFOs: mtime=0, no xattrs → compact inode (32 bytes each in V1). + // The inode table starts at 1152.
We need to fill up to offset ~3968 within + // some 4096-block, which is (3968 - 1152) % 4096 = 2816 bytes = 88 compact inodes + // in the first block. Add a few more to cross into block 1 and land the + // victim at the right position in block 1. + // + // We overshoot slightly and rely on the writer's fix to pad correctly. + // The validator then confirms no inode violates the kernel condition. + let mut dumpfile = String::from("/ 0 40755 2 0 0 0 0.0 - - -\n"); + for i in 0..120usize { + dumpfile.push_str(&format!("/filler{i:03} 0 10644 1 0 0 0 0.0 - - -\n")); + } + // Victim: symlink with a large SELinux xattr. + let target = "/etc/pki/ca-trust/source"; // 24-byte target + let target_len = target.len(); + let xattr_val_hex: String = selinux_label + .bytes() + .map(|b| format!("\\x{b:02x}")) + .collect(); + dumpfile.push_str(&format!( + "/victim {target_len} 120777 1 0 0 0 0.0 {target} - - security.selinux={xattr_val_hex}\n" + )); + + let fs = dumpfile_to_filesystem::(&dumpfile).unwrap(); + let image = mkfs_erofs_versioned( + &mut ValidatedFileSystem::new(fs).unwrap(), + FormatVersion::V1, + ); + + // The validator must pass: the writer should have padded the inode + // to a block boundary so the kernel condition is never violated. + Image::open(&image) + .unwrap() + .fsck_metadata() + .expect("V1 writer should produce valid inline layout (block-boundary fix)"); + + // The symlink target must round-trip correctly. 
+ let fs_rt = + erofs_to_filesystem::(&image).expect("image should parse cleanly"); + let victim_id = fs_rt + .root + .leaf_id(std::ffi::OsStr::new("victim")) + .expect("victim symlink not found in round-tripped filesystem"); + let link_target = match &fs_rt.leaves[victim_id.0].content { + crate::tree::LeafContent::Symlink(t) => t.clone(), + other => panic!("victim should be a symlink, got {other:?}"), + }; + assert_eq!( + link_target.as_ref(), + std::ffi::OsStr::new(target), + "symlink target mismatch after V1 round-trip" + ); + } + + /// Tests that `fsck_metadata` catches a V1 image where symlink + /// padding was suppressed, causing the inode+inline data to cross a block + /// boundary. Uses `WriterFaults` to inject the fault rather than raw byte + /// surgery, so the image is otherwise structurally coherent. + #[test] + fn test_v1_inline_layout_validator_catches_bad_layout() { + use crate::erofs::{ + format::FormatVersion, + writer::{WriterFaults, mkfs_erofs_versioned, mkfs_erofs_with_faults}, + }; + + // Layout math (all sizes in bytes, block_size = 4096): + // + // A symlink crosses a block boundary when: + // symlink_pos % 4096 + 32 (inode) + target_len > 4096 + // => symlink_pos % 4096 > 4096 - 32 - target_len + // + // With target_len = SYMLINK_MAX = 1024 (crate::SYMLINK_MAX): + // symlink_pos % 4096 > 3040 (i.e. 
slot >= 96 within a block) + // + // Inode table layout (V1): + // Bytes 0..1152 : composefs header (32 B) + pad to 1024 + EROFS superblock (128 B) + // = 36 slots (NID 0-35) + // NID 36 : root inode (32 B inode header) + // NID 36 inline : root dir entries (inline, variable) + // + // With 50 filler files named "f00".."f49" (sort before "link"): + // - 51 dirents: 51 * 12 = 612 B + // - names: 50*3 + 4 = 154 B + // - total inline: 766 B + // - root occupies: 32 + ~766 = 798 B (slot-padded) + // - 50 empty files: 50 * 32 = 1600 B + // - symlink (without block-boundary padding): NID 113, pos_in_block=3616 + // 3616 + 32 + 1024 = 4672 > 4096 → crossing condition ✓ + // + // Note: the *good* image places the symlink at pos_in_block == 0 because + // the writer correctly pads it to a block boundary. We verify crossing + // by checking the *bad* image (padding suppressed) instead. + + // filler_count=50 places the symlink at NID 113 (pos_in_block=3616). + // Without the block-boundary padding: 3616 + 32 + 1024 = 4672 > 4096 ✓ + // The assertion below verifies this whenever the test runs. + let filler_count = 50usize; + let mut lines = String::from("/ 0 40755 2 0 0 0 0.0 - - -\n"); + for i in 0..filler_count { + lines.push_str(&format!("/f{i:02} 0 100644 1 0 0 0 0.0 - - -\n")); + } + let target = "a".repeat(crate::SYMLINK_MAX); + lines.push_str(&format!( + "/link {len} 120777 1 0 0 0 0.0 {target} - -\n", + len = target.len(), + target = target, + )); + let fs = dumpfile_to_filesystem::(&lines).unwrap(); + let mut vfs = ValidatedFileSystem::new(fs).unwrap(); + + // The good image must pass validation. + let good_image = mkfs_erofs_versioned(&mut vfs, FormatVersion::V1); + Image::open(&good_image) + .unwrap() + .fsck_metadata() + .expect("valid image should pass"); + + // Build the faulted image (symlink pad suppressed). 
+ let mut faults = WriterFaults::new(42); + faults.skip_symlink_pad_rate = 1.0; // always skip padding + let bad_image = mkfs_erofs_with_faults(&mut vfs, FormatVersion::V1, faults); + + // Confirm the symlink in the bad image actually crosses a block boundary — + // i.e. the fault injection put the symlink at a dangerous slot. + { + let img = Image::open(&bad_image).unwrap(); + let root_nid = img.sb.root_nid.get() as u64; + let link_nid = img + .find_child_nid(root_nid, b"link") + .unwrap() + .expect("link nid not found"); + let link_offset = (link_nid * 32) as usize; + let pos_in_block = link_offset % 4096; + assert!( + pos_in_block + 32 + crate::SYMLINK_MAX > 4096, + "symlink at pos_in_block={pos_in_block} does not cross a block boundary \ + in the bad image (32+{symlink_max}={total} ≤ 4096); \ + increase filler_count (currently {filler_count})", + symlink_max = crate::SYMLINK_MAX, + total = 32 + crate::SYMLINK_MAX, + ); + } + + // The faulted image must fail validation. + let result = Image::open(&bad_image).unwrap().fsck_metadata(); + assert!( + result.is_err(), + "validator should reject image with suppressed symlink padding" + ); + let msg = result.unwrap_err().to_string(); + assert!( + msg.contains("EUCLEAN") || msg.contains("nid"), + "error should mention EUCLEAN or nid, got: {msg}" + ); + } + + /// B2: Files with a negative `st_mtim_sec` (pre-epoch mtime) must not corrupt + /// the V1 superblock `build_time` field. + /// + /// `calculate_min_mtime` casts `st_mtim_sec as u64`. A value of -1 wraps to + /// `u64::MAX`, which is larger than any positive timestamp, so positive mtimes + /// are correctly selected as the minimum. This test verifies that a filesystem + /// containing one inode with mtime = -1 and one with mtime = 1000 produces a + /// V1 image whose superblock `build_time` equals 1000. 
+ #[test] + fn test_negative_mtime_does_not_corrupt_build_time() { + use std::{collections::BTreeMap, ffi::OsStr}; + + use crate::{ + erofs::{format::FormatVersion, writer::mkfs_erofs_versioned}, + fsverity::Sha256HashValue, + generic_tree::{LeafContent, Stat}, + tree::{self, RegularFile}, + }; + + let root_stat = Stat { + st_mode: 0o40755, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: BTreeMap::new(), + }; + + let mut fs = tree::FileSystem::::new(root_stat); + + // Inode with negative mtime (-1). As u64 this wraps to u64::MAX, which + // is larger than 1000, so it should NOT win the minimum comparison. + let neg_stat = Stat { + st_mode: 0o100644, + st_uid: 0, + st_gid: 0, + st_mtim_sec: -1, + st_mtim_nsec: 0, + xattrs: BTreeMap::new(), + }; + let leaf_id = fs.push_leaf( + neg_stat, + LeafContent::Regular(RegularFile::Inline(Box::new([]))), + ); + fs.root + .insert(OsStr::new("neg"), tree::Inode::leaf(leaf_id)); + + let image = mkfs_erofs_versioned( + &mut ValidatedFileSystem::new(fs).unwrap(), + FormatVersion::V1, + ); + let img = Image::open(&image).expect("failed to open V1 image"); + + // The superblock build_time must be 1000 (the root mtime), not u64::MAX or 0. + assert_eq!( + img.sb.build_time.get(), + 1000, + "build_time should be the positive minimum mtime (1000), \ + not the wrapped negative value" + ); + } + + /// B3: Directories with enough entries to span multiple 4096-byte blocks must + /// survive a round-trip through the V2 EROFS writer. + /// + /// Each dirent is 12 bytes (header) + name length bytes. With 50 entries of + /// 90-byte names: 50 × (12 + 90) = 5100 bytes > 4096, which forces + /// `Directory::from_entries` to split across at least two blocks. + /// + /// This test verifies that all entry names survive the round-trip intact. 
+ #[test] + fn test_multiblock_directory_round_trip() { + use std::{collections::BTreeMap, ffi::OsStr}; + + use crate::{ + erofs::writer::mkfs_erofs, + fsverity::Sha256HashValue, + generic_tree::{LeafContent, Stat}, + tree::{self, RegularFile}, + }; + + let root_stat = Stat { + st_mode: 0o40755, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: BTreeMap::new(), + }; + + let leaf_stat = Stat { + st_mode: 0o100644, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: BTreeMap::new(), + }; + + let mut fs = tree::FileSystem::::new(root_stat.clone()); + + const N: usize = 50; + let mut expected_names: Vec = vec![".".into(), "..".into()]; + + // Build a subdirectory with N entries, each with a 90-byte name. + // N × (12 + 90) = 5100 bytes — forces a multi-block directory. + let mut subdir = tree::Directory::::new(root_stat); + for i in 0..N { + let name = format!("{:0>90}", i); + let leaf_id = fs.push_leaf( + leaf_stat.clone(), + LeafContent::Regular(RegularFile::Inline(Box::new([]))), + ); + subdir.insert(OsStr::new(&name), tree::Inode::leaf(leaf_id)); + expected_names.push(name); + } + + fs.root.insert( + OsStr::new("bigdir"), + tree::Inode::Directory(Box::new(subdir)), + ); + + let image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); + let img = Image::open(&image).expect("failed to open image"); + + // Locate "bigdir" in root + let root_nid = img.sb.root_nid.get() as u64; + let bigdir_nid = img + .find_child_nid(root_nid, b"bigdir") + .expect("find_child_nid error") + .expect("bigdir not found in root"); + + // Collect all entry names from bigdir (blocks + inline) + let bigdir_inode = img.inode(bigdir_nid).unwrap(); + let mut found_names: Vec = Vec::new(); + if let Some(inline) = bigdir_inode.inline() { + let inline_block = DirectoryBlock::ref_from_bytes(inline).unwrap(); + for entry in inline_block.entries().unwrap() { + let entry = entry.unwrap(); + 
found_names.push(String::from_utf8(entry.name.to_vec()).unwrap()); + } + } + for blkid in img.inode_blocks(&bigdir_inode).unwrap() { + let block = img.directory_block(blkid).unwrap(); + for entry in block.entries().unwrap() { + let entry = entry.unwrap(); + found_names.push(String::from_utf8(entry.name.to_vec()).unwrap()); + } + } + + found_names.sort(); + expected_names.sort(); + + assert_eq!( + found_names, expected_names, + "multi-block directory lost entries after round-trip" + ); + + // Verify the image is a valid EROFS filesystem that can be round-tripped + let _fs_rt = erofs_to_filesystem::(&image) + .expect("erofs_to_filesystem failed on multi-block directory image"); + + // Sanity: verify the image passes fsck.erofs if available + if let Some(ok) = run_fsck_erofs(&image) { + assert!( + ok, + "fsck.erofs reported errors in multi-block directory image" + ); + } + } + + /// `ValidatedFileSystem::new` must reject a hardlinked whiteout. + /// A whiteout (chardev rdev=0) with nlink > 1 is semantically invalid. 
+ #[test] + fn test_hardlinked_whiteout_writer_rejects() { + use std::ffi::OsStr; + + use crate::{ + erofs::writer::ValidatedFileSystem, + fsverity::Sha256HashValue, + generic_tree::{LeafContent, Stat}, + tree, + }; + + let root_stat = Stat { + st_mode: 0o40755, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: Default::default(), + }; + let whiteout_stat = Stat { + st_mode: 0o20000, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: Default::default(), + }; + + let mut fs = tree::FileSystem::::new(root_stat); + let leaf_id = fs.push_leaf(whiteout_stat, LeafContent::CharacterDevice(0)); + fs.root + .insert(OsStr::new("whiteout"), tree::Inode::leaf(leaf_id)); + fs.root.insert( + OsStr::new("hardlink_to_whiteout"), + tree::Inode::leaf(leaf_id), + ); + + let result = ValidatedFileSystem::new(fs); + assert!( + result.is_err(), + "ValidatedFileSystem::new should reject hardlinked whiteout" + ); + let err = result.unwrap_err(); + assert!( + err.to_string().contains("whiteout inode has nlink > 1"), + "unexpected error message: {err}" + ); + } + + // ── Epoch-invariant validation tests ────────────────────────────────────── + + /// Helper: build a simple filesystem from a dumpfile and write it as an + /// Epoch2 (V2) EROFS image. + fn build_epoch2_image(dumpfile: &str) -> Box<[u8]> { + use crate::erofs::writer::{ValidatedFileSystem, mkfs_erofs}; + let fs = crate::dumpfile::dumpfile_to_filesystem::(dumpfile).unwrap(); + mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()) + } + + /// Helper: build a filesystem from a dumpfile and write it as an Epoch1 (V1) + /// EROFS image. 
+ fn build_epoch1_image(dumpfile: &str) -> Box<[u8]> { + use crate::erofs::writer::ValidatedFileSystem; + use crate::erofs::{format::FormatVersion, writer::mkfs_erofs_versioned}; + let fs = crate::dumpfile::dumpfile_to_filesystem::(dumpfile).unwrap(); + mkfs_erofs_versioned( + &mut ValidatedFileSystem::new(fs).unwrap(), + FormatVersion::V1, + ) + } + + /// An Epoch2 image produced by the Rust writer must pass epoch-invariant validation. + #[test] + fn test_epoch2_image_passes_epoch_invariants() { + let dumpfile = r#"/ 0 40755 2 0 0 0 1000.0 - - - +/file 5 100644 1 0 0 0 1000.0 - hello - +/dir 0 40755 2 0 0 0 1000.0 - - - +/dir/nested 3 100644 1 0 0 0 1000.0 - abc - +"#; + let image = build_epoch2_image(dumpfile); + Image::open(&image) + .unwrap() + .fsck_metadata() + .expect("Epoch2 image should pass epoch-invariant validation"); + } + + /// An Epoch1 image produced by the Rust V1 writer must pass epoch-invariant + /// validation (exactly 256 root stubs, no native whiteouts elsewhere). + #[test] + fn test_epoch1_image_passes_epoch_invariants() { + let dumpfile = r#"/ 0 40755 2 0 0 0 1000.0 - - - +/file 5 100644 1 0 0 0 1000.0 - hello - +/dir 0 40755 2 0 0 0 1000.0 - - - +/dir/nested 3 100644 1 0 0 0 1000.0 - abc - +"#; + let image = build_epoch1_image(dumpfile); + Image::open(&image) + .unwrap() + .fsck_metadata() + .expect("Epoch1 image should pass epoch-invariant validation"); + } + + /// An Epoch2 image must not contain escaped whiteouts (regular file + + /// trusted.overlay.overlay.whiteout xattr). We craft one by building a V1 + /// image with a user whiteout (which gets escaped in V1) and patching + /// composefs_version → 2, then verify that validate_epoch_invariants rejects it. 
+ #[test] + fn test_epoch2_image_with_escaped_whiteout_fails_epoch_invariants() { + use crate::erofs::format::FormatVersion; + use crate::erofs::writer::{ValidatedFileSystem, mkfs_erofs_versioned}; + use crate::generic_tree::{LeafContent, Stat}; + use std::ffi::OsStr; + + // Build a filesystem with a user whiteout (char-device rdev=0). When + // written as V1, the writer escapes it to a regular file + xattr. + let mut fs = crate::tree::FileSystem::::new(Stat { + st_mode: 0o755, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 0, + st_mtim_nsec: 0, + xattrs: Default::default(), + }); + let wh_id = fs.push_leaf( + Stat { + st_mode: 0o777, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 0, + st_mtim_nsec: 0, + xattrs: Default::default(), + }, + LeafContent::CharacterDevice(0), // rdev=0 → whiteout + ); + fs.root + .insert(OsStr::new("mywhiteout"), crate::tree::Inode::leaf(wh_id)); + + let mut image = mkfs_erofs_versioned( + &mut ValidatedFileSystem::new(fs).unwrap(), + FormatVersion::V1, + ) + .to_vec(); + + // Patch composefs_version → 2; the escaped whiteout xattr is still present. + const COMPOSEFS_VERSION_OFFSET: usize = 12; + image[COMPOSEFS_VERSION_OFFSET..COMPOSEFS_VERSION_OFFSET + 4] + .copy_from_slice(&2u32.to_le_bytes()); + + let result = Image::open(&image).unwrap().fsck_metadata(); + let err = result.expect_err("Epoch2 image with escaped whiteout should be rejected"); + let msg = format!("{err}"); + assert!( + msg.contains("escaped whiteout") || msg.contains("Epoch2"), + "expected escaped-whiteout error, got: {msg}", + ); + } + + /// An Epoch1 image with fewer than 256 root stubs must be rejected. + /// We craft this by building a V2 (Epoch2) image and patching + /// composefs_version → 1, turning it into an "Epoch1" image that lacks the + /// 256 required stubs. + #[test] + fn test_epoch1_image_missing_stubs_fails_epoch_invariants() { + // A V2 image has no stubs at all. 
Patching its composefs_version to 1 + // makes the validator treat it as Epoch1 and count 0 stubs (≠ 256). + let dumpfile = "/ 0 40755 2 0 0 0 1000.0 - - -\n"; + let mut image = build_epoch2_image(dumpfile).to_vec(); + + // composefs_version is at offset 12 in ComposefsHeader (4th U32). + const COMPOSEFS_VERSION_OFFSET: usize = 12; + image[COMPOSEFS_VERSION_OFFSET..COMPOSEFS_VERSION_OFFSET + 4] + .copy_from_slice(&1u32.to_le_bytes()); + + let result = Image::open(&image).unwrap().fsck_metadata(); + let err = result.expect_err("Epoch1 image missing stubs should be rejected"); + let msg = format!("{err}"); + assert!( + msg.contains("hex-named") || msg.contains("256") || msg.contains("Epoch1"), + "expected stub-count error, got: {msg}", + ); + } + + /// The reader must reject an image with a hardlinked whiteout. + /// + /// We build a valid image with a hardlinked chardev(rdev=1), which the writer + /// accepts. We then patch the inode's `u` field (rdev) from 1 to 0 in the raw + /// image bytes, turning it into a whiteout on-disk while leaving nlink > 1. + /// The reader must detect this and return an error. + #[test] + fn test_hardlinked_whiteout_reader_rejects() { + use std::ffi::OsStr; + + use crate::{ + fsverity::Sha256HashValue, + generic_tree::{LeafContent, Stat}, + tree, + }; + + let root_stat = Stat { + st_mode: 0o40755, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: Default::default(), + }; + let chardev_stat = Stat { + st_mode: 0o20000, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 1000, + st_mtim_nsec: 0, + xattrs: Default::default(), + }; + + let mut fs = tree::FileSystem::::new(root_stat); + // Use rdev=1 (not a whiteout) so the writer accepts the hardlink. 
+ let leaf_id = fs.push_leaf(chardev_stat, LeafContent::CharacterDevice(1)); + fs.root + .insert(OsStr::new("chardev"), tree::Inode::leaf(leaf_id)); + fs.root.insert( + OsStr::new("hardlink_to_chardev"), + tree::Inode::leaf(leaf_id), + ); + + use crate::erofs::writer::mkfs_erofs; + let base_image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); + + // Sanity: the unpatched image must be accepted. + erofs_to_filesystem::(&base_image) + .expect("unmodified image with rdev=1 hardlink should be accepted"); + + // Locate the chardev inode in the image using the erofs Image API. + let img = Image::open(&base_image).unwrap(); + let root_nid = img.sb.root_nid.get() as u64; + let chardev_nid = img + .find_child_nid(root_nid, b"chardev") + .unwrap() + .expect("chardev entry must exist"); + + // Parse the inode via the Image API to learn its layout (compact vs + // extended) and locate its slot in the image. We record what we need + // before releasing the shared borrow so we can take `&mut` afterwards. + let inode = img.inode(chardev_nid).unwrap(); + let is_extended = matches!(inode, InodeType::Extended(_)); + // The inode region is the `inodes` sub-slice of `image`; the slot for + // NID n starts at n*32 bytes into that region. + let inodes_start = img.image.len() - img.inodes.len(); + let inode_slot_start = inodes_start + chardev_nid as usize * 32; + drop(inode); + drop(img); + + // Mutate a copy of the image: set the `u` field (rdev) from 1 → 0, + // turning the chardev into a whiteout on-disk while leaving nlink > 1. + // Use zerocopy to reinterpret the slot bytes as the concrete header type + // so we get a typed `&mut` rather than raw byte arithmetic. 
+ let mut image = base_image.to_vec(); + let slot = &mut image[inode_slot_start..]; + if is_extended { + use core::mem::size_of; + let hdr = + ExtendedInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid ExtendedInodeHeader"); + assert_eq!(hdr.u.get(), 1, "expected rdev=1 before patching"); + hdr.u = zerocopy::little_endian::U32::new(0); + } else { + use core::mem::size_of; + let hdr = + CompactInodeHeader::mut_from_bytes(&mut slot[..size_of::()]) + .expect("inode slot must be a valid CompactInodeHeader"); + assert_eq!(hdr.u.get(), 1, "expected rdev=1 before patching"); + hdr.u = zerocopy::little_endian::U32::new(0); + } + + // The reader must reject the patched image. + let result = erofs_to_filesystem::(&image); + let err = result.expect_err("reader should reject image with hardlinked whiteout"); + let err_msg = format!("{err:#}"); + assert!( + err_msg.contains("nlink"), + "error message should mention nlink, got: {err_msg}" + ); + } } diff --git a/crates/composefs/src/erofs/writer.rs b/crates/composefs/src/erofs/writer.rs index 7fe59e61..34993d15 100644 --- a/crates/composefs/src/erofs/writer.rs +++ b/crates/composefs/src/erofs/writer.rs @@ -1,12 +1,49 @@ -//! EROFS image generation and writing functionality. +//! EROFS image generation from composefs trees. //! -//! This module provides functionality to generate EROFS filesystem images -//! from composefs tree structures, handling inode layout, directory blocks, -//! and metadata serialization. +//! The public entry points are: +//! +//! - [`mkfs_erofs`] — generate a single EROFS image using the repository's default format +//! - [`mkfs_erofs_versioned`] — generate an image for a specific [`FormatVersion`] +//! +//! Both require a [`ValidatedFileSystem`], which is the type-safe gate: constructing +//! one runs [`fsck`](crate::tree::FileSystem::fsck) and checks EROFS-specific invariants +//! (e.g. whiteout inodes must not be hardlinked). 
A validated filesystem cannot panic the +//! writer. +//! +//! ## Format versions +//! +//! Three on-disk formats are supported, selected by [`FormatVersion`]: +//! +//! **V0** (`FormatVersion::V0`) is byte-for-byte compatible with C `mkcomposefs` default +//! output. It uses compact inodes (32 bytes) where the inode fits, extended (64 bytes) +//! otherwise; collects inodes in BFS order; includes a 256-entry whiteout stub table at the +//! start of the inode area; sets `build_time` to the minimum mtime; and encodes user-visible +//! whiteout files (chr 0,0) via `trusted.overlay.opaque=x` xattrs rather than storing them +//! directly. The `composefs_version` header field is `0` normally and auto-upgrades to `1` +//! when user whiteouts are present. +//! +//! **V1** (`FormatVersion::V1`, the default) uses the same EROFS layout as V0 but always +//! writes `composefs_version=1` in the header — equivalent to C `mkcomposefs --min-version=1`. +//! This is the recommended default for new repositories. +//! +//! **V2** (`FormatVersion::V2`) is the composefs-rs native format. It always uses extended +//! inodes (64 bytes), collects inodes in DFS order, omits the whiteout stub table, sets +//! `build_time` to 0, and sets `composefs_version` to 2. Whiteout files are stored without +//! escaping. +//! +//! ## Two-pass layout + emit design +//! +//! `write_erofs` is called twice on the same inode list. The first pass uses a `FirstPass` +//! output that counts bytes and records the byte offset of every inode, block, and data +//! region without writing anything. The second pass uses a `SecondPass` output that +//! serializes bytes into a buffer. EROFS node IDs (nids) and cross-region offsets can only +//! be computed after the first pass, so all [`InodeRef::Known`] references are resolved +//! between the two passes. 
use std::{ - collections::{BTreeMap, HashMap}, + collections::{BTreeMap, HashMap, HashSet}, mem::size_of, + num::NonZeroUsize, os::unix::ffi::OsStrExt, }; @@ -15,48 +52,291 @@ use xxhash_rust::xxh32::xxh32; use zerocopy::{Immutable, IntoBytes}; use crate::{ - erofs::{composefs::OverlayMetacopy, format, reader::round_up}, + erofs::{ + composefs::OverlayMetacopy, + format::{self, FormatEpoch}, + reader::round_up, + }, fsverity::FsVerityHashValue, generic_tree::LeafId, tree, }; -#[derive(Clone, Copy, Debug)] -enum Offset { - Header, - Superblock, - Inode, - XAttr, - Block, - End, +/// A composefs filesystem tree validated for EROFS serialization. +/// +/// Can only be constructed via [`ValidatedFileSystem::new`], which checks +/// that the tree satisfies all EROFS invariants — for example, that no +/// whiteout inode (character device with rdev=0) has `nlink > 1`. +/// +/// Passing a `ValidatedFileSystem` to [`mkfs_erofs`] or +/// [`mkfs_erofs_versioned`] therefore cannot panic. +pub struct ValidatedFileSystem(pub(crate) tree::FileSystem); + +impl std::fmt::Debug + for ValidatedFileSystem +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("ValidatedFileSystem").field(&self.0).finish() + } +} + +impl ValidatedFileSystem { + /// Validate `fs` and wrap it. Returns an error if any invariant is violated. + pub fn new(fs: tree::FileSystem) -> anyhow::Result { + validate_filesystem(&fs)?; + Ok(Self(fs)) + } +} + +impl std::ops::Deref for ValidatedFileSystem { + type Target = tree::FileSystem; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +pub(crate) fn validate_filesystem( + fs: &tree::FileSystem, +) -> anyhow::Result<()> { + // Check structural invariants: leaf ref bounds, no orphaned leaves. + fs.fsck() + .map_err(|e| anyhow::anyhow!("invalid composefs filesystem: {e}"))?; + + // Check EROFS-specific constraint: whiteout inodes (chardev rdev=0) must not be hardlinked. 
+ let nlinks = fs.nlinks(); + for (idx, leaf) in fs.leaves.iter().enumerate() { + if matches!(leaf.content, tree::LeafContent::CharacterDevice(0)) { + let nlink = nlinks[idx]; + if nlink > 1 { + anyhow::bail!("invalid composefs filesystem: whiteout inode has nlink > 1"); + } + } + } + Ok(()) +} + +/// Size of one EROFS inode slot in bytes. All inode offsets must be a multiple of this. +const INODE_SLOT_SIZE: usize = 32; + +/// EROFS xattr values are addressed in 4-byte words; all xattr offsets and counts use this unit. +const XATTR_WORD_SIZE: usize = size_of::(); + +/// Size of the InodeXAttrHeader in bytes, used in xattr_icount calculation. +const INODE_XATTR_HEADER_SIZE: usize = size_of::(); + +/// Returns the byte offset of `pos` within its EROFS block (i.e. `pos % BLOCK_SIZE`). +/// +/// `BLOCK_SIZE` (4096) is a nonzero constant, so this operation never panics. +fn block_offset(pos: u64) -> u64 { + pos % u64::from(format::BLOCK_SIZE) +} + +/// Returns the number of bytes from `pos` to the next EROFS block boundary, +/// or `None` if `pos` is already block-aligned (no padding needed). +/// +/// When `Some`, the result is always in `1..BLOCK_SIZE`. +fn bytes_to_block_boundary(pos: u64) -> Option { + let offset = block_offset(pos); + if offset == 0 { + return None; + } + let block_size = u64::from(format::BLOCK_SIZE); + let padding = block_size + .checked_sub(offset) + .expect("block_offset(pos) < BLOCK_SIZE by construction"); + debug_assert!(padding >= 1 && padding < block_size); + Some(padding) +} + +/// Deterministic fault injector for writer tests. +/// +/// Each field is a probability in [0.0, 1.0]: +/// 0.0 = never inject this fault +/// 1.0 = always inject this fault +/// +/// Construct with `WriterFaults::new(seed)` then set the rates you need. 
+/// Because `write_erofs` runs twice (layout pass then emit pass), decisions +/// are recorded during the first pass and replayed during the second so that +/// both passes make identical choices and produce a structurally coherent image. +#[cfg(test)] +pub(crate) struct WriterFaults { + rng: rand::rngs::SmallRng, + /// Skip the symlink block-boundary padding (produces a malformed image). + pub skip_symlink_pad_rate: f64, + /// Decisions recorded during the first pass; replayed during the second. + decisions: Vec, + /// Index into `decisions` during replay. + replay_idx: usize, + /// True after `start_replay()` is called. + replaying: bool, +} + +#[cfg(test)] +impl WriterFaults { + pub fn new(seed: u64) -> Self { + use rand::SeedableRng; + Self { + rng: rand::rngs::SmallRng::seed_from_u64(seed), + skip_symlink_pad_rate: 0.0, + decisions: Vec::new(), + replay_idx: 0, + replaying: false, + } + } + + /// Call between first and second pass to switch to replay mode. + pub(crate) fn start_replay(&mut self) { + self.replaying = true; + self.replay_idx = 0; + } + + fn should_skip_symlink_pad(&mut self) -> bool { + if self.replaying { + let decision = self.decisions[self.replay_idx]; + self.replay_idx += 1; + decision + } else { + use rand::RngExt; + let decision = self.rng.random::() < self.skip_symlink_pad_rate; + self.decisions.push(decision); + decision + } + } +} + +/// Bundles the parameters that are constant across a single `write_erofs` call. +struct WriteContext { + version: format::FormatVersion, + min_mtime: (u64, u32), + header_flags: u32, + /// The `composefs_version` value written to the ComposefsHeader. + /// + /// For V2: always 2 (COMPOSEFS_VERSION). + /// For V1: 0 normally, but 1 when the tree contains user-land whiteouts (char + /// devices with rdev=0 that were escaped by the V1 writer). 
This matches C + /// mkcomposefs, which bumps `options->version` from 0 to 1 when it encounters + /// a whiteout in the input tree (before adding the 256 overlay stubs). + composefs_version: u32, + #[cfg(test)] + faults: Option, } trait Output { - fn note_offset(&mut self, offset_type: Offset); - fn get(&self, offset_type: Offset, idx: usize) -> usize; + // --- Recording (first pass only, no-ops in second pass) --- + fn note_header_emitted(&mut self); + fn note_superblock_emitted(&mut self); + fn note_inode(&mut self); + fn note_inodes_end(&mut self); + fn note_xattr(&mut self); + fn note_block(&mut self); + fn note_end(&mut self); + + // --- Retrieval (None in first pass when offsets not yet known, Some in second pass) --- + fn get_inode_offset(&self, idx: usize) -> Option; + fn get_inodes_end(&self) -> Option; + fn get_xattr_offset(&self, idx: usize) -> Option; + fn get_block_offset(&self, idx: usize) -> Option; + fn get_end(&self) -> Option; + + // --- I/O --- fn write(&mut self, data: &[u8]); fn pad(&mut self, alignment: usize); fn len(&self) -> usize; - fn get_div(&self, offset_type: Offset, idx: usize, div: usize) -> usize { - let offset = self.get(offset_type, idx); - assert_eq!(offset % div, 0); - offset / div + /// Write `n` zero bytes. Default implementation avoids heap allocation. + fn write_zeros(&mut self, n: usize) { + const BUF: [u8; 1024] = [0u8; 1024]; + let mut remaining = n; + while remaining > 0 { + let chunk = remaining.min(BUF.len()); + self.write(&BUF[..chunk]); + remaining -= chunk; + } } - fn get_nid(&self, idx: usize) -> u64 { - self.get_div(Offset::Inode, idx, 32) as u64 + // --- Typed write methods: note + write bundled, removing duplication --- + + /// Write the composefs header and pad to 1024 bytes. 
+ fn write_composefs_header(&mut self, hdr: format::ComposefsHeader) { + self.note_header_emitted(); + self.write(hdr.as_bytes()); + self.pad(1024); } - fn get_xattr(&self, idx: usize) -> u32 { - self.get_div(Offset::XAttr, idx, 4).try_into().unwrap() + /// Write the EROFS superblock. + fn write_superblock(&mut self, sb: format::Superblock) { + self.note_superblock_emitted(); + self.write(sb.as_bytes()); } + // --- Derived helpers --- + fn write_struct(&mut self, st: impl IntoBytes + Immutable) { self.write(st.as_bytes()); } + + /// Node ID for inode `idx`, or 0 as a placeholder in the first pass. + fn get_nid(&self, idx: usize) -> u64 { + let Some(offset) = self.get_inode_offset(idx) else { + return 0; + }; + assert_eq!(offset.get() % INODE_SLOT_SIZE, 0); + (offset.get() / INODE_SLOT_SIZE) as u64 + } + + /// Shared xattr reference value (V1 format), or 0 as a placeholder in the first pass. + fn get_xattr_v1(&self, idx: usize) -> u32 { + let (Some(absolute_offset), Some(inodes_end)) = + (self.get_xattr_offset(idx), self.get_inodes_end()) + else { + return 0; + }; + let (absolute_offset, inodes_end) = (absolute_offset.get(), inodes_end.get()); + let offset_within_block = inodes_end % format::BLOCK_SIZE as usize; + let xattr_offset_from_inodes_end = absolute_offset + .checked_sub(inodes_end) + .expect("shared xattr offset must be >= inode table end"); + let raw_ref = (offset_within_block + xattr_offset_from_inodes_end) / XATTR_WORD_SIZE; + raw_ref + .try_into() + .expect("xattr reference index exceeds u32::MAX") + } + + /// Shared xattr reference value (V2 format), or 0 as a placeholder in the first pass. + fn get_xattr_v2(&self, idx: usize) -> u32 { + let Some(offset) = self.get_xattr_offset(idx) else { + return 0; + }; + assert_eq!(offset.get() % XATTR_WORD_SIZE, 0); + (offset.get() / XATTR_WORD_SIZE) + .try_into() + .expect("xattr reference index exceeds u32::MAX") + } + + /// Byte offset of inode `idx`'s block data, or 0 as a placeholder in the first pass. 
+ fn get_block_start(&self, idx: usize) -> usize { + self.get_block_offset(idx).map_or(0, NonZeroUsize::get) + } + + /// Block index of the V1 xattr region, or 0 as a placeholder in the first pass. + fn get_xattr_blkaddr(&self) -> u32 { + self.get_inodes_end() + .map_or(0, |end| (end.get() / format::BLOCK_SIZE as usize) as u32) + } + + /// Total number of blocks in the image, or 0 as a placeholder in the first pass. + fn get_block_count(&self) -> u32 { + self.get_end() + .map_or(0, |end| (end.get() / format::BLOCK_SIZE as usize) as u32) + } } +/// Extended attribute stored in EROFS format. +/// +/// The derived Ord sorts by (prefix, suffix, value) which is used for V2. +/// For V1, use `cmp_by_full_key` which sorts by full key name (prefix string + suffix) +/// to match C mkcomposefs behavior. #[derive(PartialOrd, PartialEq, Eq, Ord, Clone)] struct XAttr { prefix: u8, @@ -64,6 +344,32 @@ struct XAttr { value: Box<[u8]>, } +impl XAttr { + /// Compare by full key name (prefix string + suffix), then by value. + /// This matches C mkcomposefs `cmp_xattr` which uses `strcmp(na->key, nb->key)`. + /// Uses lazy iterator chaining to avoid heap allocation on every comparison. + /// + /// Value tiebreaker uses length-first comparison to match C `xattrs_ht_sort()`, + /// which compares `value_len` before `memcmp`. This differs from Rust's + /// lexicographic `[u8]::cmp` when values have different lengths (e.g. + /// `\x00\x00` vs `\xee`: lexicographic says `\x00\x00 < \xee`, but + /// length-first says `\xee < \x00\x00` because 1 < 2). 
+ fn cmp_by_full_key(&self, other: &Self) -> std::cmp::Ordering { + let self_key = format::XATTR_PREFIXES[self.prefix as usize] + .iter() + .chain(self.suffix.iter()); + let other_key = format::XATTR_PREFIXES[other.prefix as usize] + .iter() + .chain(other.suffix.iter()); + self_key.cmp(other_key).then_with(|| { + self.value + .len() + .cmp(&other.value.len()) + .then_with(|| self.value.cmp(&other.value)) + }) + } +} + #[derive(Clone, Default)] struct InodeXAttrs { shared: Vec, @@ -71,13 +377,37 @@ struct InodeXAttrs { filter: u32, } +/// Index into [`InodeCollector::inodes`]. This is NOT an EROFS nid; the nid is computed +/// from the byte offset of the inode during the second pass via [`Output::get_nid`]. +type InodeIdx = usize; + +/// Reference to an inode in a directory entry. +/// +/// Used in [`DirEnt`] during BFS in [`InodeCollector::collect_tree`]. When a hardlink's +/// canonical occurrence hasn't been BFS-processed yet, the entry is stored as +/// `Deferred(leaf_id)` and resolved to `Known(idx)` in the post-BFS resolution pass. +#[derive(Debug, Clone, Copy)] +enum InodeRef { + Known(InodeIdx), + Deferred(LeafId), +} + #[derive(Debug)] struct DirEnt<'a> { name: &'a [u8], - inode: usize, + inode: InodeRef, file_type: format::FileType, } +/// Metadata returned by `Directory::inode_meta` and `Leaf::inode_meta`, used to fill inode header fields. +struct InodeMeta { + layout: format::DataLayout, + /// The `i_u` field: meaning depends on layout (rdev, chunk format, or block offset / BLOCK_SIZE). + i_u: u32, + size: u64, + nlink: usize, +} + #[derive(Debug, Default)] struct Directory<'a> { blocks: Box<[Box<[DirEnt<'a>]>]>, @@ -102,6 +432,9 @@ struct Inode<'a, ObjectID: FsVerityHashValue> { stat: &'a tree::Stat, xattrs: InodeXAttrs, content: InodeContent<'a, ObjectID>, + /// V1 only: this inode was originally a char device with rdev=0 (overlay whiteout) + /// and has been escaped to a regular file per C mkcomposefs v1.0.8 behavior.
+ escaped_whiteout: bool, } impl XAttr { @@ -113,13 +446,26 @@ impl XAttr { }); output.write(&self.suffix); output.write(&self.value); - output.pad(4); + output.pad(XATTR_WORD_SIZE); } } impl InodeXAttrs { - fn add(&mut self, name: &[u8], value: &[u8]) { + /// Returns the serialized byte size of this xattr block. + fn byte_size(&self, version: format::FormatVersion) -> usize { + let mut counter = FirstPass::default(); + self.write(&mut counter, version); + counter.offset + } + + fn add(&mut self, name: &[u8], value: &[u8], version: format::FormatVersion) { for (idx, prefix) in format::XATTR_PREFIXES.iter().enumerate().rev() { + // V1 compatibility: C mkcomposefs v1.0.8 does not include lustre. (index 5) + // in its prefix table, so lustre.* xattrs use index 0 (raw fallback) in C. + // Skip index 5 for V1 images to match that behavior. + if version.epoch() == FormatEpoch::Epoch1 && idx == 5 { + continue; + } if let Some(suffix) = name.strip_prefix(*prefix) { self.filter |= 1 << (xxh32(suffix, format::XATTR_FILTER_SEED + idx as u32) % 32); self.local.push(XAttr { @@ -133,7 +479,7 @@ impl InodeXAttrs { unreachable!("{:?}", std::str::from_utf8(name)); // worst case: we matched the empty prefix (0) } - fn write(&self, output: &mut impl Output) { + fn write(&self, output: &mut impl Output, version: format::FormatVersion) { if self.filter != 0 { trace!(" write xattrs block"); output.write_struct(format::InodeXAttrHeader { @@ -143,7 +489,11 @@ impl InodeXAttrs { }); for idx in &self.shared { trace!(" shared {} @{}", idx, output.len()); - output.write(&output.get_xattr(*idx).to_le_bytes()); + let xattr_ref = match version.epoch() { + FormatEpoch::Epoch1 => output.get_xattr_v1(*idx), + FormatEpoch::Epoch2 => output.get_xattr_v2(*idx), + }; + output.write(&xattr_ref.to_le_bytes()); } for attr in &self.local { trace!(" local @{}", output.len()); @@ -226,9 +576,13 @@ impl<'a> Directory<'a> { nameofs, output.len() ); + let inode_idx = match entry.inode { + InodeRef::Known(idx) 
=> idx, + InodeRef::Deferred(_) => panic!("all inodes must be resolved before writing"), + }; output.write_struct(format::DirectoryEntryHeader { name_offset: (nameofs as u16).into(), - inode_offset: output.get_nid(entry.inode).into(), + inode_offset: output.get_nid(inode_idx).into(), file_type: entry.file_type.into(), ..Default::default() }); @@ -260,21 +614,65 @@ impl<'a> Directory<'a> { } } - fn inode_meta(&self, block_offset: usize) -> (format::DataLayout, u32, u64, usize) { - let (layout, u) = if self.inline.is_empty() { - (format::DataLayout::FlatPlain, block_offset as u32 / 4096) + fn inode_meta(&self, block_offset: usize) -> InodeMeta { + let blkaddr: u32 = (block_offset / 4096) + .try_into() + .expect("block address exceeds u32::MAX"); + let (layout, i_u) = if self.inline.is_empty() { + (format::DataLayout::FlatPlain, blkaddr) } else if !self.blocks.is_empty() { - (format::DataLayout::FlatInline, block_offset as u32 / 4096) + (format::DataLayout::FlatInline, blkaddr) } else { (format::DataLayout::FlatInline, 0) }; - (layout, u, self.size, self.nlink) + InodeMeta { + layout, + i_u, + size: self.size, + nlink: self.nlink, + } } } +/// Calculates the chunk format bits for an external file based on its size. +/// +/// For EROFS chunk-based inodes, the `u` field contains the chunk format +/// which encodes the chunk size as `chunkbits - BLOCK_BITS`. +/// +/// The algorithm matches the C implementation: +/// 1. Calculate chunkbits = ilog2(size - 1) + 1 +/// 2. Clamp to at least BLOCK_BITS (12) +/// 3. Clamp to at most BLOCK_BITS + 31 (max representable) +/// 4. Return chunkbits - BLOCK_BITS +fn compute_chunk_format(file_size: u64) -> u32 { + const BLOCK_BITS: u32 = format::BLOCK_BITS as u32; + const CHUNK_FORMAT_BLKBITS_MASK: u32 = 0x001F; // 31 + + // Compute the chunkbits to use for the file size. + // We want as few chunks as possible, but not an unnecessarily large chunk. 
+ let mut chunkbits = if file_size > 1 { + // ilog2(file_size - 1) + 1 + 64 - (file_size - 1).leading_zeros() + } else { + 1 + }; + + // At least one logical block + if chunkbits < BLOCK_BITS { + chunkbits = BLOCK_BITS; + } + + // Not larger chunks than max possible + if chunkbits - BLOCK_BITS > CHUNK_FORMAT_BLKBITS_MASK { + chunkbits = CHUNK_FORMAT_BLKBITS_MASK + BLOCK_BITS; + } + + chunkbits - BLOCK_BITS +} + impl Leaf<'_, ObjectID> { - fn inode_meta(&self) -> (format::DataLayout, u32, u64, usize) { - let (layout, u, size) = match &self.content { + fn inode_meta(&self, version: format::FormatVersion) -> InodeMeta { + let (layout, i_u, size) = match &self.content { tree::LeafContent::Regular(tree::RegularFile::Inline(data)) => { if data.is_empty() { (format::DataLayout::FlatPlain, 0, data.len() as u64) @@ -283,10 +681,19 @@ impl Leaf<'_, ObjectID> { } } tree::LeafContent::Regular(tree::RegularFile::External(.., size)) => { - (format::DataLayout::ChunkBased, 31, *size) + // V1: compute chunk format from file size + // V2: hardcode 31 (origin/main behavior) + let chunk_format = match version.epoch() { + FormatEpoch::Epoch1 => compute_chunk_format(*size), + FormatEpoch::Epoch2 => 31, + }; + (format::DataLayout::ChunkBased, chunk_format, *size) } tree::LeafContent::CharacterDevice(rdev) | tree::LeafContent::BlockDevice(rdev) => { - (format::DataLayout::FlatPlain, *rdev as u32, 0) + let rdev32: u32 = (*rdev) + .try_into() + .expect("device number exceeds EROFS u32 limit"); + (format::DataLayout::FlatPlain, rdev32, 0) } tree::LeafContent::Fifo | tree::LeafContent::Socket => { (format::DataLayout::FlatPlain, 0, 0) @@ -301,7 +708,12 @@ impl Leaf<'_, ObjectID> { (format::DataLayout::FlatInline, 0, target.len() as u64) } }; - (layout, u, size, self.nlink) + InodeMeta { + layout, + i_u, + size, + nlink: self.nlink, + } } fn write_inline(&self, output: &mut impl Output) { @@ -316,6 +728,11 @@ impl Leaf<'_, ObjectID> { impl Inode<'_, ObjectID> { fn file_type(&self) -> 
format::FileType { + // V1 whiteout escaping: char device (rdev=0) entries are written as regular files + // to match C mkcomposefs v1.0.8 behavior. + if self.escaped_whiteout { + return format::FileType::RegularFile; + } match &self.content { InodeContent::Directory(..) => format::FileType::Directory, InodeContent::Leaf(leaf) => match &leaf.content { @@ -329,84 +746,247 @@ impl Inode<'_, ObjectID> { } } - fn write_inode(&self, output: &mut impl Output, idx: usize) { - let (layout, u, size, nlink) = match &self.content { - InodeContent::Directory(dir) => dir.inode_meta(output.get(Offset::Block, idx)), - InodeContent::Leaf(leaf) => leaf.inode_meta(), + /// Check if this inode can use compact format (32 bytes instead of 64). + /// + /// Compact format is used when: + /// - mtime matches min_mtime (stored in superblock build_time) + /// - nlink, uid, gid fit in u16 + /// - size fits in u32 + fn fits_in_compact(&self, min_mtime: (u64, u32), size: u64, nlink: usize) -> bool { + // mtime (both sec and nsec) must match the minimum (which will be stored in superblock + // build_time / build_time_nsec). The C implementation requires both to match. + if self.stat.st_mtim_sec as u64 != min_mtime.0 { + return false; + } + if self.stat.st_mtim_nsec != min_mtime.1 { + return false; + } + + // nlink must fit in u16 + if nlink > u16::MAX as usize { + return false; + } + + // uid and gid must fit in u16 + if self.stat.st_uid > u16::MAX as u32 || self.stat.st_gid > u16::MAX as u32 { + return false; + } + + // size must fit in u32 + if size > u32::MAX as u64 { + return false; + } + + true + } + + /// Handle inline tail padding for V1 format. + /// + /// Port of C mkcomposefs `compute_erofs_inode_padding_for_tail()`. + /// + /// Two branches based on file type: + /// - Symlinks: pad the *inode start* to a block boundary whenever the inode + xattrs + + /// symlink target would cross into a new block (prevents EFSCORRUPTED on old kernels). 
+ /// - All other FlatInline types (dirs, inline files): pad the *tail* only if it would + /// cross into yet another block after inline_start. + fn pad_inline_tail_v1( + &self, + output: &mut impl Output, + inode_and_xattr_size: u64, + size: u64, + #[cfg(test)] ctx: &mut WriteContext, + #[cfg(not(test))] _ctx: &mut WriteContext, + ) { + let block_size = u64::from(format::BLOCK_SIZE); + let current_pos: u64 = output.len().try_into().unwrap(); + let inline_size = size % block_size; + + if matches!(self.file_type(), format::FileType::Symlink) { + // Symlink branch: pad *inode start* to a block boundary when + // inode + xattrs + symlink target would cross into a new block. + // Matches C: pos_block != end_block. + // + // Old kernels (< 6.12) return EFSCORRUPTED from erofs_fill_symlink() + // when (inode_offset % block_size) + inode_and_xattr_size + inline_size + // > block_size. Padding the inode start to a block boundary prevents + // this because then inode_offset % block_size == 0. + #[cfg(test)] + let skip_pad = ctx + .faults + .as_mut() + .map(|f| f.should_skip_symlink_pad()) + .unwrap_or(false); + #[cfg(not(test))] + let skip_pad = false; + + if !skip_pad { + let total_size = inode_and_xattr_size + inline_size; + // Does [current_pos, current_pos+total_size) cross a block boundary? + // block_offset tells us how far into the current block we are; + // if adding total_size exceeds block_size, we spill into the next block. + if block_offset(current_pos) + total_size > block_size { + // Align inode start to the next block boundary so the inode + // doesn't straddle a block (prevents EUCLEAN on old kernels). + // block_size (4096) is divisible by 32 (EROFS slot size), + // so slot alignment is preserved after this padding. + // None means current_pos is already block-aligned; no padding needed. 
+ if let Some(pad_size) = bytes_to_block_boundary(current_pos) { + output.write_zeros(pad_size as usize); + } + } + } + } else { + // Non-symlink branch (dirs, inline files): pad the *tail* to fit + // within the block that inline_start lands in. + // Matches C: block_remainder < inline_size, pad = block_remainder + // rounded up to the next 32-byte slot boundary. + let inline_start = current_pos + .checked_add(inode_and_xattr_size) + .expect("image position + inode header size cannot overflow u64"); + // If inline_start is block-aligned, block_remainder would be BLOCK_SIZE which + // always exceeds inline_size (< BLOCK_SIZE), so no padding — None is correct. + if let Some(block_remainder) = bytes_to_block_boundary(inline_start) + && block_remainder < inline_size + { + let pad_size = (block_remainder.div_ceil(INODE_SLOT_SIZE as u64) + * INODE_SLOT_SIZE as u64) as usize; + output.write_zeros(pad_size); + } + } + } + + /// Handle inline tail padding for V2 format (origin/main algorithm). + fn pad_inline_tail_v2(&self, output: &mut impl Output, inode_and_xattr_size: u64, size: u64) { + let block_size = u64::from(format::BLOCK_SIZE); + let inline_start: u64 = output.len().try_into().unwrap(); + let inline_start = inline_start + .checked_add(inode_and_xattr_size) + .expect("image position + inode header size cannot overflow u64"); + // Restore origin/main logic: end_of_metadata is the last byte of the metadata, + // inline_end is the last byte of the inline data. If they land in different + // blocks we must pad so the inline data starts at a fresh block boundary. 
+ let end_of_metadata = inline_start - 1; + let inline_end = inline_start + (size % block_size); + if end_of_metadata / block_size != inline_end / block_size { + let pad_size = (block_size - end_of_metadata % block_size) as usize; + output.write_zeros(pad_size); + output.pad(INODE_SLOT_SIZE); + } + } + + fn write_inode(&self, output: &mut impl Output, idx: usize, ctx: &mut WriteContext) { + let version = ctx.version; + let min_mtime = ctx.min_mtime; + let meta = match &self.content { + InodeContent::Directory(dir) => dir.inode_meta(output.get_block_start(idx)), + InodeContent::Leaf(leaf) => leaf.inode_meta(version), }; + let InodeMeta { + layout, + i_u: u, + size, + nlink, + } = meta; + + let xattr_size = self.xattrs.byte_size(version); - let xattr_size = { - let mut xattr = FirstPass::default(); - self.xattrs.write(&mut xattr); - xattr.offset + // V1: compact inodes when possible; V2: always extended + let use_compact = + version.epoch() == FormatEpoch::Epoch1 && self.fits_in_compact(min_mtime, size, nlink); + + let inode_header_size = if use_compact { + size_of::() + } else { + size_of::() }; // We need to make sure the inline part doesn't overlap a block boundary - output.pad(32); + output.pad(INODE_SLOT_SIZE); if matches!(layout, format::DataLayout::FlatInline) { - let block_size = u64::from(format::BLOCK_SIZE); - let inode_and_xattr_size: u64 = (size_of::() + xattr_size) - .try_into() - .unwrap(); - let inline_start: u64 = output.len().try_into().unwrap(); - let inline_start = inline_start + inode_and_xattr_size; - let end_of_metadata = inline_start - 1; - let inline_end = inline_start + (size % block_size); - if end_of_metadata / block_size != inline_end / block_size { - // If we proceed, then we'll violate the rule about crossing block boundaries. - // The easiest thing to do is to add padding so that the inline data starts close - // to the start of a fresh block boundary, while ensuring inode alignment. 
- // pad_size is always < block_size (4096), so fits in usize - let pad_size = (block_size - end_of_metadata % block_size) as usize; - let pad = vec![0; pad_size]; - trace!("added pad {}", pad.len()); - output.write(&pad); - output.pad(32); + let inode_and_xattr_size: u64 = (inode_header_size + xattr_size).try_into().unwrap(); + + match version.epoch() { + FormatEpoch::Epoch1 => { + self.pad_inline_tail_v1(output, inode_and_xattr_size, size, ctx); + } + FormatEpoch::Epoch2 => { + self.pad_inline_tail_v2(output, inode_and_xattr_size, size); + } } } - let format = format::InodeLayout::Extended | layout; + let xattr_icount: u16 = match xattr_size { + 0 => 0, + n => { + let word_count = n + .checked_sub(INODE_XATTR_HEADER_SIZE) + .expect("non-empty xattr block must be >= header size") + / XATTR_WORD_SIZE; + (1 + word_count) as u16 + } + }; - trace!( - "write inode {idx} nid {} {:?} {:?} xattrsize{xattr_size} icount{} inline{} @{}", - output.len() / 32, - format, - self.file_type(), - match xattr_size { - 0 => 0, - n => (1 + (n - 12) / 4) as u16, - }, - size % 4096, - output.len() - ); + output.note_inode(); + + if use_compact { + let format = format::InodeLayout::Compact | layout; + + // V1: use sequential ino + let ino = idx as u32; + + output.write_struct(format::CompactInodeHeader { + format, + xattr_icount: xattr_icount.into(), + mode: self.file_type() | self.stat.st_mode, + nlink: (nlink as u16).into(), + size: (size as u32).into(), + reserved: 0.into(), + u: u.into(), + ino: ino.into(), + uid: (self.stat.st_uid as u16).into(), + gid: (self.stat.st_gid as u16).into(), + reserved2: [0; 4], + }); + } else { + let format = format::InodeLayout::Extended | layout; - output.note_offset(Offset::Inode); - output.write_struct(format::ExtendedInodeHeader { - format, - xattr_icount: match xattr_size { - 0 => 0, - n => (1 + (n - 12) / 4) as u16, - } - .into(), - mode: self.file_type() | self.stat.st_mode, - size: size.into(), - u: u.into(), - ino: ((output.len() / 32) as 
u32).into(), - uid: self.stat.st_uid.into(), - gid: self.stat.st_gid.into(), - mtime: (self.stat.st_mtim_sec as u64).into(), - nlink: (nlink as u32).into(), - ..Default::default() - }); + // V1 uses the BFS index as i_ino (matching C mkcomposefs behaviour). + // V2 uses the NID (byte offset / INODE_SLOT_SIZE) for 32-bit stat compatibility. + let ino = match version.epoch() { + FormatEpoch::Epoch1 => idx as u32, + FormatEpoch::Epoch2 => (output.len() / INODE_SLOT_SIZE) as u32, + }; - self.xattrs.write(output); + // Epoch2 does not store sub-second mtime precision (mtime_nsec=0). + // Epoch1 (V0/V1) preserves full nanosecond precision. + let mtime_nsec: u32 = match version.epoch() { + FormatEpoch::Epoch1 => self.stat.st_mtim_nsec, + FormatEpoch::Epoch2 => 0, + }; + output.write_struct(format::ExtendedInodeHeader { + format, + xattr_icount: xattr_icount.into(), + mode: self.file_type() | self.stat.st_mode, + size: size.into(), + u: u.into(), + ino: ino.into(), + uid: self.stat.st_uid.into(), + gid: self.stat.st_gid.into(), + mtime: (self.stat.st_mtim_sec as u64).into(), + mtime_nsec: mtime_nsec.into(), + nlink: (nlink as u32).into(), + ..Default::default() + }); + } + + self.xattrs.write(output, version); match &self.content { InodeContent::Directory(dir) => dir.write_inline(output), InodeContent::Leaf(leaf) => leaf.write_inline(output), }; - output.pad(32); + output.pad(INODE_SLOT_SIZE); } fn write_blocks(&self, output: &mut impl Output) { @@ -418,13 +998,18 @@ impl Inode<'_, ObjectID> { struct InodeCollector<'a, ObjectID: FsVerityHashValue> { inodes: Vec>, - hardlinks: HashMap, + hardlinks: HashMap, fs: &'a tree::FileSystem, - nlink_map: &'a [u32], + nlink_map: Vec, + version: format::FormatVersion, } impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { - fn push_inode(&mut self, stat: &'a tree::Stat, content: InodeContent<'a, ObjectID>) -> usize { + fn push_inode( + &mut self, + stat: &'a tree::Stat, + content: InodeContent<'a, ObjectID>, + ) -> 
InodeIdx { let mut xattrs = InodeXAttrs::default(); // We need to record extra xattrs for some files. These come first. @@ -434,23 +1019,28 @@ impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { }) = content { xattrs.add( - b"trusted.overlay.metacopy", + format::XATTR_OVERLAY_METACOPY, OverlayMetacopy::new(id).as_bytes(), + self.version, ); let redirect = format!("/{}", id.to_object_pathname()); - xattrs.add(b"trusted.overlay.redirect", redirect.as_bytes()); + xattrs.add( + format::XATTR_OVERLAY_REDIRECT, + redirect.as_bytes(), + self.version, + ); } // Add the normal xattrs. They're already listed in sorted order. for (name, value) in stat.xattrs.iter() { let name = name.as_bytes(); - if let Some(escapee) = name.strip_prefix(b"trusted.overlay.") { - let escaped = [b"trusted.overlay.overlay.", escapee].concat(); - xattrs.add(&escaped, value); + if let Some(escapee) = name.strip_prefix(format::XATTR_OVERLAY_PREFIX) { + let escaped = [format::XATTR_OVERLAY_ESCAPED_PREFIX, escapee].concat(); + xattrs.add(&escaped, value, self.version); } else { - xattrs.add(name, value); + xattrs.add(name, value, self.version); } } @@ -461,11 +1051,12 @@ impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { stat, xattrs, content, + escaped_whiteout: false, }); inode } - fn collect_leaf(&mut self, leaf_id: LeafId) -> usize { + fn collect_leaf(&mut self, leaf_id: LeafId) -> InodeIdx { let nlink = self.nlink_map[leaf_id.0] as usize; if nlink > 1 @@ -475,6 +1066,14 @@ impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { } let leaf = self.fs.leaf(leaf_id); + + // Hardlinked whiteouts are semantically invalid: a whiteout represents the + // absence of a file in an overlay, so having nlink > 1 is meaningless. + // ValidatedFileSystem guarantees this invariant was checked at construction time. 
+ debug_assert!( + !(matches!(leaf.content, tree::LeafContent::CharacterDevice(0)) && nlink > 1), + "ValidatedFileSystem guarantees whiteout nlink == 1" + ); let inode = self.push_inode( &leaf.stat, InodeContent::Leaf(Leaf { @@ -490,27 +1089,24 @@ impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { inode } - fn insert_sorted( - entries: &mut Vec>, - name: &'a [u8], - inode: usize, - file_type: format::FileType, - ) { - let entry = DirEnt { - name, - inode, - file_type, - }; - let point = entries.partition_point(|e| e.name < entry.name); - entries.insert(point, entry); - } - - fn collect_dir(&mut self, dir: &'a tree::Directory, parent: usize) -> usize { + /// Collect inodes using depth-first traversal (V2 / origin/main behavior). + fn collect_dir(&mut self, dir: &'a tree::Directory, parent: InodeIdx) -> InodeIdx { // The root inode number needs to fit in a u16. That more or less compels us to write the // directory inode before the inode of the children of the directory. Reserve a slot. 
let me = self.push_inode(&dir.stat, InodeContent::Directory(Directory::default())); - let mut entries = vec![]; + let mut entries = vec![ + DirEnt { + name: b".", + inode: InodeRef::Known(me), + file_type: format::FileType::Directory, + }, + DirEnt { + name: b"..", + inode: InodeRef::Known(parent), + file_type: format::FileType::Directory, + }, + ]; for (name, inode) in dir.sorted_entries() { let child = match inode { @@ -519,34 +1115,359 @@ impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { }; entries.push(DirEnt { name: name.as_bytes(), - inode: child, + inode: InodeRef::Known(child), file_type: self.inodes[child].file_type(), }); } - // We're expected to add those, too - Self::insert_sorted(&mut entries, b".", me, format::FileType::Directory); - Self::insert_sorted(&mut entries, b"..", parent, format::FileType::Directory); + entries.sort_unstable_by_key(|e| e.name); // Now that we know the actual content, we can write it to our reserved slot self.inodes[me].content = InodeContent::Directory(Directory::from_entries(entries)); me } + /// Returns true if this leaf entry is an overlay whiteout stub generated internally + /// by `add_overlay_whiteouts()`, as opposed to a user-provided whiteout. These stubs + /// must NOT be escaped during V1 whiteout processing. 
+ fn is_overlay_whiteout_stub( + &self, + name: &[u8], + leaf_id: LeafId, + me: InodeIdx, + root_inode: InodeIdx, + ) -> bool { + let root_stat = &self.fs.root.stat; + let leaf_stat = &self.fs.leaf(leaf_id).stat; + let selinux_key = std::ffi::OsStr::new("security.selinux"); + let expected_xattrs = if root_stat.xattrs.contains_key(selinux_key) { + 1 + } else { + 0 + }; + let has_correct_xattrs = leaf_stat.xattrs.len() == expected_xattrs + && (expected_xattrs == 0 + || leaf_stat.xattrs.get(selinux_key) == root_stat.xattrs.get(selinux_key)); + + me == root_inode + && name.len() == 2 + && name + .iter() + .all(|b| b.is_ascii_digit() || matches!(b, b'a'..=b'f')) + && leaf_stat.st_mode == 0o644 + && leaf_stat.st_uid == root_stat.st_uid + && leaf_stat.st_gid == root_stat.st_gid + && leaf_stat.st_mtim_sec == root_stat.st_mtim_sec + && leaf_stat.st_mtim_nsec == root_stat.st_mtim_nsec + && has_correct_xattrs + } + + /// Returns true if a leaf content is a V1 overlay whiteout (char device, rdev=0). + fn is_v1_whiteout(content: &tree::LeafContent) -> bool { + matches!(content, tree::LeafContent::CharacterDevice(0)) + } + + /// Collect all inodes using queue-based breadth-first traversal (V1). + /// + /// This algorithm matches the C mkcomposefs `lcfs_compute_tree()` function which uses + /// a linked-list queue to process directories. All nodes at depth N are assigned inode + /// numbers before any nodes at depth N+1. + /// + /// For V1, char device entries with rdev=0 (overlay whiteouts) are escaped to regular + /// files matching C mkcomposefs v1.0.8 `add_overlayfs_xattrs()` behavior: + /// - Child entry: converted to regular file + gets `trusted.overlay.overlay.whiteout=""` + /// and `user.overlay.whiteout=""` xattrs. + /// - Parent directory: gets `trusted.overlay.overlay.whiteouts=""`, + /// `user.overlay.whiteouts=""`, `trusted.overlay.overlay.opaque=x`, + /// `user.overlay.opaque=x` xattrs (added at most once per directory). 
+ fn collect_tree(&mut self, root: &'a tree::Directory) { + use std::collections::VecDeque; + + // Pre-pass: for each multi-link leaf, find which directory holds the canonical + // (first DFS sorted-order) occurrence. + // + // In C mkcomposefs, when a dumpfile is parsed, the first occurrence of each + // inode (same content / nlink target) is the "original" and subsequent occurrences + // are "hardlinks" (with link_to pointer). During BFS, hardlinks are SKIPPED — only + // originals get inode numbers. Hardlink directory entries use the original's nid. + // + // The dumpfile is written in DFS sorted order (see write_dumpfile). So the canonical + // occurrence is whichever path appears first in that DFS traversal. + // + // We replicate this: when BFS encounters a non-canonical occurrence of a multi-link + // leaf (its canonical directory doesn't match the current directory), we defer the + // nid assignment until the canonical occurrence is processed. + // + // KEY: we record the DIRECTORY POINTER of the canonical occurrence, not just the + // leaf_id, because two occurrences of the same leaf share the same leaf_id — we + // need the directory pointer to distinguish canonical from non-canonical at BFS time. + let canonical_dirs = Self::find_canonical_dirs(root, &self.nlink_map); + + let root_inode = self.push_inode(&root.stat, InodeContent::Directory(Directory::default())); + let mut queue: VecDeque<(&'a tree::Directory, InodeIdx, InodeIdx)> = + VecDeque::new(); + queue.push_back((root, root_inode, root_inode)); + + // dir_entries: accumulates (me, parent, entries) for each directory processed in BFS order. + // Leaf entries whose canonical occurrence hasn't been BFS-processed yet are stored as + // InodeRef::Deferred(leaf_id) and resolved in a single post-BFS pass once all canonical + // inodes have been assigned. 
+ let mut dir_entries: Vec<(InodeIdx, InodeIdx, Vec>)> = vec![]; // (me, parent, entries) + + while let Some((dir, parent, me)) = queue.pop_front() { + let mut entries = vec![ + DirEnt { + name: b".", + inode: InodeRef::Known(me), + file_type: format::FileType::Directory, + }, + DirEnt { + name: b"..", + inode: InodeRef::Known(parent), + file_type: format::FileType::Directory, + }, + ]; + let mut dir_has_whiteout = false; + + for (name, inode) in dir.sorted_entries() { + match inode { + tree::Inode::Directory(subdir) => { + let child = self.push_inode( + &subdir.stat, + InodeContent::Directory(Directory::default()), + ); + queue.push_back((subdir, me, child)); + entries.push(DirEnt { + name: name.as_bytes(), + inode: InodeRef::Known(child), + file_type: format::FileType::Directory, + }); + } + tree::Inode::Leaf(leaf_id, _) => { + // V1 whiteout escaping: char device with rdev=0 → regular file. + // Matches C mkcomposefs v1.0.8 `rewrite_tree_node_for_erofs()`, which + // escapes user-provided char devices. + // + // IMPORTANT: the 256 stubs added by add_overlay_whiteouts() are NOT + // escaped in C — they are added AFTER `rewrite_tree_node_for_erofs()` + // so they never go through escaping. We skip them by detecting root-level + // 2-char hex entries (the names used by add_overlay_whiteouts()) THAT ALSO + // exactly match the metadata applied by add_overlay_whiteouts(). This + // correctly distinguishes them from user-provided whiteouts that happen + // to have a 2-char hex name. + let name_bytes = name.as_bytes(); + let is_stub = + self.is_overlay_whiteout_stub(name_bytes, *leaf_id, me, root_inode); + + // Determine if this occurrence is canonical (first in DFS order). + // + // For multi-link leaves (nlink > 1), the canonical occurrence is the + // one in the directory recorded by find_canonical_dirs(). We compare + // the current directory pointer to identify it precisely. 
+ // + // For single-link leaves (nlink = 1), there is only one occurrence, + // so it is always canonical (no entry in canonical_dirs). + let nlink = self.nlink_map[leaf_id.0]; + let is_canonical = if nlink > 1 { + // Multi-link: canonical iff this is the recorded canonical directory. + // We use pointer identity (std::ptr::eq) to match the current + // directory reference against the one recorded during the DFS + // pre-pass. The pointers are stable borrows from the tree, which + // outlives this entire function. + canonical_dirs + .get(leaf_id) + .is_some_and(|&p| std::ptr::eq(p, dir)) + } else { + // Single-link: always canonical + true + }; + + let child_ref = if is_canonical { + // Canonical occurrence: create nid now. + InodeRef::Known(self.collect_leaf(*leaf_id)) + } else if let Some(&nid) = self.hardlinks.get(leaf_id) { + // Non-canonical, and the canonical has already been processed. + InodeRef::Known(nid) + } else { + // Non-canonical, and canonical hasn't been assigned a nid yet + // (canonical is in a deeper directory, not yet BFS-processed). + // Store as Deferred; resolved in the post-BFS pass. + InodeRef::Deferred(*leaf_id) + }; + + // Apply whiteout escaping on the first canonical occurrence only. + // + // `is_canonical` is true for any entry whose directory pointer matches + // the canonical directory, so if a whiteout leaf and its hardlink both + // live in the same directory, both appear "canonical" by that check. + // We guard with `!escaped_whiteout` to ensure the xattrs are added + // exactly once — on the very first encounter of the inode. 
+ if is_canonical + && matches!(child_ref, InodeRef::Known(_)) + && self.version.epoch() == FormatEpoch::Epoch1 + && !is_stub + && Self::is_v1_whiteout(&self.fs.leaf(*leaf_id).content) + { + let InodeRef::Known(child) = child_ref else { + unreachable!() + }; + if !self.inodes[child].escaped_whiteout { + self.inodes[child].escaped_whiteout = true; + // Add per-entry whiteout xattrs (already-escaped names): + // C adds OVERLAY_XATTR_ESCAPED_WHITEOUT and OVERLAY_XATTR_USERXATTR_WHITEOUT. + self.inodes[child].xattrs.add( + format::XATTR_OVERLAY_WHITEOUT, + b"", + self.version, + ); + self.inodes[child].xattrs.add( + format::XATTR_USERXATTR_WHITEOUT, + b"", + self.version, + ); + dir_has_whiteout = true; + } + } + + // file_type for the dir entry: for Deferred entries, use a placeholder; + // it will be corrected in the post-BFS resolution pass. + let file_type = if let InodeRef::Known(child) = child_ref { + // file_type() already returns RegularFile when escaped_whiteout=true + self.inodes[child].file_type() + } else { + // Deferred; file_type will be updated in the resolution pass + format::FileType::RegularFile + }; + + entries.push(DirEnt { + name: name.as_bytes(), + inode: child_ref, + file_type, + }); + } + } + } + + // V1: if this directory had whiteout children, add parent xattrs. + // C adds these once per directory, on first whiteout child found. + // Matches OVERLAY_XATTR_ESCAPED_WHITEOUTS, OVERLAY_XATTR_USERXATTR_WHITEOUTS, + // OVERLAY_XATTR_ESCAPED_OPAQUE (=x), OVERLAY_XATTR_USERXATTR_OPAQUE (=x). 
+ if self.version.epoch() == FormatEpoch::Epoch1 && dir_has_whiteout { + self.inodes[me] + .xattrs + .add(format::XATTR_OVERLAY_WHITEOUTS, b"", self.version); + self.inodes[me] + .xattrs + .add(format::XATTR_USERXATTR_WHITEOUTS, b"", self.version); + self.inodes[me] + .xattrs + .add(format::XATTR_OVERLAY_OPAQUE, b"x", self.version); + self.inodes[me] + .xattrs + .add(format::XATTR_USERXATTR_OPAQUE, b"x", self.version); + } + + entries.sort_unstable_by_key(|e| e.name); + + dir_entries.push((me, parent, entries)); + } + + // Post-BFS: resolve all Deferred entries. + // At this point all canonical leaves have been assigned nids and are in self.hardlinks. + for (_me, _parent, entries) in &mut dir_entries { + for entry in entries.iter_mut() { + if let InodeRef::Deferred(leaf_id) = entry.inode { + let nid = *self + .hardlinks + .get(&leaf_id) + .expect("canonical leaf must have been assigned a nid during BFS"); + entry.inode = InodeRef::Known(nid); + entry.file_type = self.inodes[nid].file_type(); + } + } + } + + // Build directory content for each directory inode. + for (me, _parent, entries) in dir_entries { + self.inodes[me].content = InodeContent::Directory(Directory::from_entries(entries)); + } + } + + /// DFS pre-pass: find which directory contains the canonical occurrence of each + /// multi-link leaf (first encounter in DFS sorted order). + /// + /// C mkcomposefs parses dumpfiles in DFS sorted order. The first occurrence of each + /// leaf (by `LeafId`) is the "original"; subsequent occurrences are "hardlinks". + /// Only originals get inode numbers in BFS; hardlinks reuse the original's nid. + /// + /// The dumpfile writer (`write_dumpfile`) uses DFS sorted traversal, so we replicate + /// the same traversal here to determine canonical occurrences. + /// + /// Note: we cannot simplify to "first BFS encounter wins" because DFS and BFS visit + /// directories at different depths in different order (e.g. 
DFS visits `/a/deep/` + /// before `/b/`, while BFS visits `/b/` first). Changing the canonical ordering + /// would break binary compatibility with C mkcomposefs. + /// + /// Returns a `HashMap` mapping each multi-link leaf to the + /// directory pointer where its canonical (first DFS) occurrence lives. + /// Single-link leaves are NOT in the map (they're trivially canonical anywhere). + /// + /// We use raw pointers for directory identity comparison (`std::ptr::eq`) rather + /// than dereferencing. The pointers are stable `&'a` borrows from the tree which + /// outlives the entire `collect_tree` call. + fn find_canonical_dirs( + root: &'a tree::Directory, + nlink_map: &[u32], + ) -> HashMap> { + let mut seen: HashSet = HashSet::new(); + let mut canonical_dirs: HashMap> = HashMap::new(); + Self::dfs_find_canonical(root, nlink_map, &mut seen, &mut canonical_dirs); + canonical_dirs + } + + fn dfs_find_canonical( + dir: &'a tree::Directory, + nlink_map: &[u32], + seen: &mut HashSet, + canonical_dirs: &mut HashMap>, + ) { + let dir_ptr: *const tree::Directory = dir; + for (_, inode) in dir.sorted_entries() { + match inode { + tree::Inode::Directory(subdir) => { + Self::dfs_find_canonical(subdir, nlink_map, seen, canonical_dirs); + } + tree::Inode::Leaf(leaf_id, _) => { + if nlink_map[leaf_id.0] > 1 && seen.insert(*leaf_id) { + // First DFS encounter → canonical occurrence is in this directory. + canonical_dirs.insert(*leaf_id, dir_ptr); + // Second+ encounter → non-canonical (hardlink), dir not recorded + } + // Single-link leaves are always canonical; no need to record them + } + } + } + } + pub fn collect( fs: &'a tree::FileSystem, - nlink_map: &'a [u32], + version: format::FormatVersion, ) -> Vec> { let mut this = Self { inodes: vec![], hardlinks: HashMap::new(), fs, - nlink_map, + nlink_map: fs.nlinks(), + version, }; - // '..' 
of the root directory is the root directory again - let root_inode = this.collect_dir(&fs.root, 0); - assert_eq!(root_inode, 0); + match version.epoch() { + FormatEpoch::Epoch1 => this.collect_tree(&fs.root), + FormatEpoch::Epoch2 => { + let root_inode = this.collect_dir(&fs.root, 0); + assert_eq!(root_inode, 0); + } + } this.inodes } @@ -554,9 +1475,23 @@ impl<'a, ObjectID: FsVerityHashValue> InodeCollector<'a, ObjectID> { /// Takes a list of inodes where each inode contains only local xattr values, determines which /// xattrs (key, value) pairs appear more than once, and shares them. -fn share_xattrs(inodes: &mut [Inode]) -> Vec { +/// +/// For V1: sorts locals by full key, reverses shared table, uses InodesEnd-relative xattr offsets. +/// For V2: uses natural BTreeMap order (derived Ord), ascending shared table. +fn share_xattrs( + inodes: &mut [Inode], + version: format::FormatVersion, +) -> Vec { let mut xattrs: BTreeMap = BTreeMap::new(); + // V1: sort local xattrs by full key to match C behavior + // V2: don't sort (insertion order is fine, BTreeMap handles shared ordering) + if version.epoch() == FormatEpoch::Epoch1 { + for inode in inodes.iter_mut() { + inode.xattrs.local.sort_by(|a, b| a.cmp_by_full_key(b)); + } + } + // Collect all xattrs from the inodes for inode in inodes.iter() { for attr in &inode.xattrs.local { @@ -571,88 +1506,155 @@ fn share_xattrs(inodes: &mut [Inode]) -> Vec { // Share only xattrs with more than one user xattrs.retain(|_k, v| *v > 1); - // Repurpose the refcount field as an index lookup - for (idx, value) in xattrs.values_mut().enumerate() { - *value = idx; - } + let (xattrs, shared): (BTreeMap, Vec) = match version.epoch() { + FormatEpoch::Epoch1 => { + // C mkcomposefs sorts shared xattrs by full key string (strcmp), then writes + // them in DESCENDING order in the shared xattr block. 
Our BTreeMap is ordered + // by (prefix_index, suffix, value) which differs from strcmp order when prefix + // indices don't sort the same way as prefix strings (e.g. "security."=6 sorts + // numerically after "trusted."=4, but 'security.' < 'trusted.' lexicographically). + // Collect into a Vec, sort by full key ascending, then reverse = descending. + let mut sorted: Vec<_> = xattrs.into_iter().collect(); + sorted.sort_by(|(a, _), (b, _)| a.cmp_by_full_key(b)); + let n_shared = sorted.len(); + // Assign indices in descending order while iterating ascending: index = n-1-i. + // After reversal, sorted[0] (ascending-smallest) ends up last on disk. + // Each index therefore equals the entry's position in the reversed output: the + // entry written FIRST (largest key) gets 0, the one written LAST gets n-1. + // Reconstruct a map for the lookup phase below. + let xattrs_map: BTreeMap = sorted + .iter() + .enumerate() + .map(|(i, (k, _))| (k.clone(), n_shared - 1 - i)) + .collect(); + + // Return in descending full-key order (last in ascending = first written) + let mut out = sorted; + out.reverse(); + let shared_vec = out.into_iter().map(|(k, _)| k).collect(); + (xattrs_map, shared_vec) + } + FormatEpoch::Epoch2 => { + // Ascending order: sequential index assignment + for (idx, value) in xattrs.values_mut().enumerate() { + *value = idx; + } - // Visit each inode and change local xattrs into shared xattrs + // Return in ascending order (natural BTreeMap order) + let shared_vec = xattrs.keys().cloned().collect(); + (xattrs, shared_vec) + } + }; + + // Visit each inode and promote xattrs that are in the shared table. + // This is the same for both V1 and V2: remove from local, push index to shared.
for inode in inodes.iter_mut() { inode.xattrs.local.retain(|attr| { if let Some(idx) = xattrs.get(attr) { inode.xattrs.shared.push(*idx); - false // drop the local xattr: we converted it + false } else { - true // retain the local xattr: we didn't convert it + true } }); } - // Return the shared xattrs as a vec - xattrs.into_keys().collect() + shared } fn write_erofs( output: &mut impl Output, inodes: &[Inode], xattrs: &[XAttr], + ctx: &mut WriteContext, ) { - // Write composefs header - output.note_offset(Offset::Header); - output.write_struct(format::ComposefsHeader { + let version = ctx.version; + let min_mtime = ctx.min_mtime; + let header_flags = ctx.header_flags; + let composefs_version: u32 = ctx.composefs_version; + // Epoch1 (V0/V1) uses minimum mtime for reproducibility; Epoch2 (V2) uses 0. + let (build_time, build_time_nsec) = match version.epoch() { + FormatEpoch::Epoch1 => min_mtime, + FormatEpoch::Epoch2 => (0, 0), + }; + + // Write composefs header (pads to 1024 bytes internally) + output.write_composefs_header(format::ComposefsHeader { magic: format::COMPOSEFS_MAGIC, version: format::VERSION, - flags: 0.into(), - composefs_version: format::COMPOSEFS_VERSION, + flags: header_flags.into(), + composefs_version: composefs_version.into(), ..Default::default() }); - output.pad(1024); - // Write superblock - output.note_offset(Offset::Superblock); - output.write_struct(format::Superblock { + // Epoch1 sets xattr_blkaddr; Epoch2 leaves it as 0. 
+ let xattr_blkaddr = match version.epoch() { + FormatEpoch::Epoch1 => output.get_xattr_blkaddr(), + FormatEpoch::Epoch2 => 0, + }; + output.write_superblock(format::Superblock { magic: format::MAGIC_V1, blkszbits: format::BLOCK_BITS, feature_compat: (format::FEATURE_COMPAT_MTIME | format::FEATURE_COMPAT_XATTR_FILTER).into(), root_nid: (output.get_nid(0) as u16).into(), inos: (inodes.len() as u64).into(), - blocks: ((output.get(Offset::End, 0) / usize::from(format::BLOCK_SIZE)) as u32).into(), + blocks: output.get_block_count().into(), + build_time: build_time.into(), + build_time_nsec: build_time_nsec.into(), + xattr_blkaddr: xattr_blkaddr.into(), ..Default::default() }); // Write inode table for (idx, inode) in inodes.iter().enumerate() { // The inode may add padding to itself, so it notes its own offset - inode.write_inode(output, idx); + inode.write_inode(output, idx, ctx); } + // Mark end of inode table (slot-aligned) + output.pad(INODE_SLOT_SIZE); + output.note_inodes_end(); + // Write shared xattr table for xattr in xattrs { - output.note_offset(Offset::XAttr); + output.note_xattr(); xattr.write(output); } // Write blocks from inodes that have them output.pad(4096); for inode in inodes.iter() { - output.note_offset(Offset::Block); + output.note_block(); inode.write_blocks(output); } // That's it - output.note_offset(Offset::End); + output.note_end(); } +/// Offsets recorded during the first pass and consumed by the second pass. +/// Only contains values that are actually retrieved; singletons that are +/// write-only (header, superblock) are tracked as bools in `FirstPass`. #[derive(Default)] struct Layout { - offset_types: Vec, - offsets: Vec, + /// Byte offset of each inode, indexed by InodeIdx. + inodes: Vec, + /// Byte offset immediately after the last inode (slot-aligned). + inodes_end: Option, + /// Byte offset of each shared xattr entry, indexed sequentially. + xattrs: Vec, + /// Byte offset of each inode's block data region, indexed by InodeIdx. 
+ blocks: Vec, + /// Total byte length of the image. + end: Option, } #[derive(Default)] struct FirstPass { offset: usize, layout: Layout, + header_emitted: bool, + superblock_emitted: bool, } struct SecondPass { @@ -660,87 +1662,335 @@ struct SecondPass { layout: Layout, } -impl Output for SecondPass { - fn note_offset(&mut self, _offset_type: Offset) { - /* no-op */ +impl Output for FirstPass { + fn note_header_emitted(&mut self) { + assert!(!self.header_emitted, "composefs header written twice"); + self.header_emitted = true; + } + fn note_superblock_emitted(&mut self) { + assert!(!self.superblock_emitted, "superblock written twice"); + self.superblock_emitted = true; + } + fn note_inode(&mut self) { + self.layout + .inodes + .push(NonZeroUsize::new(self.offset).expect("inode recorded at offset 0")); + } + fn note_inodes_end(&mut self) { + assert!( + self.layout.inodes_end.is_none(), + "inodes_end recorded twice" + ); + self.layout.inodes_end = NonZeroUsize::new(self.offset); + } + fn note_xattr(&mut self) { + self.layout + .xattrs + .push(NonZeroUsize::new(self.offset).expect("xattr recorded at offset 0")); + } + fn note_block(&mut self) { + debug_assert_eq!( + self.offset % format::BLOCK_SIZE as usize, + 0, + "block data must start at a block-aligned offset" + ); + self.layout + .blocks + .push(NonZeroUsize::new(self.offset).expect("block recorded at offset 0")); + } + fn note_end(&mut self) { + assert!(self.layout.end.is_none(), "end recorded twice"); + self.layout.end = NonZeroUsize::new(self.offset); } - fn get(&self, offset_type: Offset, idx: usize) -> usize { - let start = self.layout.offset_types[offset_type as usize]; - self.layout.offsets[start + idx] + fn get_inode_offset(&self, _idx: usize) -> Option { + None + } + fn get_inodes_end(&self) -> Option { + None + } + fn get_xattr_offset(&self, _idx: usize) -> Option { + None + } + fn get_block_offset(&self, _idx: usize) -> Option { + None + } + fn get_end(&self) -> Option { + None } fn write(&mut self, 
data: &[u8]) { - self.output.extend_from_slice(data); + self.offset += data.len(); } - fn pad(&mut self, alignment: usize) { - self.output - .resize(round_up(self.output.len(), alignment), 0); + self.offset = round_up(self.offset, alignment); } - fn len(&self) -> usize { - self.output.len() + self.offset } } -impl Output for FirstPass { - fn note_offset(&mut self, offset_type: Offset) { - while self.layout.offset_types.len() <= offset_type as usize { - self.layout.offset_types.push(self.layout.offsets.len()); - } - assert_eq!(self.layout.offset_types.len(), offset_type as usize + 1); - - trace!( - "{:?} #{} @{}", - offset_type, - self.layout.offsets.len() - self.layout.offset_types[offset_type as usize], - self.offset +impl Output for SecondPass { + fn note_header_emitted(&mut self) {} + fn note_superblock_emitted(&mut self) {} + fn note_inode(&mut self) {} + fn note_inodes_end(&mut self) { + debug_assert_eq!( + self.output.len(), + self.layout + .inodes_end + .expect("inodes_end not recorded") + .get(), + "second pass diverged from first at inodes_end" + ); + } + fn note_xattr(&mut self) {} + fn note_block(&mut self) {} + fn note_end(&mut self) { + debug_assert_eq!( + self.output.len(), + self.layout.end.expect("end not recorded").get(), + "second pass diverged from first at end" ); - self.layout.offsets.push(self.offset); } - fn get(&self, _: Offset, _: usize) -> usize { - 0 // We don't know offsets in the first pass, so fake it + fn get_inode_offset(&self, idx: usize) -> Option { + Some(self.layout.inodes[idx]) + } + fn get_inodes_end(&self) -> Option { + Some(self.layout.inodes_end.expect("inodes_end not recorded")) + } + fn get_xattr_offset(&self, idx: usize) -> Option { + Some(self.layout.xattrs[idx]) + } + fn get_block_offset(&self, idx: usize) -> Option { + Some(self.layout.blocks[idx]) + } + fn get_end(&self) -> Option { + Some(self.layout.end.expect("end not recorded")) } fn write(&mut self, data: &[u8]) { - self.offset += data.len(); + 
self.output.extend_from_slice(data); } - fn pad(&mut self, alignment: usize) { - self.offset = round_up(self.offset, alignment); + self.output + .resize(round_up(self.output.len(), alignment), 0); } - fn len(&self) -> usize { - self.offset + self.output.len() } } -/// Creates an EROFS filesystem image from a composefs tree +/// Calculates the minimum mtime across all inodes in the collection. +/// +/// This is used for V1 compatibility where build_time is set to the +/// minimum mtime for reproducibility. Returns `(0, 0)` for an empty slice. +fn calculate_min_mtime(inodes: &[Inode]) -> (u64, u32) { + inodes + .iter() + .map(|inode| (inode.stat.st_mtim_sec as u64, inode.stat.st_mtim_nsec)) + .reduce(|(a_sec, a_nsec), (b_sec, b_nsec)| { + if (b_sec, b_nsec) < (a_sec, a_nsec) { + (b_sec, b_nsec) + } else { + (a_sec, a_nsec) + } + }) + .unwrap_or((0, 0)) +} + +/// Return type of [`prepare_erofs_inodes`]: +/// `(inodes, shared_xattrs, min_mtime, header_flags, composefs_version)`. +type PreparedInodes<'a, ObjectID> = (Vec>, Vec, (u64, u32), u32, u32); + +/// Shared setup for all `mkfs_erofs_*` entry points. +/// +/// Collects inodes from the filesystem, injects the Epoch1 opaque xattr on the +/// root directory, computes `header_flags` and `composefs_version`, promotes +/// repeated xattrs to the shared table, and calculates `min_mtime`. +/// +/// The `composefs_version` in the header is determined solely by [`FormatVersion`]: +/// - [`FormatVersion::V0`]: `0` normally, auto-bumped to `1` when user whiteouts present +/// - [`FormatVersion::V1`]: always `1` +/// - [`FormatVersion::V2`]: always `2` +/// +/// Returns `(inodes, shared_xattrs, min_mtime, header_flags, composefs_version)`. 
+fn prepare_erofs_inodes<'a, ObjectID: FsVerityHashValue>( + fs: &'a tree::FileSystem, + version: format::FormatVersion, +) -> PreparedInodes<'a, ObjectID> { + let mut inodes = InodeCollector::collect(fs, version); + + // For Epoch1 formats, add trusted.overlay.opaque xattr to root directory. + // This is done after collection (and thus after xattr escaping) to match + // the C implementation behavior. + if version.epoch() == FormatEpoch::Epoch1 && !inodes.is_empty() { + inodes[0] + .xattrs + .add(format::XATTR_OVERLAY_OPAQUE_ROOT, b"y", version); + } + + // For Epoch1, compute header flags and composefs_version matching C mkcomposefs behavior. + // This must be checked before share_xattrs(), while all xattrs are still local. + let (header_flags, composefs_version) = match version.epoch() { + FormatEpoch::Epoch1 => { + // COMPOSEFS_FLAGS_HAS_ACL (bit 0) is set when any inode has POSIX ACL xattrs. + let has_acl = inodes.iter().any(|inode| { + inode.xattrs.local.iter().any(|xattr| { + xattr.prefix == format::XATTR_INDEX_POSIX_ACL_ACCESS + || xattr.prefix == format::XATTR_INDEX_POSIX_ACL_DEFAULT + }) + }); + let flags = if has_acl { + format::COMPOSEFS_FLAGS_HAS_ACL.get() + } else { + 0 + }; + + // V0: auto-bump composefs_version to 1 when user whiteouts present. + // V1: always write composefs_version=1 (the version enum encodes this directly). + let cfs_ver = match version { + format::FormatVersion::V0 => { + let has_user_whiteout = inodes.iter().any(|inode| inode.escaped_whiteout); + if has_user_whiteout { 1u32 } else { 0u32 } + } + _ => version.composefs_version().get(), + }; + + (flags, cfs_ver) + } + FormatEpoch::Epoch2 => (0u32, format::COMPOSEFS_VERSION.get()), + }; + + let xattrs = share_xattrs(&mut inodes, version); + let min_mtime = calculate_min_mtime(&inodes); + + (inodes, xattrs, min_mtime, header_flags, composefs_version) +} + +/// Creates an EROFS filesystem image from a composefs tree using the default format (V2). 
/// /// This function performs a two-pass generation: /// 1. First pass determines the layout and sizes of all structures /// 2. Second pass writes the actual image data /// /// Returns the complete EROFS image as a byte array. -pub fn mkfs_erofs(fs: &tree::FileSystem) -> Box<[u8]> { - // Create the intermediate representation: flattened inodes and shared xattrs - let nlink_map = fs.nlinks(); - let mut inodes = InodeCollector::collect(fs, &nlink_map); - let xattrs = share_xattrs(&mut inodes); +pub fn mkfs_erofs( + fs: &mut ValidatedFileSystem, +) -> Box<[u8]> { + mkfs_erofs_inner( + &mut fs.0, + format::FormatVersion::default(), + #[cfg(test)] + None, + ) +} + +/// Internal two-pass EROFS image generator shared by all public entry points. +/// +/// Runs a layout pass (first pass) followed by an emit pass (second pass). +/// When `faults` is `Some`, decisions are recorded during the first pass and +/// replayed during the second so both passes make identical choices. +pub(crate) fn mkfs_erofs_inner( + fs: &mut tree::FileSystem, + version: format::FormatVersion, + #[cfg(test)] faults: Option, +) -> Box<[u8]> { + if version.epoch() == FormatEpoch::Epoch1 { + fs.add_overlay_whiteouts(); + } + let (inodes, xattrs, min_mtime, header_flags, composefs_version) = + prepare_erofs_inodes(fs, version); + + let mut ctx = WriteContext { + version, + min_mtime, + header_flags, + composefs_version, + #[cfg(test)] + faults, + }; - // Do a first pass with the writer to determine the layout + // First pass: determine the layout. let mut first_pass = FirstPass::default(); - write_erofs(&mut first_pass, &inodes, &xattrs); + write_erofs(&mut first_pass, &inodes, &xattrs, &mut ctx); + + // Switch fault injector to replay mode so the second pass makes identical choices. + #[cfg(test)] + if let Some(ref mut f) = ctx.faults { + f.start_replay(); + } - // Do a second pass with the writer to get the actual bytes + // Second pass: emit the actual bytes. 
let mut second_pass = SecondPass { output: vec![], layout: first_pass.layout, }; - write_erofs(&mut second_pass, &inodes, &xattrs); + write_erofs(&mut second_pass, &inodes, &xattrs, &mut ctx); - // That's it second_pass.output.into_boxed_slice() } + +/// Creates an EROFS filesystem image from a composefs tree with an explicit format version. +/// +/// The `version` parameter controls the format version: +/// - [`FormatVersion::V0`]: C `mkcomposefs` compatible (compact inodes, BFS, whiteout table). +/// `composefs_version` is `0` normally, auto-bumped to `1` when user whiteouts are present. +/// - [`FormatVersion::V1`]: Same layout as V0 but `composefs_version` is always `1`. +/// Equivalent to C `mkcomposefs --min-version=1`. +/// - [`FormatVersion::V2`]: Rust-native format (extended inodes, DFS, `composefs_version=2`). +/// +/// Whiteout stubs for Epoch1 formats (V0/V1) are added automatically. +/// +/// Returns the complete EROFS image as a byte array. +pub fn mkfs_erofs_versioned( + fs: &mut ValidatedFileSystem, + version: format::FormatVersion, +) -> Box<[u8]> { + mkfs_erofs_inner( + &mut fs.0, + version, + #[cfg(test)] + None, + ) +} + +/// Test-only: write a versioned EROFS image with fault injection. +/// +/// `faults` controls which writer invariants are intentionally violated. +/// Pass `WriterFaults::new(seed)` with the desired rates set. +#[cfg(test)] +pub(crate) fn mkfs_erofs_with_faults( + fs: &mut ValidatedFileSystem, + version: format::FormatVersion, + faults: WriterFaults, +) -> Box<[u8]> { + mkfs_erofs_inner(&mut fs.0, version, Some(faults)) +} + +#[cfg(test)] +mod tests { + use super::compute_chunk_format; + + /// Unit tests for `compute_chunk_format` with boundary values. 
+ /// + /// The function converts a file size into the EROFS chunk-format field: + /// chunkbits = ilog2(size - 1) + 1, clamped to [BLOCK_BITS=12, 43] + /// result = chunkbits - BLOCK_BITS + #[test] + fn test_compute_chunk_format_boundary_values() { + // size=1: file_size <= 1 branch → chunkbits=1 → clamped to 12 → result 0 + assert_eq!(compute_chunk_format(1), 0, "size=1"); + // size=2: ilog2(1)+1=1 → clamped to 12 → result 0 + assert_eq!(compute_chunk_format(2), 0, "size=2"); + // size=4096: ilog2(4095)+1=12 → no clamp → result 0 + assert_eq!(compute_chunk_format(4096), 0, "size=4096"); + // size=4097: ilog2(4096)+1=13 → result 1 + assert_eq!(compute_chunk_format(4097), 1, "size=4097"); + // size=1<<20: ilog2((1<<20)-1)+1=20 → result 8 + assert_eq!(compute_chunk_format(1 << 20), 8, "size=1<<20"); + // size=(1<<20)+1: ilog2(1<<20)+1=21 → result 9 + assert_eq!(compute_chunk_format((1 << 20) + 1), 9, "size=(1<<20)+1"); + } +} diff --git a/crates/composefs/src/filesystem_ops.rs b/crates/composefs/src/filesystem_ops.rs index 31012230..9e9da275 100644 --- a/crates/composefs/src/filesystem_ops.rs +++ b/crates/composefs/src/filesystem_ops.rs @@ -4,43 +4,100 @@ //! FileSystem objects, including computing image IDs, committing to //! repositories, and generating dumpfiles. +use std::collections::HashMap; + use anyhow::Result; use fn_error_context::context; use crate::{ dumpfile::write_dumpfile, - erofs::writer::mkfs_erofs, + erofs::{ + format::FormatVersion, + writer::{mkfs_erofs_inner, validate_filesystem}, + }, fsverity::{FsVerityHashValue, compute_verity}, repository::Repository, tree::FileSystem, }; impl FileSystem { + /// Commits this filesystem as EROFS images for each format version in the repository config. + /// + /// Returns a map from [`FormatVersion`] to the fsverity digest of the + /// stored image for that version. + /// + /// The `image_name` named ref (if provided) is assigned to the **default** + /// version from `repository.format_config()`. 
All `extra` versions are + /// stored anonymously (no named ref). + /// + /// Note: Callers should ensure root metadata is set before calling this, + /// typically via `copy_root_metadata_from_usr()` or `set_root_stat()`. + #[context("Committing filesystem as EROFS images")] + pub fn commit_images( + &mut self, + repository: &Repository, + image_name: Option<&str>, + ) -> Result> { + // Validate once before writing any version. + // add_overlay_whiteouts() for Epoch1 formats (V0/V1) is called inside mkfs_erofs_inner. + validate_filesystem(self)?; + let formats = repository.format_config(); + let mut result = HashMap::new(); + let mut first = true; + for version in formats.versions() { + // Only the default (first) version claims the named ref. + let name = if first { image_name } else { None }; + first = false; + let image_data = mkfs_erofs_inner( + self, + version, + #[cfg(test)] + None, + ); + let id = repository.write_image(name, &image_data)?; + result.insert(version, id); + } + Ok(result) + } + /// Commits this filesystem as an EROFS image to the repository. /// - /// Generates an EROFS filesystem image and writes it to the repository - /// with the optional name. Returns the fsverity digest of the committed image. + /// Generates an EROFS filesystem image using the repository's configured + /// EROFS format version and writes it with the optional name. Returns the + /// fsverity digest of the committed image for the default format version. /// /// Note: Callers should ensure root metadata is set before calling this, /// typically via `copy_root_metadata_from_usr()` or `set_root_stat()`.
#[context("Committing filesystem as EROFS image")] pub fn commit_image( - &self, + &mut self, repository: &Repository, image_name: Option<&str>, ) -> Result { - repository.write_image(image_name, &mkfs_erofs(self)) + let version = repository.format_config().default; + let mut map = self.commit_images(repository, image_name)?; + Ok(map.remove(&version).expect("format version must be in map")) } /// Computes the fsverity digest for this filesystem as an EROFS image. /// - /// Generates the EROFS image and returns its fsverity digest without - /// writing to a repository. + /// The digest depends on the EROFS format version: V1 and V2 produce + /// different on-disk layouts and therefore different digests. Callers + /// must supply the version explicitly so that the digest matches what is + /// actually stored (or will be stored) in the repository. /// /// Note: Callers should ensure root metadata is set before calling this, /// typically via `copy_root_metadata_from_usr()` or `set_root_stat()`. - pub fn compute_image_id(&self) -> ObjectID { - compute_verity(&mkfs_erofs(self)) + pub fn compute_image_id(&mut self, version: FormatVersion) -> ObjectID { + // Callers are responsible for ensuring the tree is valid before calling this. + // In practice this is always called on freshly-built trees that don't have + // invalid constructs like hardlinked whiteouts. + compute_verity(&mkfs_erofs_inner( + self, + version, + #[cfg(test)] + None, + )) } /// Prints this filesystem in dumpfile format to stdout. 
diff --git a/crates/composefs/src/fs.rs b/crates/composefs/src/fs.rs index 0c33be72..a6fb7b96 100644 --- a/crates/composefs/src/fs.rs +++ b/crates/composefs/src/fs.rs @@ -408,6 +408,7 @@ fn stat_fd(fd: &OwnedFd, ifmt: FileType) -> Result<(rustix::fs::Stat, generic_tr st_uid: buf.st_uid, st_gid: buf.st_gid, st_mtim_sec: buf.st_mtime as i64, + st_mtim_nsec: buf.st_mtime_nsec as u32, xattrs: read_xattrs(fd)?, }, )) @@ -458,14 +459,16 @@ struct FilesystemScanner { inodes: HashMap, leaves: Vec>, handler: ChannelHandler, + hardlinks: HardlinkBehavior, } impl FilesystemScanner { - fn new(handler: ChannelHandler) -> Self { + fn new(handler: ChannelHandler, hardlinks: HardlinkBehavior) -> Self { Self { inodes: HashMap::new(), leaves: Vec::new(), handler, + hardlinks, } } @@ -560,20 +563,27 @@ impl FilesystemScanner { let (buf, stat) = stat_fd(&fd, ifmt)?; - // NB: We could check `st_nlink > 1` to find out if we should track a file as a potential - // hardlink or not, but some filesystems (like fuse-overlayfs) can report this incorrectly. - // Track all files. https://github.com/containers/fuse-overlayfs/issues/435 - let key = FileDevIno { - dev: buf.st_dev, - ino: buf.st_ino, - }; - if let Some(&id) = self.inodes.get(&key) { - Ok(id) - } else { + if self.hardlinks == HardlinkBehavior::Tracked { + // NB: We could check `st_nlink > 1` to find out if we should track a file as a + // potential hardlink or not, but some filesystems (like fuse-overlayfs) can report + // this incorrectly. Track all files. + // https://github.com/containers/fuse-overlayfs/issues/435 + let key = FileDevIno { + dev: buf.st_dev, + ino: buf.st_ino, + }; + if let Some(&id) = self.inodes.get(&key) { + return Ok(id); + } let content = self.scan_leaf_content(fd, &buf)?; let id = self.push_leaf(stat, content); self.inodes.insert(key, id); Ok(id) + } else { + // No deduplication: every path gets its own inode (i_nlink == 1), + // matching the behaviour of the C mkcomposefs tool. 
+ let content = self.scan_leaf_content(fd, &buf)?; + Ok(self.push_leaf(stat, content)) } } @@ -714,6 +724,26 @@ pub fn read_file( // Async filesystem reading // --------------------------------------------------------------------------- +/// Controls whether host hard links are deduplicated during a filesystem scan. +/// +/// When scanning a live container rootfs the source hard links carry semantic +/// meaning (they represent the same file object) and must be preserved so that +/// the composefs image matches the one produced from the OCI tar layer stream. +/// +/// When running as a drop-in replacement for the C `mkcomposefs` tool the +/// default is [`HardlinkBehavior::Break`] for byte-for-byte compatibility; pass +/// [`HardlinkBehavior::Tracked`] to opt in to deduplication. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum HardlinkBehavior { + /// Deduplicate directory entries that share an inode into a single EROFS + /// leaf (i_nlink > 1). This is the correct behaviour when scanning a real + /// container filesystem. + Tracked, + /// Treat every directory entry as an independent inode (i_nlink == 1), + /// matching the default output of the C `mkcomposefs` tool. + Break, +} + /// Load a filesystem tree from the given path, parallelizing verity /// computation and object storage across available cores. /// @@ -721,7 +751,7 @@ pub fn read_file( /// scan discovers large files, they are immediately dispatched for /// verity hashing on the async runtime while the scan continues. /// -/// Hardlinks are deduplicated — each unique inode is processed only once. +/// Hard links are deduplicated — each unique inode is processed only once. /// /// If `repo` is `Some`, file objects are stored in the repository. /// If `None`, fsverity digests are computed without writing to disk. 
@@ -736,7 +766,7 @@ pub async fn read_filesystem( ) -> Result> { let store: Option>> = repo.map(|r| r as Arc>); - read_filesystem_impl(dirfd, path, store, None).await + read_filesystem_impl(dirfd, path, store, None, HardlinkBehavior::Tracked).await } /// Options for [`read_filesystem_with_opts`]. @@ -747,6 +777,8 @@ pub struct ReadFilesystemOpts { /// Override the default concurrency limit. When `None`, the semaphore is /// derived from the store (if any) or from [`available_parallelism`]. pub semaphore: Option>, + /// Controls how hard links are handled during the scan. + pub hardlinks: HardlinkBehavior, } impl std::fmt::Debug for ReadFilesystemOpts { @@ -754,6 +786,7 @@ impl std::fmt::Debug for ReadFilesystemOpts")) .field("semaphore", &self.semaphore) + .field("hardlinks", &self.hardlinks) .finish() } } @@ -763,11 +796,13 @@ impl Default for ReadFilesystemOpts { Self { store: None, semaphore: None, + hardlinks: HardlinkBehavior::Tracked, } } } -/// Like [`read_filesystem`] but with full control over store and concurrency. +/// Like [`read_filesystem`] but with full control over store, concurrency, and +/// hard-link behaviour. /// /// This is the preferred entry point when you need to supply a custom /// [`ObjectStore`] (e.g. [`FlatDigestStore`] for C-compatible `--digest-store` @@ -778,7 +813,7 @@ pub async fn read_filesystem_with_opts( path: PathBuf, opts: ReadFilesystemOpts, ) -> Result> { - read_filesystem_impl(dirfd, path, opts.store, opts.semaphore).await + read_filesystem_impl(dirfd, path, opts.store, opts.semaphore, opts.hardlinks).await } async fn read_filesystem_impl( @@ -786,6 +821,7 @@ async fn read_filesystem_impl( path: PathBuf, store: Option>>, semaphore_override: Option>, + hardlinks: HardlinkBehavior, ) -> Result> { let semaphore = semaphore_override.unwrap_or_else(|| { store @@ -819,7 +855,7 @@ async fn read_filesystem_impl( // items over the channel as files are discovered. 
tasks.spawn_blocking(move || { let handler = ChannelHandler { tx }; - let mut scanner = FilesystemScanner::new(handler); + let mut scanner = FilesystemScanner::new(handler, hardlinks); let fs = scanner .scan(&dirfd, path.as_os_str()) .with_context(|| format!("Async reading filesystem from {}", path.display()))?; @@ -940,6 +976,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: Default::default(), + st_mtim_nsec: Default::default(), xattrs: Default::default(), }; set_file_contents(&td, OsStr::new("testfile"), &st, b"new contents").unwrap(); diff --git a/crates/composefs/src/fsverity/mod.rs b/crates/composefs/src/fsverity/mod.rs index 447b1d17..031e5858 100644 --- a/crates/composefs/src/fsverity/mod.rs +++ b/crates/composefs/src/fsverity/mod.rs @@ -12,7 +12,7 @@ pub use digest::FsVerityHasher; use std::{ fs::File, - io::Seek, + io::{BufRead, BufReader, Seek}, os::{ fd::{AsFd, BorrowedFd, OwnedFd}, unix::fs::PermissionsExt, @@ -232,6 +232,35 @@ pub fn measure_verity_opt( } } +/// Try `FS_IOC_MEASURE_VERITY`; fall back to in-process Merkle-tree computation +/// when the kernel reports that verity is absent or the filesystem doesn't +/// support it. This mirrors `lcfs_fd_get_fsverity()` in the C composefs library. +/// +/// Other ioctl errors are returned as-is. +pub fn measure_verity_with_fallback( + fd: impl AsFd + std::io::Read, +) -> Result { + match measure_verity_opt(&fd) { + Ok(Some(digest)) => return Ok(digest), + Ok(None) => {} + Err(e) => return Err(e), + } + // Software fallback: stream the file through the Merkle-tree hasher. 
+ let mut hasher = FsVerityHasher::::new(); + let mut reader = BufReader::with_capacity(FsVerityHasher::::BLOCK_SIZE * 2, fd); + loop { + let buf = reader.fill_buf().map_err(MeasureVerityError::Io)?; + if buf.is_empty() { + break; + } + let chunk = &buf[..buf.len().min(FsVerityHasher::::BLOCK_SIZE)]; + hasher.add_block(chunk); + let n = chunk.len(); + reader.consume(n); + } + Ok(hasher.digest()) +} + /// Check whether a file has fs-verity enabled, dispatching to the correct /// hash type based on the [`Algorithm`]. /// diff --git a/crates/composefs/src/generic_tree.rs b/crates/composefs/src/generic_tree.rs index 77eb5b4e..e5c36d31 100644 --- a/crates/composefs/src/generic_tree.rs +++ b/crates/composefs/src/generic_tree.rs @@ -21,6 +21,8 @@ pub struct Stat { pub st_gid: u32, /// Modification time in seconds since Unix epoch. pub st_mtim_sec: i64, + /// Modification time nanosecond component (0..999_999_999). + pub st_mtim_nsec: u32, /// Extended attributes as key-value pairs. pub xattrs: BTreeMap, Box<[u8]>>, } @@ -46,6 +48,7 @@ impl Stat { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } @@ -100,7 +103,7 @@ pub struct Leaf { } /// A directory node containing named entries. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Directory { /// Metadata for this directory. pub stat: Stat, @@ -109,7 +112,7 @@ pub struct Directory { } /// A filesystem inode representing either a directory or a leaf node. -#[derive(Debug)] +#[derive(Debug, Clone)] pub enum Inode { /// A directory inode. Directory(Box>), @@ -508,17 +511,30 @@ impl Directory { } } + /// Retains only top-level entries whose names satisfy the predicate. + /// This is used for filtering dump output to specific entries. 
+ pub fn retain_top_level(&mut self, mut f: impl FnMut(&str) -> bool) { + self.entries.retain(|name, _| { + // Convert OsStr to str for comparison; non-UTF8 names never match + name.to_str().is_some_and(&mut f) + }); + } + /// Recursively finds the newest modification time in this directory tree. /// /// Returns the maximum modification time among this directory's metadata - /// and all files and subdirectories it contains. + /// and all files and subdirectories it contains, as a `(sec, nsec)` tuple + /// for full nanosecond precision. /// /// The `leaves` table is needed to resolve leaf mtimes. - pub fn newest_file(&self, leaves: &[Leaf]) -> i64 { - let mut newest = self.stat.st_mtim_sec; + pub fn newest_file(&self, leaves: &[Leaf]) -> (i64, u32) { + let mut newest = (self.stat.st_mtim_sec, self.stat.st_mtim_nsec); for inode in self.entries.values() { let mtime = match inode { - Inode::Leaf(id, _) => leaves[id.0].stat.st_mtim_sec, + Inode::Leaf(id, _) => { + let s = &leaves[id.0].stat; + (s.st_mtim_sec, s.st_mtim_nsec) + } Inode::Directory(dir) => dir.newest_file(leaves), }; if mtime > newest { @@ -595,6 +611,60 @@ pub struct FileSystem { } impl FileSystem { + /// Add 256 overlay whiteout stub entries to the root directory. + /// + /// This is required for Format 1.0 compatibility with the C mkcomposefs. + /// Each whiteout is a character device named "00" through "ff" with rdev=0. + /// They inherit uid/gid/mtime from root and carry only root's `security.selinux` xattr (if any). + /// + /// These entries allow overlay filesystems to efficiently represent + /// deleted files using device stubs that match the naming convention. + /// + /// Adds the 256 two-character hex-named whiteout stub entries (`00`..`ff`) to + /// the root directory, skipping any that already exist.
+ /// + /// Matches C mkcomposefs v1.0.8 `add_overlay_whiteouts()`: each stub inherits + /// `uid`, `gid`, and `mtime` from root, gets mode `S_IFCHR|0644` with `rdev=0`, + /// and **only** the `security.selinux` xattr from root (if present). No other + /// xattrs are propagated — copying all root xattrs would make them appear on 257 + /// inodes instead of 1, causing the xattr-sharing pass to turn them into shared + /// references and bloating the inode body in a way C does not. + pub(crate) fn add_overlay_whiteouts(&mut self) { + use std::ffi::OsString; + + // C mkcomposefs only inherits security.selinux from root for the stubs. + // Copying all root xattrs would change shared-vs-local xattr storage and + // produce a binary-incompatible image. + let selinux_key = std::ffi::OsStr::new("security.selinux"); + let mut whiteout_xattrs = std::collections::BTreeMap::new(); + if let Some(val) = self.root.stat.xattrs.get(selinux_key) { + whiteout_xattrs.insert(Box::from(selinux_key), val.clone()); + } + + let whiteout_stat = Stat { + st_mode: 0o644, + st_uid: self.root.stat.st_uid, + st_gid: self.root.stat.st_gid, + st_mtim_sec: self.root.stat.st_mtim_sec, + st_mtim_nsec: self.root.stat.st_mtim_nsec, + xattrs: whiteout_xattrs, + }; + + for i in 0..=255u8 { + let name = OsString::from(format!("{:02x}", i)); + + // Skip if entry already exists + if self.root.entries.contains_key(name.as_os_str()) { + continue; + } + + let leaf_id = self.push_leaf(whiteout_stat.clone(), LeafContent::CharacterDevice(0)); + self.root + .entries + .insert(name.into_boxed_os_str(), Inode::leaf(leaf_id)); + } + } + /// Creates a new filesystem with a root directory having the given metadata. 
pub fn new(root_stat: Stat) -> Self { Self { @@ -643,6 +713,7 @@ impl FileSystem { let st_uid = usr.stat.st_uid; let st_gid = usr.stat.st_gid; let st_mtim_sec = usr.stat.st_mtim_sec; + let st_mtim_nsec = usr.stat.st_mtim_nsec; let xattrs = usr.stat.xattrs.clone(); // Apply copied metadata to root @@ -650,6 +721,7 @@ impl FileSystem { self.root.stat.st_uid = st_uid; self.root.stat.st_gid = st_gid; self.root.stat.st_mtim_sec = st_mtim_sec; + self.root.stat.st_mtim_nsec = st_mtim_nsec; self.root.stat.xattrs = xattrs; Ok(()) @@ -734,9 +806,10 @@ impl FileSystem { /// Returns an error if `/usr` does not exist (needed to get the mtime). pub fn canonicalize_run(&mut self) -> Result<(), ImageError> { if self.root.get_directory_opt(OsStr::new("run"))?.is_some() { - let usr_mtime = self.root.get_directory(OsStr::new("usr"))?.stat.st_mtim_sec; + let usr = self.root.get_directory(OsStr::new("usr"))?.stat.clone(); let run_dir = self.root.get_directory_mut(OsStr::new("run"))?; - run_dir.stat.st_mtim_sec = usr_mtime; + run_dir.stat.st_mtim_sec = usr.st_mtim_sec; + run_dir.stat.st_mtim_nsec = usr.st_mtim_nsec; run_dir.clear(); } Ok(()) @@ -991,7 +1064,9 @@ impl<'a, T> DirectoryRef<'a, T> { } /// Recursively finds the newest modification time in this directory tree. - pub fn newest_file(&self) -> i64 { + /// + /// Returns a `(sec, nsec)` tuple for full nanosecond precision. 
+ pub fn newest_file(&self) -> (i64, u32) { self.dir.newest_file(self.leaves) } } @@ -1013,6 +1088,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } @@ -1024,6 +1100,7 @@ mod tests { st_uid: 1000, st_gid: 1000, st_mtim_sec: mtime, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } @@ -1253,27 +1330,27 @@ mod tests { let mut leaves = Vec::new(); let mut root = Directory::new(stat_with_mtime(5)); - assert_eq!(root.newest_file(&leaves), 5); + assert_eq!(root.newest_file(&leaves), (5, 0)); let leaf_id_10 = push_leaf_file(&mut leaves, 10); root.insert(OsStr::new("file1"), Inode::leaf(leaf_id_10)); - assert_eq!(root.newest_file(&leaves), 10); + assert_eq!(root.newest_file(&leaves), (10, 0)); let subdir_stat = stat_with_mtime(15); let mut subdir = Box::new(Directory::new(subdir_stat)); let leaf_id_12 = push_leaf_file(&mut leaves, 12); subdir.insert(OsStr::new("subfile1"), Inode::leaf(leaf_id_12)); root.insert(OsStr::new("subdir"), Inode::Directory(subdir)); - assert_eq!(root.newest_file(&leaves), 15); + assert_eq!(root.newest_file(&leaves), (15, 0)); if let Some(Inode::Directory(sd)) = root.entries.get_mut(OsStr::new("subdir")) { let leaf_id_20 = push_leaf_file(&mut leaves, 20); sd.insert(OsStr::new("subfile2"), Inode::leaf(leaf_id_20)); } - assert_eq!(root.newest_file(&leaves), 20); + assert_eq!(root.newest_file(&leaves), (20, 0)); root.stat.st_mtim_sec = 25; - assert_eq!(root.newest_file(&leaves), 25); + assert_eq!(root.newest_file(&leaves), (25, 0)); } #[test] @@ -1325,6 +1402,7 @@ mod tests { st_uid: 42, st_gid: 43, st_mtim_sec: 1234567890, + st_mtim_nsec: 0, xattrs: BTreeMap::from([( Box::from(OsStr::new("security.selinux")), Box::from(b"system_u:object_r:usr_t:s0".as_slice()), @@ -1370,6 +1448,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::from([ ( Box::from(OsStr::new("security.selinux")), @@ -1622,6 +1701,7 @@ mod tests { st_uid: 100, st_gid: 200, st_mtim_sec: 54321, + 
st_mtim_nsec: 0, xattrs: BTreeMap::from([( Box::from(OsStr::new("user.test")), Box::from(b"val".as_slice()), @@ -1794,4 +1874,81 @@ mod tests { assert_eq!(fs.root.stat.st_mtim_sec, 200); assert_eq!(fs.leaves[0].stat.st_mtim_sec, 400); } + + #[test] + fn test_add_overlay_whiteouts() { + let root_stat = Stat { + st_mode: 0o755, + st_uid: 1000, + st_gid: 2000, + st_mtim_sec: 12345, + st_mtim_nsec: 0, + xattrs: BTreeMap::from([( + Box::from(OsStr::new("security.selinux")), + Box::from(b"system_u:object_r:root_t:s0".as_slice()), + )]), + }; + let mut fs = FileSystem::::new(root_stat); + + // Add a pre-existing entry that should not be overwritten + let pre_id = fs.push_leaf( + stat_with_mtime(99999), + LeafContent::Regular(FileContents {}), + ); + fs.root.insert(OsStr::new("00"), Inode::leaf(pre_id)); + + fs.add_overlay_whiteouts(); + + // Should have 256 root entries (255 new whiteouts + 1 pre-existing "00") + assert_eq!(fs.root.entries.len(), 256); + + // The pre-existing "00" should still have its original mtime + if let Some(Inode::Leaf(id, _)) = fs.root.entries.get(OsStr::new("00")) { + assert_eq!(fs.leaf(*id).stat.st_mtim_sec, 99999); + } else { + panic!("Expected '00' to remain a leaf"); + } + + // Check a newly created whiteout entry + if let Some(Inode::Leaf(id, _)) = fs.root.entries.get(OsStr::new("ff")) { + let leaf = fs.leaf(*id); + // Should be a character device with rdev=0 + assert!(matches!(leaf.content, LeafContent::CharacterDevice(0))); + // Should have mode 0o644 + assert_eq!(leaf.stat.st_mode, 0o644); + // Should inherit uid/gid/mtime from root + assert_eq!(leaf.stat.st_uid, 1000); + assert_eq!(leaf.stat.st_gid, 2000); + assert_eq!(leaf.stat.st_mtim_sec, 12345); + // Should inherit the security.selinux xattr from root — the only xattr + // that add_overlay_whiteouts() propagates to the whiteout stubs.
+ assert_eq!( + leaf.stat + .xattrs + .get(OsStr::new("security.selinux")) + .map(|v| v.as_ref()), + Some(b"system_u:object_r:root_t:s0".as_slice()) + ); + } else { + panic!("Expected 'ff' to be a leaf"); + } + + // Check some middle entries exist + assert!(fs.root.entries.contains_key(OsStr::new("7f"))); + assert!(fs.root.entries.contains_key(OsStr::new("a0"))); + } + + #[test] + fn test_add_overlay_whiteouts_empty_fs() { + let mut fs = FileSystem::::new(default_stat()); + + fs.add_overlay_whiteouts(); + + // Should have exactly 256 entries + assert_eq!(fs.root.entries.len(), 256); + + // Check first and last entries + assert!(fs.root.entries.contains_key(OsStr::new("00"))); + assert!(fs.root.entries.contains_key(OsStr::new("ff"))); + } } diff --git a/crates/composefs/src/repository.rs b/crates/composefs/src/repository.rs index 8b50c241..b8a84a1b 100644 --- a/crates/composefs/src/repository.rs +++ b/crates/composefs/src/repository.rs @@ -107,6 +107,7 @@ use rustix::{ }; use crate::{ + erofs::format::{FormatConfig, FormatVersion}, fsverity::{ Algorithm, CompareVerityError, DEFAULT_LG_BLOCKSIZE, EnableVerityError, FsVerityHashValue, FsVerityHasher, MeasureVerityError, compute_verity, enable_verity_maybe_copy, @@ -200,13 +201,19 @@ pub const REPO_FORMAT_VERSION: u32 = 1; /// but prevent any writes (adding objects, creating images, GC, …). /// - Unknown **incompatible** features cause the repository to be /// rejected entirely. -/// -/// There are currently no defined features. pub mod known_features { + /// The ro-compat feature flag for V1 EROFS repositories. + /// + /// When present in `read_only_compatible`, the repository uses the V1 + /// (C-tool compatible) EROFS format. Old tools that don't recognize this + /// flag will open the repository as read-only, preventing accidental V2 + /// image writes into a V1 repo. + pub const V1_EROFS: &str = "v1_erofs"; + /// Compatible features understood by this version. 
pub const COMPAT: &[&str] = &[]; /// Read-only compatible features understood by this version. - pub const RO_COMPAT: &[&str] = &[]; + pub const RO_COMPAT: &[&str] = &[V1_EROFS]; /// Incompatible features understood by this version. pub const INCOMPAT: &[&str] = &[]; } @@ -282,14 +289,22 @@ impl FeatureFlags { /// Repository metadata stored in `meta.json` at the repository root. /// /// This file records the repository's format version, digest algorithm, -/// and feature flags so that tools can detect misconfigured invocations -/// (e.g. opening a sha256 repo with `--hash sha512`) and so the -/// algorithm doesn't need to be specified on every command. +/// feature flags, and EROFS format configuration so that tools can detect +/// misconfigured invocations (e.g. opening a sha256 repo with `--hash sha512`) +/// and so the algorithm and format don't need to be specified on every command. /// /// The versioning model is inspired by Linux filesystem superblocks /// (ext4, XFS, EROFS): a base version integer for fundamental layout /// changes, plus three tiers of feature flags for finer-grained /// evolution. +/// +/// The EROFS format configuration is stored in two ways for compatibility: +/// - The `erofs_formats` field (new) persists the full [`FormatConfig`] directly. +/// - The `"v1_erofs"` ro_compat flag (legacy) is kept for old-tool compat. +/// +/// When reading old repos without an `erofs_formats` field, the serde default +/// gives `FormatConfig::single(V2)`, which is then overridden to `single(V1)` if +/// the `"v1_erofs"` flag is present. #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct RepoMetadata { /// Base repository format version. Tools must refuse to operate @@ -302,24 +317,91 @@ pub struct RepoMetadata { /// Feature flags. #[serde(default)] pub features: FeatureFlags, + + /// EROFS format configuration for images produced by this repository. + /// + /// Persisted directly so that dual-format repositories (e.g. 
V1+V2) can + /// round-trip through `meta.json`. For repositories created before this + /// field existed, the field is absent from JSON and defaults to + /// `FormatConfig::single(FormatVersion::V2)`; the `"v1_erofs"` flag is then + /// checked for seamless migration of old V1 repos. + #[serde(default)] + pub erofs_formats: FormatConfig, } impl RepoMetadata { - /// Build metadata for a repository using the given hash type. - pub fn for_hash() -> Self { - Self { - version: REPO_FORMAT_VERSION, - algorithm: Algorithm::for_hash::(), - features: FeatureFlags::default(), + /// Derive the effective EROFS format version for this repository. + /// + /// Uses `erofs_formats.default` when the field was explicitly set (i.e. is + /// not the serde default of `single(V2)`). For old repos that predate the + /// `erofs_formats` field, falls back to deriving the version from the + /// `"v1_erofs"` ro_compat flag: + /// + /// - `"v1_erofs"` present → [`FormatVersion::V1`] + /// - absent → [`FormatVersion::V2`] + pub fn erofs_version(&self) -> FormatVersion { + self.format_config().default + } + + /// Return the effective [`FormatConfig`] for this repository. + /// + /// If `erofs_formats` was explicitly set in `meta.json` (i.e. it is not the + /// serde default `single(V2)`), it is returned as-is. Otherwise the config + /// is derived from the legacy `"v1_erofs"` feature flag for backward + /// compatibility with repos created before this field existed. + pub fn format_config(&self) -> FormatConfig { + let default_config = FormatConfig::default(); + if self.erofs_formats != default_config { + // Field was explicitly stored — use it directly. + self.erofs_formats.clone() + } else if self + .features + .read_only_compatible + .iter() + .any(|f| f == known_features::V1_EROFS) + { + // Legacy V1 repo: flag present but new field absent/default. + FormatConfig::single(FormatVersion::V1) + } else { + // V2 default (new repos or old repos without the flag). 
+ default_config } } +} - /// Build metadata from an explicit [`Algorithm`]. +impl RepoMetadata { + /// Build metadata for a repository using the given hash type, with the default EROFS version. + pub fn for_hash() -> Self { + Self::new_with_formats( + Algorithm::for_hash::(), + &FormatConfig::single(FormatVersion::default()), + ) + } + + /// Build metadata from an explicit [`Algorithm`], with the default EROFS format version. pub fn new(algorithm: Algorithm) -> Self { + Self::new_with_formats(algorithm, &FormatConfig::single(FormatVersion::default())) + } + + /// Build metadata with the correct feature flags for the given [`FormatConfig`]. + /// + /// The primary format version is encoded in two places for compatibility: + /// - `erofs_formats` field: stores the full [`FormatConfig`] directly. + /// - `"v1_erofs"` ro_compat flag: present when the primary version is V1, + /// so that older tools that don't know `erofs_formats` open the repository + /// read-only rather than writing images in the wrong format. + pub fn new_with_formats(algorithm: Algorithm, formats: &FormatConfig) -> Self { + let mut features = FeatureFlags::default(); + if formats.default == FormatVersion::V1 { + features + .read_only_compatible + .push(known_features::V1_EROFS.to_string()); + } Self { version: REPO_FORMAT_VERSION, algorithm, - features: FeatureFlags::default(), + features, + erofs_formats: formats.clone(), } } @@ -359,6 +441,81 @@ impl RepoMetadata { } } +/// Configuration for initializing a new composefs repository. +/// +/// Passed to [`Repository::init_path`] to specify the algorithm, +/// fs-verity policy, and default EROFS format version. +/// +/// fs-verity is **required by default**. Call [`set_insecure`](Self::set_insecure) +/// to opt out (e.g. on tmpfs or in tests). +/// +/// # Examples +/// +/// ```no_run +/// use composefs::repository::RepositoryConfig; +/// use composefs::fsverity::Algorithm; +/// +/// // Default: SHA-256, fs-verity required, EROFS V2. 
+/// let config = RepositoryConfig::default(); +/// +/// // SHA-512 with fs-verity required. +/// let config = RepositoryConfig::new(Algorithm::SHA512); +/// +/// // Insecure mode (tmpfs, testing). +/// let config = RepositoryConfig::default().set_insecure(); +/// +/// // Custom algorithm, insecure. +/// let config = RepositoryConfig::new(Algorithm::SHA512).set_insecure(); +/// ``` +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct RepositoryConfig { + /// The fs-verity hash algorithm for content-addressed objects. + pub algorithm: Algorithm, + /// EROFS format configuration for images produced by this repository. + /// + /// The full [`FormatConfig`] is written to `meta.json` (both the `default` + /// primary version and any `extra` versions), so dual-format repos round-trip + /// correctly. The `"v1_erofs"` ro_compat flag is also kept for old-tool compat. + /// + /// Use `FormatConfig::single(FormatVersion::V1)` for C-tool compatible output, + /// or `FormatConfig { default: FormatVersion::V1, extra: [FormatVersion::V2].into() }` + /// when both V1 and V2 images should be produced (e.g. for bootc workflows). + pub erofs_formats: FormatConfig, + /// When `true`, fs-verity is NOT enabled on `meta.json` and is not required + /// on stored objects. Use [`set_insecure`](Self::set_insecure) to set this. + insecure: bool, +} + +impl RepositoryConfig { + /// Create a config with the given algorithm and all other settings at their defaults + /// (fs-verity required, EROFS format = default version only). + pub fn new(algorithm: Algorithm) -> Self { + Self { + algorithm, + ..Self::default() + } + } + + /// Disable fs-verity for this repository. + /// + /// Suitable for use on filesystems that do not support fs-verity (tmpfs, + /// overlayfs) or in test environments. Returns `self` for chaining. 
+ pub fn set_insecure(mut self) -> Self { + self.insecure = true; + self + } +} + +impl Default for RepositoryConfig { + fn default() -> Self { + Self { + algorithm: Algorithm::SHA256, + erofs_formats: FormatConfig::single(FormatVersion::default()), + insecure: false, + } + } +} + /// Read the fs-verity algorithm from a repository's `meta.json`. /// /// This is the public API for determining which algorithm a repository @@ -448,6 +605,15 @@ pub fn system_path() -> PathBuf { PathBuf::from("/sysroot/composefs") } +/// Derive the primary [`FormatConfig`] from a [`RepoMetadata`]. +/// +/// Delegates to [`RepoMetadata::format_config`], which prefers the explicit +/// `erofs_formats` field when present and falls back to the legacy `"v1_erofs"` +/// flag for backward compatibility. +fn repo_format_config_from_meta(meta: &RepoMetadata) -> FormatConfig { + meta.format_config() +} + /// Write `meta.json` into a repository directory fd. /// /// This atomically writes (via O_TMPFILE + linkat) the metadata file. @@ -749,6 +915,9 @@ pub struct Repository { write_concurrency: Option, insecure: bool, metadata: RepoMetadata, + /// Per-invocation EROFS version override set by [`set_erofs_version`](Self::set_erofs_version). + /// Does not rewrite `meta.json`; only affects this `Repository` instance. + erofs_version_override: Option, /// When true, SplitStreamWriter::done() writes old-format (pre-repr(C)) /// headers. Used to test backward compatibility with splitstreams /// written before #[repr(C)] was added to SplitstreamHeader. @@ -1097,17 +1266,17 @@ impl Repository { /// Initialize a new repository at the target path and open it. 
/// /// Creates the directory (mode 0700) if it does not exist, writes - /// `meta.json` for the given `algorithm`, and returns the opened + /// `meta.json` using the parameters from `config`, and returns the opened /// repository together with a flag indicating whether this was a /// fresh initialization (`true`) or an idempotent open of an /// existing repository with the same algorithm (`false`). /// - /// The `algorithm` must be compatible with this repository's + /// The `config.algorithm` must be compatible with this repository's /// `ObjectID` type (e.g. `Algorithm::Sha512` for /// `Repository`). /// - /// If `enable_verity` is true, fs-verity is enabled on `meta.json`, - /// signaling that all objects must also have verity. + /// Unless `config` has been made insecure via [`RepositoryConfig::set_insecure`], + /// fs-verity is enabled on `meta.json`, signaling that all objects must also have verity. /// /// If `meta.json` already exists with a different algorithm, an /// error is returned. @@ -1115,10 +1284,15 @@ impl Repository { pub fn init_path( dirfd: impl AsFd, path: impl AsRef, - algorithm: Algorithm, - enable_verity: bool, + config: RepositoryConfig, ) -> Result<(Self, bool)> { let path = path.as_ref(); + let RepositoryConfig { + algorithm, + erofs_formats, + insecure, + } = config; + let require_fsverity = !insecure; if !algorithm.is_compatible::() { bail!( @@ -1140,11 +1314,12 @@ impl Repository { ) .with_context(|| format!("opening repository directory {}", path.display()))?; - let meta = RepoMetadata::new(algorithm); + let meta = RepoMetadata::new_with_formats(algorithm, &erofs_formats); // Try to write meta.json. If it already exists, check for - // idempotency: same algorithm is fine, different is an error. - if let Err(write_err) = write_repo_metadata(&repo_fd, &meta, enable_verity) { + // idempotency: same config is fine; certain upgrades are allowed; + // incompatible changes are errors. 
+ if let Err(write_err) = write_repo_metadata(&repo_fd, &meta, require_fsverity) { match read_repo_metadata(&repo_fd)? { Some(existing) if existing == meta => { // Idempotent: same config, already initialized. @@ -1153,10 +1328,13 @@ impl Repository { } Some(existing) => { bail!( - "repository already initialized with algorithm '{}'; \ - cannot re-initialize with '{}'", + "repository already initialized with different configuration \ + (algorithm: {}, erofs_version: {:?}); \ + cannot re-initialize with (algorithm: {}, erofs_version: {:?})", existing.algorithm, + existing.erofs_version(), meta.algorithm, + meta.erofs_version(), ); } None => { @@ -1207,6 +1385,7 @@ impl Repository { write_concurrency: None, insecure: !has_verity, metadata, + erofs_version_override: None, #[cfg(any(test, feature = "test"))] write_old_splitstream_format: std::sync::atomic::AtomicBool::new(false), _data: std::marker::PhantomData, @@ -1250,6 +1429,10 @@ impl Repository { ); } + // Use `new` (no `v1_erofs` flag) for legacy repos + // that pre-date the format-set feature. No feature flags → V2 + BOTH, which + // is correct: old repos may contain images of any version and should not be + // artificially restricted. let meta = RepoMetadata::new(algorithm); write_repo_metadata(&repo_fd, &meta, has_verity)?; @@ -1903,6 +2086,19 @@ impl Repository { self.insecure } + /// Override the EROFS format version for this repository session. + /// + /// Changes the in-memory default used by [`FileSystem::commit_image`] + /// and [`FileSystem::compute_image_id`] for the lifetime of this + /// `Repository` instance only. + /// + /// Does **not** rewrite `meta.json`. Intended for CLI tools that accept a + /// per-invocation `--erofs-version` flag to override the repository's stored default.
+ pub fn set_erofs_version(&mut self, version: FormatVersion) -> &mut Self { + self.erofs_version_override = Some(version); + self + } + /// Mark this repository as insecure, disabling verification of /// fs-verity digests. This allows operation on filesystems /// without verity support. @@ -3331,6 +3527,41 @@ impl Repository { &self.metadata } + /// Returns the effective EROFS format version for this repository. + /// + /// Returns the per-invocation override set by [`set_erofs_version`](Self::set_erofs_version) + /// if one is active, otherwise returns the default version from the stored + /// [`FormatConfig`] (see [`format_config`](Self::format_config)). + pub fn erofs_version(&self) -> FormatVersion { + self.erofs_version_override + .unwrap_or_else(|| self.metadata.erofs_version()) + } + + /// Returns the effective [`FormatConfig`] for this repository. + /// + /// When a per-invocation version override is active (set via + /// [`set_erofs_version`](Self::set_erofs_version)), returns a single-version + /// config for that override — the override narrows generation to exactly one + /// format, discarding any `extra` versions from `meta.json`. + /// + /// Otherwise returns the full config from `meta.json`, including any `extra` + /// versions. For repositories created before the `erofs_formats` field was + /// added, the config is derived from the legacy `"v1_erofs"` ro_compat flag. + pub fn format_config(&self) -> FormatConfig { + if let Some(v) = self.erofs_version_override { + FormatConfig::single(v) + } else { + repo_format_config_from_meta(&self.metadata) + } + } + + /// Returns the primary [`FormatConfig`] configured for this repository. + /// + /// Alias for [`format_config`](Self::format_config). + pub fn default_format_config(&self) -> FormatConfig { + self.format_config() + } + /// Lists all named stream references under a given prefix. /// /// Returns (name, target) pairs where name is relative to the prefix. 
@@ -3492,7 +3723,11 @@ mod tests { /// Create a test repository in insecure mode (no fs-verity required). fn create_test_repo(path: &Path) -> Result>> { - let (repo, _) = Repository::init_path(CWD, path, Algorithm::SHA512, false)?; + let (repo, _) = Repository::init_path( + CWD, + path, + RepositoryConfig::new(Algorithm::SHA512).set_insecure(), + )?; Ok(Arc::new(repo)) } @@ -4002,6 +4237,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: Default::default(), } } @@ -4015,6 +4251,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: Default::default(), }, LeafContent::Regular(RegularFile::External(obj.clone(), size)), @@ -4037,7 +4274,7 @@ mod tests { let obj1_id = repo.ensure_object(&obj1)?; let obj2_id = repo.ensure_object(&obj2)?; - let fs = make_test_fs(&obj2_id, obj2_size); + let mut fs = make_test_fs(&obj2_id, obj2_size); let image1 = fs.commit_image(&repo, None)?; let image1_path = format!("images/{}", image1.to_hex()); @@ -4080,7 +4317,7 @@ mod tests { let obj1_id = repo.ensure_object(&obj1)?; let obj2_id = repo.ensure_object(&obj2)?; - let fs = make_test_fs(&obj2_id, obj2_size); + let mut fs = make_test_fs(&obj2_id, obj2_size); let image1 = fs.commit_image(&repo, None)?; let image1_path = format!("images/{}", image1.to_hex()); @@ -4129,7 +4366,7 @@ mod tests { let obj1_id = repo.ensure_object(&obj1)?; let obj2_id = repo.ensure_object(&obj2)?; - let fs = make_test_fs(&obj2_id, obj2_size); + let mut fs = make_test_fs(&obj2_id, obj2_size); let image1 = fs.commit_image(&repo, Some("ref-name"))?; let image1_path = format!("images/{}", image1.to_hex()); @@ -4177,6 +4414,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: Default::default(), }, LeafContent::Regular(RegularFile::External(obj2.clone(), size2)), @@ -4205,11 +4443,11 @@ mod tests { let obj3_id = repo.ensure_object(&obj3)?; let obj4_id = repo.ensure_object(&obj4)?; - let fs = make_test_fs_with_two_files(&obj2_id, 
obj2_size, &obj3_id, obj3_size); + let mut fs = make_test_fs_with_two_files(&obj2_id, obj2_size, &obj3_id, obj3_size); let image1 = fs.commit_image(&repo, None)?; let image1_path = format!("images/{}", image1.to_hex()); - let fs = make_test_fs_with_two_files(&obj2_id, obj2_size, &obj4_id, obj4_size); + let mut fs = make_test_fs_with_two_files(&obj2_id, obj2_size, &obj4_id, obj4_size); let image2 = fs.commit_image(&repo, None)?; let image2_path = format!("images/{}", image2.to_hex()); @@ -4657,7 +4895,7 @@ mod tests { let obj = generate_test_data(obj_size, 0xBB); let obj_id = repo.ensure_object(&obj)?; - let fs = make_test_fs(&obj_id, obj_size); + let mut fs = make_test_fs(&obj_id, obj_size); let image_id = fs.commit_image(&repo, None)?; repo.sync()?; @@ -4802,7 +5040,7 @@ mod tests { let obj = generate_test_data(obj_size, 0xCC); let obj_id = repo.ensure_object(&obj)?; - let fs = make_test_fs(&obj_id, obj_size); + let mut fs = make_test_fs(&obj_id, obj_size); let image_id = fs.commit_image(&repo, None)?; repo.sync()?; @@ -4841,8 +5079,7 @@ mod tests { #[tokio::test] async fn test_fsck_detects_corrupt_erofs_image() -> Result<()> { // Exercises fsck_image: corrupts the erofs image data so that - // parsing fails. The catch_unwind should catch the panic from - // the current erofs reader. + // parsing fails. fsck_image returns an error rather than panicking. let tmp = tempdir(); let repo = create_test_repo(&tmp.path().join("repo"))?; @@ -4850,7 +5087,7 @@ mod tests { let obj = generate_test_data(obj_size, 0xDD); let obj_id = repo.ensure_object(&obj)?; - let fs = make_test_fs(&obj_id, obj_size); + let mut fs = make_test_fs(&obj_id, obj_size); let image_id = fs.commit_image(&repo, None)?; repo.sync()?; @@ -4879,6 +5116,46 @@ mod tests { Ok(()) } + /// Helper to create a V1 (C-compatible) EROFS image and write it to the repo. 
+ fn commit_v1_image( + repo: &Repository, + obj_id: &Sha512HashValue, + obj_size: u64, + ) -> Result { + use crate::erofs::writer::{ValidatedFileSystem, mkfs_erofs_versioned}; + + let fs = make_test_fs(obj_id, obj_size); + let image_data = mkfs_erofs_versioned( + &mut ValidatedFileSystem::new(fs).unwrap(), + FormatVersion::V1, + ); + repo.write_image(None, &image_data) + } + + #[tokio::test] + async fn test_fsck_validates_v1_erofs_image() -> Result<()> { + // V1 images (C-compatible format) should pass fsck just like V2. + // This catches regressions where fsck or the reader doesn't handle + // compact inodes, BFS ordering, or the whiteout table. + let tmp = tempdir(); + let repo = create_test_repo(&tmp.path().join("repo"))?; + + let obj_size: u64 = 32 * 1024; + let obj = generate_test_data(obj_size, 0xBB); + let obj_id = repo.ensure_object(&obj)?; + + commit_v1_image(&repo, &obj_id, obj_size)?; + repo.sync()?; + + let result = repo.fsck().await?; + assert!( + result.is_ok(), + "V1 (C-compatible) erofs image should pass fsck: {result}" + ); + assert!(result.images_checked > 0, "should have checked the image"); + Ok(()) + } + // ---- Fsck metadata validation tests ---- #[tokio::test] @@ -4953,7 +5230,12 @@ mod tests { // Open a sha512 repo as sha256 → AlgorithmMismatch. let tmp = tempdir(); let path = tmp.path().join("sha512-repo"); - Repository::::init_path(CWD, &path, Algorithm::SHA512, false).unwrap(); + Repository::::init_path( + CWD, + &path, + RepositoryConfig::new(Algorithm::SHA512).set_insecure(), + ) + .unwrap(); assert!(matches!( Repository::::open_path(CWD, &path), Err(RepositoryOpenError::AlgorithmMismatch { .. 
}) @@ -5024,6 +5306,156 @@ mod tests { ); } + // ---- erofs_version / v1_erofs feature tests ---- + + #[test] + fn test_init_v1_repo_metadata() { + let meta = RepoMetadata::new_with_formats( + Algorithm::SHA256, + &FormatConfig::single(FormatVersion::V1), + ); + assert_eq!(meta.erofs_version(), FormatVersion::V1); + assert!( + meta.features + .read_only_compatible + .contains(&known_features::V1_EROFS.to_string()), + "V1 repo must list v1_erofs in ro_compat, got: {:?}", + meta.features.read_only_compatible + ); + } + + #[test] + fn test_init_v2_repo_metadata() { + let meta = RepoMetadata::new_with_formats( + Algorithm::SHA256, + &FormatConfig::single(FormatVersion::V2), + ); + assert_eq!(meta.erofs_version(), FormatVersion::V2); + assert!( + !meta + .features + .read_only_compatible + .contains(&known_features::V1_EROFS.to_string()), + "V2 repo must NOT list v1_erofs in ro_compat" + ); + } + + #[test] + fn test_init_path_erofs_version_mismatch() -> Result<()> { + let tmp = tempdir(); + let path = tmp.path().join("repo"); + + // First init: V1 + let config_v1 = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_formats: FormatConfig::single(FormatVersion::V1), + ..RepositoryConfig::default().set_insecure() + }; + Repository::::init_path(CWD, &path, config_v1)?; + + // Second init: V2 — should fail because meta.json already exists with V1 config + let config_v2 = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_formats: FormatConfig::single(FormatVersion::V2), + ..RepositoryConfig::default().set_insecure() + }; + let result = Repository::::init_path(CWD, &path, config_v2); + assert!( + result.is_err(), + "re-initializing with different erofs_version must fail" + ); + let err = result.unwrap_err(); + // Use the full chain representation so we see the inner bail! message, + // not just the outermost fn_error_context wrapper. 
+ let msg = format!("{err:#}"); + assert!( + msg.contains("erofs_version"), + "error message must mention erofs_version, got: {msg}" + ); + Ok(()) + } + + #[test] + fn test_init_path_same_erofs_version_is_idempotent() -> Result<()> { + let tmp = tempdir(); + let path = tmp.path().join("repo"); + + let config = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_formats: FormatConfig::single(FormatVersion::V1), + ..RepositoryConfig::default().set_insecure() + }; + let (_, was_new) = Repository::::init_path(CWD, &path, config.clone())?; + assert!(was_new, "first init must be fresh"); + + let (repo, was_new) = Repository::::init_path(CWD, &path, config)?; + assert!(!was_new, "second init with same config must be idempotent"); + assert_eq!(repo.erofs_version(), FormatVersion::V1); + Ok(()) + } + + #[test] + fn test_legacy_repo_defaults_to_v2() { + // A repo with no feature flags → no v1_erofs → derived version is V2. + let json = br#"{"version":1,"algorithm":"fsverity-sha256-12","features":{}}"#; + let meta: RepoMetadata = serde_json::from_slice(json).unwrap(); + assert_eq!( + meta.erofs_version(), + FormatVersion::V2, + "repo with no v1_erofs flag should derive V2" + ); + + // A repo with v1_erofs in ro_compat → derived version is V1. + let json_v1 = br#"{"version":1,"algorithm":"fsverity-sha256-12","features":{"read-only-compatible":["v1_erofs"]}}"#; + let meta_v1: RepoMetadata = serde_json::from_slice(json_v1).unwrap(); + assert_eq!( + meta_v1.erofs_version(), + FormatVersion::V1, + "repo with v1_erofs flag should derive V1" + ); + + // Old JSON that happens to have an erofs_version field (written by a previous + // version of this code) must deserialize successfully — serde ignores unknown fields. 
+ let json_old = + br#"{"version":1,"algorithm":"fsverity-sha256-12","features":{},"erofs_version":2}"#; + let meta_old: RepoMetadata = serde_json::from_slice(json_old).unwrap(); + assert_eq!( + meta_old.erofs_version(), + FormatVersion::V2, + "old JSON with explicit erofs_version field should still derive V2 from flags" + ); + } + + #[test] + fn test_old_tool_blocked_on_v1_repo() { + // Simulate an old tool that does not know about "v1_erofs". + // A V1 repo places "v1_erofs" in ro_compat, so any tool that + // does not recognise that feature must open the repo read-only. + // We model this by constructing the FeatureFlags directly and filtering + // against an empty ro_compat allowlist. + let features = FeatureFlags { + compatible: vec![], + read_only_compatible: vec![known_features::V1_EROFS.to_string()], + incompatible: vec![], + }; + + // An unknown ro_compat feature must not prevent opening, but must + // signal read-only access. + let unknown_ro: Vec = features + .read_only_compatible + .iter() + .filter(|f| ![].contains(&f.as_str())) // empty old-tool allowlist + .cloned() + .collect(); + assert_eq!( + unknown_ro, + vec![known_features::V1_EROFS.to_string()], + "old tool should see v1_erofs as an unknown ro_compat feature" + ); + // And the current tool knows about it, so check() returns ReadWrite. + assert_eq!(features.check().unwrap(), FeatureCheck::ReadWrite); + } + #[test] fn test_object_store_method_variants() { // Verify all variants exist and are distinct @@ -5057,9 +5489,12 @@ mod tests { // Create a repo, store an object, then remove meta.json to // simulate an old-format repository. 
- let (repo, _) = - Repository::::init_path(CWD, &repo_path, Algorithm::SHA256, false) - .unwrap(); + let (repo, _) = Repository::::init_path( + CWD, + &repo_path, + RepositoryConfig::default().set_insecure(), + ) + .unwrap(); let data = b"hello world"; let obj_id = repo.ensure_object(data).unwrap(); drop(repo); @@ -5108,9 +5543,12 @@ mod tests { let tmp = tempdir(); let repo_path = tmp.path().join("repo"); - let (repo, _) = - Repository::::init_path(CWD, &repo_path, Algorithm::SHA512, false) - .unwrap(); + let (repo, _) = Repository::::init_path( + CWD, + &repo_path, + RepositoryConfig::new(Algorithm::SHA512).set_insecure(), + ) + .unwrap(); let data = b"sha512 test data"; let obj_id = repo.ensure_object(data).unwrap(); drop(repo); @@ -5145,9 +5583,12 @@ mod tests { let tmp = tempdir(); let repo_path = tmp.path().join("repo"); - let (repo, _) = - Repository::::init_path(CWD, &repo_path, Algorithm::SHA512, false) - .unwrap(); + let (repo, _) = Repository::::init_path( + CWD, + &repo_path, + RepositoryConfig::new(Algorithm::SHA512).set_insecure(), + ) + .unwrap(); repo.ensure_object(b"some data").unwrap(); drop(repo); @@ -5185,11 +5626,340 @@ mod tests { let tmp = tempdir(); let repo_path = tmp.path().join("repo"); - Repository::::init_path(CWD, &repo_path, Algorithm::SHA256, false) - .unwrap(); + Repository::::init_path( + CWD, + &repo_path, + RepositoryConfig::default().set_insecure(), + ) + .unwrap(); let (_repo, upgraded) = Repository::::open_upgrade(CWD, &repo_path).unwrap(); assert!(!upgraded); } + + #[tokio::test] + async fn test_fsck_v1_image_detects_missing_object() -> Result<()> { + // Same as test_fsck_validates_erofs_image_objects but with a V1 image, + // ensuring fsck correctly parses V1 images to find object references. 
+ let tmp = tempdir(); + let repo = create_test_repo(&tmp.path().join("repo"))?; + + let obj_size: u64 = 32 * 1024; + let obj = generate_test_data(obj_size, 0xBC); + let obj_id = repo.ensure_object(&obj)?; + + commit_v1_image(&repo, &obj_id, obj_size)?; + repo.sync()?; + + // Sanity: passes before we break it + let result = repo.fsck().await?; + assert!( + result.is_ok(), + "healthy V1 image should pass fsck: {result}" + ); + + // Delete the referenced object + let hex = obj_id.to_hex(); + let (prefix, rest) = hex.split_at(2); + let dir = open_test_repo_dir(&tmp); + dir.remove_file(format!("objects/{prefix}/{rest}"))?; + + let result = repo.fsck().await?; + assert!( + !result.is_ok(), + "fsck should detect missing object in V1 erofs image: {result}" + ); + assert!( + result.missing_objects > 0, + "should report missing objects: {result}" + ); + Ok(()) + } + + // ---- FormatConfig / v1_erofs feature flag tests ---- + // + // The `v1_erofs` ro_compat flag is the legacy on-disk signal for V1 EROFS. + // It is derived from `erofs_formats.default`; the full config, including + // `extra`, is persisted in the `erofs_formats` field of `meta.json`.
+ + #[test] + fn test_v1_erofs_flag_set_for_v1_repos() { + // V1 primary → v1_erofs present + let meta = RepoMetadata::new_with_formats( + Algorithm::SHA256, + &FormatConfig::single(FormatVersion::V1), + ); + assert!( + meta.features + .read_only_compatible + .contains(&known_features::V1_EROFS.to_string()), + "V1 repo must set v1_erofs in ro_compat, got: {:?}", + meta.features.read_only_compatible + ); + assert_eq!(meta.erofs_version(), FormatVersion::V1); + } + + #[test] + fn test_v1_erofs_flag_absent_for_v2_repos() { + // V2 primary → v1_erofs absent + let meta = RepoMetadata::new_with_formats( + Algorithm::SHA256, + &FormatConfig::single(FormatVersion::V2), + ); + assert!( + !meta + .features + .read_only_compatible + .contains(&known_features::V1_EROFS.to_string()), + "V2 repo must NOT set v1_erofs in ro_compat, got: {:?}", + meta.features.read_only_compatible + ); + assert_eq!(meta.erofs_version(), FormatVersion::V2); + } + + #[test] + fn test_default_format_config_from_v1_erofs_flag() { + // v1_erofs present → V1 primary + let meta_v1 = RepoMetadata::new_with_formats( + Algorithm::SHA256, + &FormatConfig::single(FormatVersion::V1), + ); + assert_eq!( + repo_format_config_from_meta(&meta_v1), + FormatConfig::single(FormatVersion::V1) + ); + + // v1_erofs absent → V2 primary + let meta_v2 = RepoMetadata::new_with_formats( + Algorithm::SHA256, + &FormatConfig::single(FormatVersion::V2), + ); + assert_eq!( + repo_format_config_from_meta(&meta_v2), + FormatConfig::single(FormatVersion::V2) + ); + } + + #[test] + fn test_init_path_v1_format_config() -> Result<()> { + let tmp = tempdir(); + let path = tmp.path().join("repo"); + + let config = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_formats: FormatConfig::single(FormatVersion::V1), + ..RepositoryConfig::default().set_insecure() + }; + let (repo, was_new) = Repository::::init_path(CWD, &path, config)?; + assert!(was_new); + assert_eq!(repo.erofs_version(), FormatVersion::V1); + assert_eq!( + 
repo.default_format_config(), + FormatConfig::single(FormatVersion::V1) + ); + assert!( + repo.metadata() + .features + .read_only_compatible + .contains(&known_features::V1_EROFS.to_string()), + "v1_erofs must be in ro_compat for V1 repos" + ); + Ok(()) + } + + #[test] + fn test_init_path_v2_format_config() -> Result<()> { + let tmp = tempdir(); + let path = tmp.path().join("repo"); + + let config = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_formats: FormatConfig::single(FormatVersion::V2), + ..RepositoryConfig::default().set_insecure() + }; + let (repo, was_new) = Repository::::init_path(CWD, &path, config)?; + assert!(was_new); + assert_eq!(repo.erofs_version(), FormatVersion::V2); + assert!( + !repo + .metadata() + .features + .read_only_compatible + .contains(&known_features::V1_EROFS.to_string()), + "v1_erofs must NOT be in ro_compat for V2 repos" + ); + Ok(()) + } + + /// Verify `commit_images` with a dual-format config: + /// - both ObjectIDs are in the returned map, + /// - both image symlinks exist in `images/`, + /// - the named ref points to the V1 image (the default / primary version). + #[test] + fn test_commit_images_both_named_ref_points_to_v1() -> Result<()> { + use crate::tree::{FileSystem, Stat}; + + let tmp = tempdir(); + let repo_path = tmp.path().join("repo"); + // V1 is default (gets the named ref); V2 is extra. + let dual_fmt = FormatConfig { + default: FormatVersion::V1, + extra: [FormatVersion::V2].into(), + }; + let config = RepositoryConfig { + algorithm: Algorithm::SHA256, + erofs_formats: dual_fmt, + ..RepositoryConfig::default().set_insecure() + }; + let (repo, _) = Repository::::init_path(CWD, &repo_path, config)?; + + // Build a minimal filesystem (empty root dir is enough). 
+ let root_stat = Stat { + st_mode: 0o755, + st_uid: 0, + st_gid: 0, + st_mtim_sec: 0, + st_mtim_nsec: 0, + xattrs: Default::default(), + }; + let mut fs: FileSystem = FileSystem::new(root_stat); + + // commit_images now reads the FormatConfig from the repository itself. + let map = fs.commit_images(&repo, Some("myref"))?; + repo.sync()?; + + // Both versions must be in the result. + let v1_id = map + .get(&FormatVersion::V1) + .expect("V1 must be in result map"); + let v2_id = map + .get(&FormatVersion::V2) + .expect("V2 must be in result map"); + + // Both image symlinks must exist under images/. + let v1_image_path = format!("images/{}", v1_id.to_hex()); + let v2_image_path = format!("images/{}", v2_id.to_hex()); + assert!( + test_path_exists_in_repo(&tmp, &v1_image_path)?, + "V1 image symlink must exist: {v1_image_path}" + ); + assert!( + test_path_exists_in_repo(&tmp, &v2_image_path)?, + "V2 image symlink must exist: {v2_image_path}" + ); + + // The named ref must exist and must point to the V1 image (primary). + let ref_path = "images/refs/myref"; + assert!( + test_path_exists_in_repo(&tmp, ref_path)?, + "named ref images/refs/myref must exist" + ); + // The ref symlink target should contain the V1 image hex, not V2. + let ref_full = tmp.path().join("repo").join(ref_path); + let target = readlinkat(CWD, &ref_full, Vec::new())?; + let target_str = target.to_str()?; + assert!( + target_str.contains(&v1_id.to_hex()), + "named ref must point to V1 image ({}), but points to: {target_str}", + v1_id.to_hex() + ); + assert!( + !target_str.contains(&v2_id.to_hex()), + "named ref must NOT point to V2 image, but points to: {target_str}" + ); + Ok(()) + } + + #[test] + fn test_meta_json_shapes() { + // Verify the serialized JSON shapes for each repo type. 
+ let v2 = RepoMetadata::new_with_formats( + Algorithm::SHA256, + &FormatConfig::single(FormatVersion::V2), + ); + let v2_json = String::from_utf8(v2.to_json().unwrap()).unwrap(); + // erofs_formats field must be present with default:2, no extra key + assert!( + v2_json.contains("\"erofs_formats\""), + "V2 meta.json must contain erofs_formats, got: {v2_json}" + ); + assert!( + v2_json.contains("\"default\": 2"), + "V2 meta.json must have default:2, got: {v2_json}" + ); + assert!( + !v2_json.contains("\"extra\""), + "V2 single-format meta.json must not have extra key, got: {v2_json}" + ); + + let v1 = RepoMetadata::new_with_formats( + Algorithm::SHA256, + &FormatConfig::single(FormatVersion::V1), + ); + let v1_json = String::from_utf8(v1.to_json().unwrap()).unwrap(); + assert!( + v1_json.contains("\"erofs_formats\""), + "V1 meta.json must contain erofs_formats, got: {v1_json}" + ); + assert!( + v1_json.contains("\"default\": 1"), + "V1 meta.json must have default:1, got: {v1_json}" + ); + assert!( + !v1_json.contains("\"extra\""), + "V1 single-format meta.json must not have extra key, got: {v1_json}" + ); + assert!( + v1_json.contains("v1_erofs"), + "V1 meta.json must still contain v1_erofs flag for old-tool compat, got: {v1_json}" + ); + + let dual = RepoMetadata::new_with_formats( + Algorithm::SHA256, + &FormatConfig { + default: FormatVersion::V1, + extra: [FormatVersion::V2].into(), + }, + ); + let dual_json = String::from_utf8(dual.to_json().unwrap()).unwrap(); + assert!( + dual_json.contains("\"erofs_formats\""), + "dual meta.json must contain erofs_formats, got: {dual_json}" + ); + assert!( + dual_json.contains("\"default\": 1"), + "dual meta.json must have default:1, got: {dual_json}" + ); + assert!( + dual_json.contains("\"extra\""), + "dual meta.json must have extra key, got: {dual_json}" + ); + assert!( + dual_json.contains("v1_erofs"), + "dual meta.json must still contain v1_erofs flag, got: {dual_json}" + ); + + // Verify round-trip: parse back and 
check format_config() + let v2_parsed: RepoMetadata = serde_json::from_str(&v2_json).unwrap(); + assert_eq!( + v2_parsed.format_config(), + FormatConfig::single(FormatVersion::V2) + ); + + let v1_parsed: RepoMetadata = serde_json::from_str(&v1_json).unwrap(); + assert_eq!( + v1_parsed.format_config(), + FormatConfig::single(FormatVersion::V1) + ); + + let dual_parsed: RepoMetadata = serde_json::from_str(&dual_json).unwrap(); + assert_eq!( + dual_parsed.format_config(), + FormatConfig { + default: FormatVersion::V1, + extra: [FormatVersion::V2].into(), + } + ); + } } diff --git a/crates/composefs/src/splitstream.rs b/crates/composefs/src/splitstream.rs index 6b338dee..8fd45f0b 100644 --- a/crates/composefs/src/splitstream.rs +++ b/crates/composefs/src/splitstream.rs @@ -1168,8 +1168,11 @@ mod tests { /// Create a test repository in insecure mode (no fs-verity required). fn create_test_repo(path: &Path) -> Result>> { - let (repo, _) = - Repository::init_path(CWD, path, crate::fsverity::Algorithm::SHA256, false)?; + let (repo, _) = Repository::init_path( + CWD, + path, + crate::repository::RepositoryConfig::default().set_insecure(), + )?; Ok(Arc::new(repo)) } diff --git a/crates/composefs/src/test.rs b/crates/composefs/src/test.rs index 36ac600a..a14736e0 100644 --- a/crates/composefs/src/test.rs +++ b/crates/composefs/src/test.rs @@ -13,7 +13,10 @@ use once_cell::sync::Lazy; use rustix::fs::CWD; use tempfile::TempDir; -use crate::{fsverity::FsVerityHashValue, repository::Repository}; +use crate::{ + fsverity::FsVerityHashValue, + repository::{Repository, RepositoryConfig}, +}; static TMPDIR: Lazy = Lazy::new(|| { if let Some(path) = std::env::var_os("CFS_TEST_TMPDIR") { @@ -67,8 +70,12 @@ impl TestRepo { pub fn new() -> Self { let dir = tempdir(); let repo_path = dir.path().join("repo"); - let (repo, _) = Repository::init_path(CWD, &repo_path, ObjectID::ALGORITHM, false) - .expect("initializing test repo"); + let (repo, _) = Repository::init_path( + CWD, + 
&repo_path, + RepositoryConfig::new(ObjectID::ALGORITHM).set_insecure(), + ) + .expect("initializing test repo"); Self { repo: Arc::new(repo), repo_path, @@ -143,19 +150,37 @@ pub(crate) mod proptest_strategies { /// /// Linux filenames are arbitrary bytes except `/` (0x2F) and `\0` (0x00), /// with a max length of [`NAME_MAX`] (255) bytes. We generate a mix of - /// ASCII names and binary names, occasionally long, to exercise directory - /// entry layout edge cases. + /// lengths to exercise directory entry layout edge cases: + /// + /// - Short ASCII (common case) + /// - Binary bytes (no NUL or `/`) + /// - Long ASCII (crosses xattr/inode inline-data boundaries) + /// - Near-NAME_MAX: lengths 252–255 exercise all four 4-byte padding + /// residues in the erofs directory entry format (names are padded to the + /// next 4-byte boundary, so a 255-byte name has 1 pad byte, 254 has 2, + /// 253 has 3, 252 has 0) + /// - Exactly NAME_MAX (255 bytes): the hard limit pub fn filename() -> impl Strategy { prop_oneof![ // Short ASCII names (common case) - 6 => proptest::string::string_regex("[a-zA-Z0-9._-]{1,20}") + 5 => proptest::string::string_regex("[a-zA-Z0-9._-]{1,20}") .expect("valid regex") .prop_map(OsString::from), // Binary names with arbitrary bytes (no NUL or /) - 3 => prop::collection::vec(1..=0xFEu8, 1..=30) + 2 => prop::collection::vec(1..=0xFEu8, 1..=30) .prop_map(|mut v| { v.iter_mut().for_each(|b| if *b == b'/' { *b = b'_' }); OsString::from_vec(v) }), - // Long ASCII names (up to NAME_MAX) - 1 => proptest::string::string_regex(&format!("[a-zA-Z0-9._-]{{100,{NAME_MAX}}}")) + // Long ASCII names (100..=251) — crosses inline-data boundaries + 1 => proptest::string::string_regex("[a-zA-Z0-9._-]{100,251}") + .expect("valid regex") + .prop_map(OsString::from), + // Near-NAME_MAX (252–254): all four mod-4 padding residues in erofs dirents + 1 => (252usize..=254).prop_flat_map(|len| { + proptest::string::string_regex(&format!("[a-zA-Z0-9._-]{{{len}}}")) + 
.expect("valid regex") + .prop_map(OsString::from) + }), + // Exactly NAME_MAX (255): the hard limit + 1 => proptest::string::string_regex(&format!("[a-zA-Z0-9._-]{{{NAME_MAX}}}")) .expect("valid regex") .prop_map(OsString::from), ] @@ -166,29 +191,38 @@ pub(crate) mod proptest_strategies { pub fn stat() -> impl Strategy { ( 0..=0o7777u32, // permission bits - 0..=65535u32, // uid - 0..=65535u32, // gid - 0..=2_000_000_000i64, // mtime + 0..=131071u32, // uid — crosses u16::MAX to exercise extended inodes + 0..=131071u32, // gid — crosses u16::MAX to exercise extended inodes + 0..=2_000_000_000i64, // mtime sec + 0..1_000_000_000u32, // mtime nsec xattrs(), ) - .prop_map(|(mode, uid, gid, mtime, xattrs)| tree::Stat { - st_mode: mode, - st_uid: uid, - st_gid: gid, - st_mtim_sec: mtime, - xattrs, - }) + .prop_map( + |(mode, uid, gid, mtime_sec, mtime_nsec, xattrs)| tree::Stat { + st_mode: mode, + st_uid: uid, + st_gid: gid, + st_mtim_sec: mtime_sec, + st_mtim_nsec: mtime_nsec, + xattrs, + }, + ) } /// Strategy for xattr keys covering all erofs prefix namespaces. /// /// The erofs format uses prefix indices to compress xattr names: - /// 0 = "" (fallback), 1 = "user.", 2 = "system.posix_acl_access", + /// 0 = "" (fallback, for unrecognized prefixes like com.example.*), + /// 1 = "user.", 2 = "system.posix_acl_access", /// 3 = "system.posix_acl_default", 4 = "trusted.", 5 = "lustre.", /// 6 = "security." /// /// The writer also escapes `trusted.overlay.*` → `trusted.overlay.overlay.*`, /// so we must test that path too. + /// + /// `lustre.*` keys are included here. For V1 images the writer skips index 5 during + /// prefix matching, so lustre.* xattrs fall through to prefix index 0 (raw fallback), + /// matching C mkcomposefs v1.0.8 behavior. 
fn xattr_key() -> impl Strategy { prop_oneof![ // user.* namespace (index 1) — most common @@ -214,6 +248,16 @@ pub(crate) mod proptest_strategies { 1 => Just("system.posix_acl_access".to_string()), // system.posix_acl_default (index 3) — exact name, no suffix 1 => Just("system.posix_acl_default".to_string()), + // Fallback prefix (index 0) — unrecognized prefix, full key stored as suffix. + // Both Rust and C agree on index 0 for these keys. + 1 => (0..3u32).prop_map(|n| format!("com.example.test_{n}")), + // lustre.* (index 5 in EROFS spec, but index 0 in C mkcomposefs v1.0.8). + // For V1 images, the writer skips index 5 so lustre.* falls through to index 0, + // matching C behavior for binary compatibility. + 1 => prop_oneof![ + Just("lustre.lov".to_string()), + Just("lustre.lma".to_string()), + ], ] } @@ -232,6 +276,58 @@ pub(crate) mod proptest_strategies { }) } + /// Strategy for xattr keys that stress corner cases in the V1 writer: + /// - Multiple `trusted.overlay.*` keys → all get escaped on disk + /// - `trusted.overlay.overlay.X` → double-escaped to `trusted.overlay.overlay.overlay.X` + /// - `security.selinux` + `security.ima` combinations + /// - `system.posix_acl_access` → triggers LCFS_EROFS_FLAGS_HAS_ACL header bit + fn xattr_key_unusual() -> impl Strategy { + prop_oneof![ + // trusted.overlay.* — each gets escaped to trusted.overlay.overlay.* on disk + 4 => prop_oneof![ + Just("trusted.overlay.custom".to_string()), + Just("trusted.overlay.origin".to_string()), + Just("trusted.overlay.upper".to_string()), + Just("trusted.overlay.redirect".to_string()), + Just("trusted.overlay.nfs_fh".to_string()), + ], + // Already-escaped key: trusted.overlay.overlay.X → double-escape on disk + 2 => Just("trusted.overlay.overlay.nested".to_string()), + // security.* — two labels on same inode + 3 => prop_oneof![ + Just("security.selinux".to_string()), + Just("security.ima".to_string()), + Just("security.capability".to_string()), + ], + // ACL — triggers 
LCFS_EROFS_FLAGS_HAS_ACL + 2 => Just("system.posix_acl_access".to_string()), + // user.* — filler + 1 => proptest::string::string_regex("user\\.[a-z]{1,10}") + .expect("valid regex"), + ] + } + + /// Xattr strategy for the unusual generator: 2–8 xattr pairs (key collisions are silently deduplicated by BTreeMap) with long values allowed. + fn xattrs_unusual() -> impl Strategy, Box<[u8]>>> { + prop::collection::vec( + ( + xattr_key_unusual(), + // Mix of short and long values — long values stress xattr dedup/block layout + prop_oneof![ + 3 => prop::collection::vec(any::(), 0..=20), + 1 => prop::collection::vec(any::(), 64..=512), + ], + ), + 2..=8, + ) + .prop_map(|pairs| { + pairs + .into_iter() + .map(|(k, v)| (OsStr::new(&k).into(), v.into_boxed_slice())) + .collect() + }) + } + /// Strategy for symlink targets as OsString. /// /// Symlink targets on Linux are arbitrary bytes except `\0`, up to @@ -256,7 +352,7 @@ pub(crate) mod proptest_strategies { /// /// External file references store raw hash bytes rather than a concrete /// `ObjectID` type, so the same spec works with any hash algorithm. - #[derive(Debug)] + #[derive(Debug, Clone)] pub enum LeafContentSpec { Inline(Vec), /// External file: random hash bytes (truncated to hash size at build time) and size. @@ -265,6 +361,10 @@ pub(crate) mod proptest_strategies { BlockDevice(u64), CharacterDevice(u64), Fifo, + Socket, + /// Overlay whiteout: char device with rdev=0. Always maps to CharacterDevice(0). + /// Distinct from CharacterDevice(rdev) to allow weighted generation. + Whiteout, } /// Strategy for hash-type-agnostic leaf content. @@ -274,7 +374,7 @@ pub(crate) mod proptest_strategies { // Inline file data is capped at INLINE_CONTENT_MAX_V0 (64 bytes) to match // the composefs invariant: larger files must be external (ChunkBased). 
( - 0..10u8, + 0..11u8, prop::collection::vec(any::(), 0..=INLINE_CONTENT_MAX_V0), symlink_target(), prop::collection::vec(any::(), 64..=64), @@ -288,13 +388,14 @@ pub(crate) mod proptest_strategies { 5..=6 => LeafContentSpec::Symlink(symlink_target), 7 => LeafContentSpec::BlockDevice(rdev), 8 => LeafContentSpec::CharacterDevice(rdev), - _ => LeafContentSpec::Fifo, + 9 => LeafContentSpec::Fifo, + _ => LeafContentSpec::Socket, }, ) } /// A hash-type-agnostic leaf node specification. - #[derive(Debug)] + #[derive(Debug, Clone)] pub struct LeafSpec { pub stat: tree::Stat, pub content: LeafContentSpec, @@ -305,8 +406,17 @@ pub(crate) mod proptest_strategies { } /// Strategy for a list of uniquely-named leaf specs. - fn named_leaf_specs(max_entries: usize) -> impl Strategy> { - prop::collection::vec((filename(), leaf_spec()), 0..=max_entries).prop_map(|entries| { + /// Strategy for a list of uniquely-named leaf specs with a given entry count range. + /// + /// The `min..=max` range controls how many entries are attempted before + /// deduplication. Use `named_leaf_specs(0, 30)` for a small directory and + /// `named_leaf_specs(150, 300)` to reliably cross a 4 KiB directory block + /// boundary (~170 entries with typical short names × ~20 bytes each). + fn named_leaf_specs( + min: usize, + max: usize, + ) -> impl Strategy> { + prop::collection::vec((filename(), leaf_spec()), min..=max).prop_map(|entries| { let mut seen = std::collections::HashSet::new(); entries .into_iter() @@ -316,7 +426,7 @@ pub(crate) mod proptest_strategies { } /// Description of a directory to be built, including potential hardlinks. - #[derive(Debug)] + #[derive(Debug, Clone)] pub struct DirSpec { /// Stat metadata for this directory. pub stat: tree::Stat, @@ -327,7 +437,7 @@ pub(crate) mod proptest_strategies { } /// Description of a filesystem to be built, with hardlink info. - #[derive(Debug)] + #[derive(Debug, Clone)] pub struct FsSpec { /// Root directory specification. 
pub root: DirSpec, @@ -344,9 +454,35 @@ pub(crate) mod proptest_strategies { pub link_name: OsString, } + /// Hardlink spec for the unusual generator: places hardlink in root or a named subdir. + /// `target_dir_index: None` → root; `Some(i)` → subdirs[i % subdirs.len()]`. + #[derive(Debug, Clone)] + pub struct UnusualHardlinkSpec { + /// Index into the flat all-leaves list (root leaves first, then subdir leaves in order). + pub source_leaf_index: usize, + /// Name for the hardlink entry. + pub link_name: OsString, + /// Which directory receives the hardlink entry. + pub target_dir_index: Option, + } + + /// Filesystem description for the unusual generator. + #[derive(Debug, Clone)] + pub struct UnusualFsSpec { + pub root: DirSpec, + pub hardlinks: Vec, + } + /// Strategy for a subdirectory (no further nesting). + /// + /// Usually small (0–20 entries), but 1-in-4 times generates a large + /// directory (150–300 entries) to exercise multi-block directory layout. fn subdir_spec() -> impl Strategy { - (filename(), stat(), named_leaf_specs(10)).prop_map(|(name, stat, leaves)| { + let leaves_strat = prop_oneof![ + 3 => named_leaf_specs(0, 20), + 1 => named_leaf_specs(150, 300), + ]; + (filename(), stat(), leaves_strat).prop_map(|(name, stat, leaves)| { ( name, DirSpec { @@ -370,14 +506,19 @@ pub(crate) mod proptest_strategies { /// Strategy for generating a complete `FsSpec`. /// - /// Generates a root directory with up to 15 file entries and up to 5 - /// subdirectories (each with up to 10 entries, max depth 2). Then - /// optionally generates 0-3 hardlinks that reference existing leaves. + /// Root directory entry count is weighted: usually small (0–30), but + /// 1-in-4 times large (150–300) to reliably cross the 4 KiB directory + /// block boundary. Subdirectories use the same weighted split inside + /// `subdir_spec`. 
pub fn filesystem_spec() -> impl Strategy { + let root_leaves_strat = prop_oneof![ + 3 => named_leaf_specs(0, 30), + 1 => named_leaf_specs(150, 300), + ]; ( stat(), - named_leaf_specs(15), - unique_subdirs(5), + root_leaves_strat, + unique_subdirs(10), // Hardlink candidates: (source index placeholder, link name) prop::collection::vec((any::(), filename()), 0..=3), ) @@ -420,6 +561,153 @@ pub(crate) mod proptest_strategies { ) } + /// Strategy for the "unusual content" proptest generator. + /// + /// Explicitly constructs filesystem trees that stress corner cases in the V1 writer: + /// - Whiteout files (rdev=0 char devices) at root and in subdirs + /// - Multiple trusted.overlay.* xattrs per inode (escape path) + /// - Large external file sizes (up to 30 GB) + /// - Hardlinks across all leaf types and directories (post-generation pass) + pub fn unusual_filesystem_spec() -> impl Strategy { + fn unusual_stat() -> impl Strategy { + ( + 0u32..=0o7777u32, + 0u32..=131071u32, + 0u32..=131071u32, + 0u64..=u32::MAX as u64, + 0u32..=999_999_999u32, + xattrs_unusual(), + ) + .prop_map(|(mode, uid, gid, mtime_sec, mtime_nsec, xattrs)| { + tree::Stat { + st_mode: mode, + st_uid: uid, + st_gid: gid, + st_mtim_sec: mtime_sec as i64, + st_mtim_nsec: mtime_nsec, + xattrs, + } + }) + } + + fn unusual_leaf_content_spec() -> impl Strategy { + let hash_bytes = prop::collection::vec(any::(), 64..=64); + let ext_size = prop_oneof![ + 5 => 1u64..=1_000_000u64, + 3 => 1_000_001u64..=100_000_000u64, + 2 => 100_000_001u64..=30_000_000_000u64, + ]; + ( + 0u8..=10u8, + prop::collection::vec(any::(), 0..=INLINE_CONTENT_MAX_V0), + symlink_target(), + hash_bytes, + ext_size, + 1u64..=65535u64, + ) + .prop_map( + |(tag, file_data, symlink_target, hash_bytes, ext_size, rdev)| match tag { + 0..=1 => LeafContentSpec::Inline(file_data), + 2..=3 => LeafContentSpec::External(hash_bytes, ext_size), + 4..=5 => LeafContentSpec::Symlink(symlink_target), + 6..=7 => LeafContentSpec::Whiteout, + 8 => 
LeafContentSpec::BlockDevice(rdev), + 9 => LeafContentSpec::Fifo, + _ => LeafContentSpec::Socket, + }, + ) + } + + fn unusual_leaf_spec() -> impl Strategy { + (unusual_stat(), unusual_leaf_content_spec()) + .prop_map(|(stat, content)| LeafSpec { stat, content }) + } + + fn unusual_named_leaves(max: usize) -> impl Strategy> { + prop::collection::vec((filename(), unusual_leaf_spec()), 0..=max).prop_map(|entries| { + let mut seen = std::collections::HashSet::new(); + entries + .into_iter() + .filter(|(name, _)| seen.insert(name.clone())) + .collect() + }) + } + + fn unusual_subdir_spec() -> impl Strategy { + (filename(), unusual_stat(), unusual_named_leaves(10)).prop_map( + |(name, stat, leaves)| { + ( + name, + DirSpec { + stat, + leaves, + subdirs: vec![], + }, + ) + }, + ) + } + + fn unusual_unique_subdirs(max: usize) -> impl Strategy> { + prop::collection::vec(unusual_subdir_spec(), 0..=max).prop_map(|dirs| { + let mut seen = std::collections::HashSet::new(); + dirs.into_iter() + .filter(|(name, _)| seen.insert(name.clone())) + .collect() + }) + } + + ( + unusual_stat(), + unusual_named_leaves(15), + unusual_unique_subdirs(5), + prop::collection::vec((any::(), filename(), any::()), 0..=5), + ) + .prop_map( + |(root_stat, mut root_leaves, mut root_subdirs, hl_candidates)| { + let mut seen: std::collections::HashSet = + std::collections::HashSet::new(); + root_subdirs.retain(|(name, _)| seen.insert(name.clone())); + root_leaves.retain(|(name, _)| seen.insert(name.clone())); + + let root_leaf_count = root_leaves.len(); + let total_leaves: usize = root_leaf_count + + root_subdirs + .iter() + .map(|(_, d)| d.leaves.len()) + .sum::(); + + let hardlinks = if total_leaves > 0 { + hl_candidates + .into_iter() + .map(|(src_idx, name, dir_idx)| UnusualHardlinkSpec { + source_leaf_index: src_idx % total_leaves, + link_name: name, + target_dir_index: if root_subdirs.is_empty() { + None + } else if dir_idx % 2 == 0 { + None + } else { + Some(dir_idx % root_subdirs.len()) + }, + 
}) + .collect() + } else { + vec![] + }; + + UnusualFsSpec { + root: DirSpec { + stat: root_stat, + leaves: root_leaves, + subdirs: root_subdirs, + }, + hardlinks, + } + }, + ) + } + /// Convert a `LeafContentSpec` into a concrete `tree::LeafContent`. fn build_leaf_content( spec: LeafContentSpec, @@ -440,6 +728,8 @@ pub(crate) mod proptest_strategies { LeafContentSpec::BlockDevice(rdev) => tree::LeafContent::BlockDevice(rdev), LeafContentSpec::CharacterDevice(rdev) => tree::LeafContent::CharacterDevice(rdev), LeafContentSpec::Fifo => tree::LeafContent::Fifo, + LeafContentSpec::Socket => tree::LeafContent::Socket, + LeafContentSpec::Whiteout => tree::LeafContent::CharacterDevice(0), } } @@ -489,4 +779,79 @@ pub(crate) mod proptest_strategies { fs } + + /// Build a `tree::FileSystem` from an `UnusualFsSpec`. + /// + /// Handles post-generation hardlink injection: hardlinks can target any leaf type + /// (symlinks, whiteouts, devices, FIFOs) and can be placed in root or any subdir. + pub fn build_unusual_filesystem( + spec: UnusualFsSpec, + ) -> tree::FileSystem { + let mut fs = tree::FileSystem::new(spec.root.stat); + + let mut all_leaf_ids: Vec = Vec::new(); + let mut root_used_names: std::collections::HashSet = + std::collections::HashSet::new(); + + // Insert root leaves + for (name, leaf_spec) in spec.root.leaves { + let leaf_id = fs.push_leaf(leaf_spec.stat, build_leaf_content(leaf_spec.content)); + all_leaf_ids.push(leaf_id); + root_used_names.insert(name.clone()); + fs.root.insert(&name, tree::Inode::leaf(leaf_id)); + } + + // Remember subdir names and per-subdir used-name sets for hardlink dedup + let mut subdir_names: Vec = Vec::new(); + let mut subdir_used_names: Vec> = Vec::new(); + + for (dir_name, dir_spec) in spec.root.subdirs { + subdir_names.push(dir_name.clone()); + let mut used: std::collections::HashSet = std::collections::HashSet::new(); + let mut subdir = tree::Directory::new(dir_spec.stat); + for (name, leaf_spec) in dir_spec.leaves { + let 
leaf_id = fs.push_leaf(leaf_spec.stat, build_leaf_content(leaf_spec.content)); + all_leaf_ids.push(leaf_id); + used.insert(name.clone()); + subdir.insert(&name, tree::Inode::leaf(leaf_id)); + } + subdir_used_names.push(used); + root_used_names.insert(dir_name.clone()); + fs.root + .insert(&dir_name, tree::Inode::Directory(Box::new(subdir))); + } + + // Post-generation hardlink pass: inject hardlinks to any leaf type, any dir. + // Whiteouts (chardev rdev=0) are excluded: hardlinked whiteouts are invalid. + let non_whiteout_leaf_ids: Vec = all_leaf_ids + .iter() + .copied() + .filter(|&id| !matches!(fs.leaf(id).content, tree::LeafContent::CharacterDevice(0))) + .collect(); + if !non_whiteout_leaf_ids.is_empty() { + for hl in spec.hardlinks { + let leaf_id = + non_whiteout_leaf_ids[hl.source_leaf_index % non_whiteout_leaf_ids.len()]; + match hl.target_dir_index { + None => { + if root_used_names.insert(hl.link_name.clone()) { + fs.root.insert(&hl.link_name, tree::Inode::leaf(leaf_id)); + } + } + Some(raw_idx) => { + let idx = raw_idx % subdir_names.len(); + if subdir_used_names[idx].insert(hl.link_name.clone()) { + if let Ok(subdir) = + fs.root.get_directory_mut(subdir_names[idx].as_os_str()) + { + subdir.insert(&hl.link_name, tree::Inode::leaf(leaf_id)); + } + } + } + } + } + } + + fs + } } diff --git a/crates/composefs/src/tree.rs b/crates/composefs/src/tree.rs index dd8865d4..ddfc61bf 100644 --- a/crates/composefs/src/tree.rs +++ b/crates/composefs/src/tree.rs @@ -57,6 +57,7 @@ mod tests { st_uid: 1000, st_gid: 1000, st_mtim_sec: mtime, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } @@ -76,6 +77,7 @@ mod tests { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } diff --git a/crates/composefs/tests/mkfs.rs b/crates/composefs/tests/mkfs.rs index b2896c69..da38b57a 100644 --- a/crates/composefs/tests/mkfs.rs +++ b/crates/composefs/tests/mkfs.rs @@ -11,8 +11,12 @@ use similar_asserts::assert_eq; use tempfile::NamedTempFile; use 
composefs::{ - dumpfile::write_dumpfile, - erofs::{debug::debug_img, writer::mkfs_erofs}, + dumpfile::{dumpfile_to_filesystem, write_dumpfile}, + erofs::{ + debug::debug_img, + format::FormatVersion, + writer::{ValidatedFileSystem, mkfs_erofs, mkfs_erofs_versioned}, + }, fsverity::{FsVerityHashValue, Sha256HashValue}, tree::{FileSystem, Inode, LeafContent, RegularFile, Stat}, }; @@ -23,12 +27,13 @@ fn default_stat() -> Stat { st_uid: 0, st_gid: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), } } fn debug_fs(fs: FileSystem) -> String { - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); let mut output = vec![]; debug_img(&mut output, &image).unwrap(); String::from_utf8(output).unwrap() @@ -54,6 +59,7 @@ fn add_leaf( st_uid: 0, st_mode: 0, st_mtim_sec: 0, + st_mtim_nsec: 0, xattrs: BTreeMap::new(), }, content, @@ -94,22 +100,81 @@ fn test_simple() { insta::assert_snapshot!(debug_fs(fs)); } -fn foreach_case(f: fn(&FileSystem)) { +fn foreach_case(f: fn(FileSystem)) { for case in [empty, simple] { let mut fs = FileSystem::new(default_stat()); case(&mut fs); - f(&fs); + f(fs); } } #[test_with::executable(fsck.erofs)] fn test_fsck() { foreach_case(|fs| { + // V2 (default) let mut tmp = NamedTempFile::new().unwrap(); - tmp.write_all(&mkfs_erofs(fs)).unwrap(); + tmp.write_all(&mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap())) + .unwrap(); let mut fsck = Command::new("fsck.erofs").arg(tmp.path()).spawn().unwrap(); assert!(fsck.wait().unwrap().success()); }); + + // V1 — needs its own filesystem instances (whiteout stubs are added by the writer) + for case in [empty, simple] { + let mut fs = FileSystem::::new(default_stat()); + case(&mut fs); + let image = mkfs_erofs_versioned( + &mut ValidatedFileSystem::new(fs).unwrap(), + FormatVersion::V1, + ); + let mut tmp = NamedTempFile::new().unwrap(); + tmp.write_all(&image).unwrap(); + let mut fsck = Command::new("fsck.erofs").arg(tmp.path()).spawn().unwrap(); + 
assert!(fsck.wait().unwrap().success()); + } +} + +/// Verify byte-for-byte identity with C mkcomposefs for the pinned test cases. +/// +/// These fixed cases (`empty`, `simple`) complement the proptest binary-compat +/// tests in reader.rs which cover random trees. Keeping them pinned here means +/// a regression on these canonical shapes is immediately visible without proptest +/// shrinking, and is also validated by the digest stability tests above. +#[test_with::executable(mkcomposefs)] +fn test_vs_mkcomposefs() { + for case in [empty, simple] { + let mut fs_rust = FileSystem::new(default_stat()); + case(&mut fs_rust); + let mut fs_c = FileSystem::new(default_stat()); + case(&mut fs_c); + + let image = mkfs_erofs_versioned( + &mut ValidatedFileSystem::new(fs_rust).unwrap(), + FormatVersion::V0, + ); + + let mut mkcomposefs = Command::new("mkcomposefs") + .args(["--from-file", "-", "-"]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .unwrap(); + + let mut stdin = mkcomposefs.stdin.take().unwrap(); + write_dumpfile(&mut stdin, &fs_c).unwrap(); + drop(stdin); + + let output = mkcomposefs.wait_with_output().unwrap(); + assert!(output.status.success()); + let mkcomposefs_image = output.stdout.into_boxed_slice(); + + if image != mkcomposefs_image { + let dump = dump_image(&image); + let mkcomposefs_dump = dump_image(&mkcomposefs_image); + assert_eq!(mkcomposefs_dump, dump, "structural diff (rust vs C)"); + } + assert_eq!(image, mkcomposefs_image); + } } fn dump_image(img: &[u8]) -> String { @@ -139,7 +204,7 @@ fn test_erofs_digest_stability() { for (name, case, expected_digest) in cases { let mut fs = FileSystem::::new(default_stat()); case(&mut fs); - let image = mkfs_erofs(&fs); + let image = mkfs_erofs(&mut ValidatedFileSystem::new(fs).unwrap()); let digest = composefs::fsverity::compute_verity::(&image); let hex = digest.to_hex(); assert_eq!( @@ -149,32 +214,99 @@ fn test_erofs_digest_stability() { } } -#[should_panic] +#[test] +fn 
test_erofs_v1_digest_stability() { + // Same as test_erofs_digest_stability but for V0 (C-compatible) format. + // V0 uses the same layout as C mkcomposefs default: composefs_version is 0 + // normally and auto-bumped to 1 when user whiteouts are present. + // Output must be byte-stable since it needs to match C mkcomposefs. + let cases: &[(&str, fn(&mut FileSystem), &str)] = &[ + ( + "empty_v1", + empty, + "8f589e8f57ecb88823736b0d857ddca1e1068a23e264fad164b28f7038eb3682", + ), + ( + "simple_v1", + simple, + "9f3f5620ee0c54708516467d0d58741e7963047c7106b245d94c298259d0fa01", + ), + ]; + + for (name, case, expected_digest) in cases { + let mut fs = FileSystem::::new(default_stat()); + case(&mut fs); + let image = mkfs_erofs_versioned( + &mut ValidatedFileSystem::new(fs).unwrap(), + FormatVersion::V0, + ); + let digest = composefs::fsverity::compute_verity::(&image); + let hex = digest.to_hex(); + assert_eq!( + &hex, expected_digest, + "{name}: V0 EROFS digest changed — if this is intentional, update the pinned value" + ); + } +} + +/// Test that `--min-version=1` forces `composefs_version=1` in the EROFS header +/// even when no user-visible whiteout devices are present, matching C mkcomposefs +/// `--min-version=1 --max-version=1` behaviour. +/// +/// Uses a trimmed version of the C test suite's `special_v1.dump` fixture +/// (the `inline-large*` entries were removed since Rust intentionally rejects +/// inline content larger than `MAX_INLINE_CONTENT`). +/// +/// Golden digest verified against C mkcomposefs 1.0.8+. #[test_with::executable(mkcomposefs)] -fn test_vs_mkcomposefs() { - foreach_case(|fs| { - let image = mkfs_erofs(fs); +fn test_vs_mkcomposefs_min_version_1() { + let dump = include_str!("special_v1.dump"); - let mut mkcomposefs = Command::new("mkcomposefs") - .args(["--min-version=3", "--from-file", "-", "-"]) - .stdin(Stdio::piped()) - .stdout(Stdio::piped()) - .spawn() - .unwrap(); + // Parse the dumpfile and build Rust image with --min-version=1. 
+ let fs_rust = dumpfile_to_filesystem::(dump).unwrap(); + let rust_image = mkfs_erofs_versioned( + &mut ValidatedFileSystem::new(fs_rust).unwrap(), + FormatVersion::V1, + ); - let mut stdin = mkcomposefs.stdin.take().unwrap(); - write_dumpfile(&mut stdin, fs).unwrap(); - drop(stdin); + // Also generate via C mkcomposefs --min-version=1 --max-version=1. + let mut mkcomposefs = Command::new("mkcomposefs") + .args([ + "--min-version=1", + "--max-version=1", + "--from-file", + "-", + "-", + ]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .unwrap(); + mkcomposefs + .stdin + .take() + .unwrap() + .write_all(dump.as_bytes()) + .unwrap(); + let output = mkcomposefs.wait_with_output().unwrap(); + assert!(output.status.success()); + let c_image = output.stdout.into_boxed_slice(); - let output = mkcomposefs.wait_with_output().unwrap(); - assert!(output.status.success()); - let mkcomposefs_image = output.stdout.into_boxed_slice(); + if rust_image != c_image { + let rust_dump = dump_image(&rust_image); + let c_dump = dump_image(&c_image); + assert_eq!( + c_dump, rust_dump, + "structural diff (rust vs C --min-version=1)" + ); + } + assert_eq!(rust_image, c_image); - if image != mkcomposefs_image { - let dump = dump_image(&image); - let mkcomposefs_dump = dump_image(&mkcomposefs_image); - assert_eq!(mkcomposefs_dump, dump); - } - assert_eq!(image, mkcomposefs_image); // fallback if the dump is somehow the same - }); + // Pin the expected digest so any regression is immediately visible. 
+ let digest = composefs::fsverity::compute_verity::(&rust_image); + assert_eq!( + digest.to_hex(), + "b1c78c25db8638be5b9b483472d32ee9624d8d32ded626a91cd536116e8df97c", + "special_v1 --min-version=1 digest changed" + ); } diff --git a/crates/composefs/tests/special_v1.dump b/crates/composefs/tests/special_v1.dump new file mode 100644 index 00000000..214b606d --- /dev/null +++ b/crates/composefs/tests/special_v1.dump @@ -0,0 +1,7 @@ +/ 4096 40555 2 0 0 0 1633950376.0 - - - trusted.foo1=bar-1 user.foo2=bar-2 +/blockdev 0 60777 1 0 0 107690 1633950376.0 - - - trusted.bar=bar-2 +/chardev 0 20777 1 0 0 10769 1633950376.0 - - - trusted.foo=bar-2 +/escaped-xattr 0 100777 1 0 0 0 1633950376.0 - - - trusted.overlay.redirect=/foo\n user.overlay.redirect=/foo\n user.foo=bar-2 +/fifo 0 10777 1 0 0 0 1633950376.0 - - - trusted.bar=bar-2 +/inline 15 100777 1 0 0 0 1633950376.0 - FOOBAR\nINAFILE\n - user.foo=bar-2 +/whiteout 0 20777 1 0 0 0 1633950376.0 - - - trusted.foo=bar-2 diff --git a/doc/booting.md b/doc/booting.md new file mode 100644 index 00000000..52282eb8 --- /dev/null +++ b/doc/booting.md @@ -0,0 +1,77 @@ +# Booting from a composefs image + +This document describes how composefs-rs sets up the root filesystem during +early boot. It covers the kernel command-line interface, the expected on-disk +layout, kernel requirements, and the step-by-step mount sequence performed by +`composefs-setup-root`. + +The target audience is system integrators and OS developers who are packaging a +bootable system using composefs. Familiarity with Linux mount namespaces, +overlayfs, and fs-verity is assumed. + +## Kernel command-line + +A single kernel argument controls which image is booted: + +``` +composefs=<digest> +``` + +`<digest>` is the hex-encoded fs-verity digest of the EROFS metadata image to +mount as root. SHA-256 digests are 64 hex characters; SHA-512 digests are 128 +hex characters. 
`composefs-setup-root` tries SHA-512 first and falls back to +SHA-256 if the length does not match, so both algorithms are supported without +any additional configuration. + +**Insecure mode.** Prefixing the digest with `?` (e.g. `composefs=?<digest>`) +makes fs-verity verification optional. The system will boot even when the +underlying filesystem does not support fs-verity or the image has no verity +metadata attached. This mode exists for development and testing only; it must +not be used in production. + +Parsing is handled by `composefs_boot::cmdline::get_cmdline_composefs` +(`crates/composefs-boot/src/cmdline.rs`). The splitter follows the kernel's +own logic: tokens are separated by ASCII whitespace, and whitespace inside +double-quoted strings is treated as literal. There is no escape mechanism, so a +literal double-quote character cannot appear in a token value. + +## On-disk layout + +The composefs repository must be present at `/sysroot/composefs` with the +standard layout described in `doc/repository.md`. + +The `composefs=` digest must correspond to a symlink under `images/`. + +Persistent per-deployment state lives at `/sysroot/state/deploy/<digest>/`, +where `<digest>` matches the `composefs=` kernel argument exactly. The `etc/` +and `var/` subdirectories within that directory serve as the upper layers for +the corresponding overlayfs mounts. + +## Kernel requirements + +The following kernel features must be available: + +- **EROFS** filesystem driver (`CONFIG_EROFS_FS`) +- **overlayfs** with `metacopy=on` and `redirect_dir=on` + (`CONFIG_OVERLAY_FS`, `CONFIG_OVERLAY_FS_METACOPY`, `CONFIG_OVERLAY_FS_REDIRECT_DIR`) +- **fs-verity** unless insecure mode is used (`CONFIG_FS_VERITY`) +- The modern Linux mount API (`fsopen` / `fsconfig` / `fsmount` / `move_mount`), + available since kernel 5.2. Kernel ≥ 6.15 is required for the atomic root + replacement path (the default build). On kernels without `fsconfig_set_fd` + support (e.g. 
RHEL 9 / kernel < 5.15), a loopback device is created + automatically by `composefs::mountcompat`. + +## Kernel argument + +The `composefs=` kernel argument is the authoritative selector for which image is booted. +Without the `?` insecure prefix, every file access through the overlayfs is +verified against the object's stored digest by the kernel, combining fs-verity +on the data objects with overlayfs `verity=require`. + +## Other notes + +As a workaround for a GPT auto-root issue in systemd +([systemd#35017](https://github.com/systemd/systemd/issues/35017)), +`composefs-setup-root` attempts to create `/run/systemd/volatile-root` as a +symlink pointing to the real block device before performing any mounts. Failure +to do so is non-fatal and does not abort the boot sequence. diff --git a/doc/erofs.md b/doc/erofs.md new file mode 100644 index 00000000..5ccb8306 --- /dev/null +++ b/doc/erofs.md @@ -0,0 +1,82 @@ +# composefs EROFS image format + +composefs images are EROFS filesystem images with composefs-specific extensions. They encode +a directory tree where regular files are stored externally in a content-addressed object store +and referenced by their fs-verity digest. The EROFS image itself carries only metadata: inodes, +directory entries, extended attributes, and chunk index entries that point to the external files. + +composefs-rs supports two EROFS format versions. V1 is byte-for-byte compatible with the C +`mkcomposefs` tool. V2 is the composefs-rs native default and drops several V1 constraints +that exist only for C compatibility. New repositories use V2 unless `--erofs-version v1` is +passed to `cfsctl init`. + +However, V2 is not mountable by RHEL9 era EROFS, and a goal is to transition to V1 by default +for maximum compatibility. + +## Format V1 + +V1 is selected with `cfsctl init --erofs-version v1`. The `v1_erofs` ro-compat feature flag +is written to `meta.json` so that tools without V1 support open the repository read-only.
+ +**`composefs_version` field values in V1:** + +- `0` — no user-visible whiteout files (character devices with rdev=0) in the tree +- `1` — at least one user-visible whiteout file is present + +The constant `COMPOSEFS_VERSION_V1` is 0; the field only reaches 1 when user whiteouts are +found. The `--min-version` flag in `mkcomposefs` (mirrored by `mkfs_erofs_v1_min_version`) +forces the value to 1 even when no user whiteouts exist, for forward compatibility. + +**Inode layout:** V1 uses compact inodes (32 bytes) when the file data and inode fit within +the constraints of the compact format, and extended inodes (64 bytes) otherwise. + +**Inode traversal order:** V1 collects inodes in breadth-first order — all entries at one +directory level before descending. + +**Whiteout stub table:** V1 includes 256 synthetic inode entries at the start of the inode +area, one per two-hex-character prefix `00`–`ff`. Each entry is a character-device stub +(chr 0,0) used by the overlay filesystem to resolve whiteout paths against the object store. +V2 omits them entirely. + +**Whiteout escaping:** User-visible whiteout files (chr 0,0) in the tree are not stored as +character devices on disk. Instead they receive a `trusted.overlay.opaque=x` xattr and are +serialized differently. The stub entries in the whiteout table are not escaped. + +**`build_time`:** The superblock `build_time` field is set to the minimum mtime across all inodes. + +**xattr sharing:** Xattr entries are deduplicated using a sort key that is the full xattr name (prefix string concatenated with the suffix). + +## Format V2 — Created in composefs-rs + +V2 is the default for all repositories created without `--erofs-version v1`. + +**`composefs_version` field:** Always `2` (the constant `COMPOSEFS_VERSION`). + +**Inode layout:** V2 always uses extended inodes (64 bytes). + +**Inode traversal order:** V2 collects inodes in depth-first order — all descendants of a directory before moving to the next sibling. 
+ +**No whiteout stub table:** V2 has no synthetic stub entries; whiteout files are stored directly without escaping. + +**`build_time`:** Always 0. + +**xattr sharing:** Xattr entries are deduplicated using a sort key of (prefix, suffix, value) +rather than the full name string, which can produce a smaller shared xattr area. + +## Selecting the format + +The format is fixed at repository initialization time and cannot be changed afterward. + +``` +cfsctl init # V2 (default) +cfsctl init --erofs-version v1 # V1 (C-tool compatible) +``` + +The format is recorded in `meta.json` as the `v1_erofs` ro-compat feature flag: present +means V1, absent means V2. Tools that do not recognize this flag open the repository +read-only rather than writing images in the wrong format. + +For the standalone `mkcomposefs` tool, the equivalent flag is `--erofs-version`. The +`--min-version` flag (`mkfs_erofs_v1_min_version` in the Rust API) controls whether the +`composefs_version` field starts at 0 or 1 in V1 images regardless of whether user whiteouts +are present. diff --git a/doc/repository.md b/doc/repository.md index e3188305..7e400040 100644 --- a/doc/repository.md +++ b/doc/repository.md @@ -65,11 +65,42 @@ created by `cfsctl init` and contains: - `read-only-compatible` — old tools may read but must not write. - `incompatible` — old tools must refuse the repository entirely. + The currently defined feature flags are: + - `v1_erofs` (read-only-compatible) — present on repositories whose + EROFS image format is V1 (C-tool compatible: compact inodes, BFS + ordering, whiteout table). This is the single flag that encodes the + EROFS format version: present → V1, absent → V2 (the default). Old + tools that do not recognise this flag open the repository read-only + rather than accidentally writing images in the wrong format. + When `meta.json` is present, `cfsctl` auto-detects the hash algorithm and errors if `--hash` is explicitly passed with a conflicting value. 
When the file is absent (for repositories created before this feature), `--hash` is honored as before and defaults to `sha512`. +### `cfsctl init --erofs-version` + +The `--erofs-version` flag selects the EROFS format for newly committed +images. It controls the `v1_erofs` feature flag in `meta.json`: + +``` +cfsctl init # default: V2 EROFS (composefs-rs native) +cfsctl init --erofs-version v1 # V1 EROFS (C-tool compatible) +``` + +**V2** (default) uses extended inodes, DFS ordering, and `composefs_version=2` +in the EROFS superblock. This is the composefs-rs native format and is what +all repositories created before V1 support was added use. + +**V1** uses compact inodes where possible, BFS ordering, and a whiteout stub +table, producing output byte-for-byte identical to the C `mkcomposefs` tool. +The `v1_erofs` ro-compat flag is written to `meta.json` so that tools which +predate V1 support open the repository read-only rather than writing images +in the wrong format. + +Re-initializing an existing repository with a different `--erofs-version` is +rejected with an error; the format version is fixed at init time. + ## `objects/` This is where the content-addressed data is stored. The immediate children of