diff --git a/Cargo.toml b/Cargo.toml index 7086d4b3..fda6eef0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ default-members = [ "crates/composefs-setup-root", "crates/composefs-storage", "crates/composefs-erofs-debug", - "crates/composefs-splitfdstream", + "crates/composefs-splitdirfdstream", ] resolver = "2" @@ -43,8 +43,6 @@ composefs-ostree = { version = "0.7.0", path = "crates/composefs-ostree", defaul cap-std-ext = "5.1.2" ocidir = "0.7.2" -# JSON-RPC with FD passing for userns helper -jsonrpc-fdpass = { version = "0.1.0", default-features = false } zlink = { version = "0.5", default-features = false, features = ["tokio", "introspection", "proxy", "server", "service", "tracing"] } zlink-core = { version = "0.5", default-features = false, features = ["introspection", "std", "tracing"] } diff --git a/crates/composefs-ctl/Cargo.toml b/crates/composefs-ctl/Cargo.toml index b7155319..27ea8d4e 100644 --- a/crates/composefs-ctl/Cargo.toml +++ b/crates/composefs-ctl/Cargo.toml @@ -19,7 +19,7 @@ path = "src/main.rs" [features] default = ['pre-6.15', 'oci', 'containers-storage', 'ostree'] http = ['composefs-http'] -oci = ['composefs-oci', 'composefs-oci/varlink'] +oci = ['composefs-oci', 'composefs-oci/varlink', 'composefs-splitdirfdstream'] containers-storage = ['composefs-oci/containers-storage', 'cstorage'] ostree = ['composefs-ostree'] rhel9 = ['composefs/rhel9'] @@ -33,19 +33,26 @@ comfy-table = { version = "7.1", default-features = false } composefs = { workspace = true, features = ["varlink"] } composefs-boot = { workspace = true } composefs-oci = { workspace = true, optional = true, features = ["boot"] } +composefs-splitdirfdstream = { path = "../composefs-splitdirfdstream", version = "0.7.0", optional = true } composefs-http = { workspace = true, optional = true } -cstorage = { package = "composefs-storage", path = "../composefs-storage", version = "0.7.0", features = ["userns-helper"], optional = true } +cstorage = { package = "composefs-storage", 
path = "../composefs-storage", version = "0.7.0", features = ["layer-transfer"], optional = true } composefs-ostree = { workspace = true, optional = true } env_logger = { version = "0.11.0", default-features = false } hex = { version = "0.4.0", default-features = false } indicatif = { version = "0.17.0", default-features = false } libsystemd = { version = "0.7" } log = { version = "0.4", default-features = false } -rustix = { version = "1.0.0", default-features = false, features = ["fs", "process"] } +rustix = { version = "1.0.0", default-features = false, features = ["fs", "net", "pipe", "process"] } serde = { version = "1.0", default-features = false, features = ["derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } -tokio = { version = "1.24.2", default-features = false, features = ["io-std", "io-util", "net", "rt", "sync"] } +tokio = { version = "1.24.2", default-features = false, features = ["io-std", "io-util", "net", "rt", "rt-multi-thread", "sync"] } zlink = { workspace = true } +[dev-dependencies] +similar-asserts = "1.7.0" +tar = { version = "0.4.38", default-features = false } +tempfile = "3.8.0" +tokio = { version = "1.24.2", default-features = false, features = ["macros", "rt-multi-thread"] } + [lints] workspace = true diff --git a/crates/composefs-ctl/src/lib.rs b/crates/composefs-ctl/src/lib.rs index 9a8c592e..d8af0db7 100644 --- a/crates/composefs-ctl/src/lib.rs +++ b/crates/composefs-ctl/src/lib.rs @@ -264,7 +264,7 @@ impl From for composefs::erofs::format::FormatVersion { /// start with `@`. #[cfg(feature = "oci")] #[derive(Debug, Clone)] -pub(crate) enum OciReference { +pub enum OciReference { /// A content-addressable digest such as `sha256:abcdef…`. 
Digest(composefs_oci::OciDigest), /// A named ref resolved through the repository's ref tree, typically @@ -377,6 +377,31 @@ enum OciCommand { #[arg(long, value_enum, default_value_t = LocalFetchCli::Disabled)] local_fetch: LocalFetchCli, }, + /// Copy an OCI image (and its layers) from another composefs repository + /// into this repository. + /// + /// The destination repository is selected by the global `--repo`/`--user`/ + /// `--system` flags. The source is `--from`. + /// + /// Pass `--zerocopy` to attempt reflink (then hardlink) instead of copying + /// object data. This requires both repositories to be on the same + /// filesystem, to use the same hash algorithm, and the caller to have + /// `CAP_DAC_READ_SEARCH` (i.e. root). + /// Without `--zerocopy`, objects are always copied, which is safe on any + /// filesystem and across repositories using different hash algorithms. + Copy { + /// Image to copy (tag name or `@digest`). + image: OciReference, + /// Path to the source composefs repository. + #[clap(long)] + from: PathBuf, + /// Tag to assign to the image in the destination repository. + #[clap(long)] + name: Option, + /// Use reflink/hardlink zero-copy transfer (requires same filesystem, same hash algorithm, and root). + #[clap(long)] + zerocopy: bool, + }, /// List all tagged OCI images in the repository #[clap(name = "images")] ListImages { @@ -940,12 +965,18 @@ pub async fn run_if_socket_activated() -> Result { if std::env::args_os().len() != 1 { return Ok(false); } - let Some(listener) = crate::varlink::try_activated_listener()? else { - return Ok(false); - }; let service = crate::varlink::CfsctlService::activated(); - crate::varlink::serve_activated(service, listener).await?; - Ok(true) + match crate::varlink::try_activated_listener()? 
{ + Some(crate::varlink::ActivatedSocket::Connected(l)) => { + crate::varlink::serve_activated(service, l).await?; + Ok(true) + } + Some(crate::varlink::ActivatedSocket::Listening(listener)) => { + crate::varlink::serve_on_listener(service, listener).await?; + Ok(true) + } + None => Ok(false), + } } /// Top-level dispatch: handle init specially, otherwise open repo and run. @@ -1147,6 +1178,130 @@ where Ok(repo) } +/// Copy an OCI image (and all its layers) from one repository to another using varlink connections. +/// +/// The image is inspected on `conn_src`; every layer that `HasLayer` reports +/// as missing on `conn_dest` is fetched with `GetLayer` and forwarded with +/// `PutLayer`, and `FinalizeImage` is then called on `conn_dest` with the +/// collected layer verities and the optional tag `name`. +/// +/// # Errors +/// +/// Fails on any transport or service error, on an empty manifest or config, +/// and when a `GetLayer` reply delivers fewer fds than one pipe plus its +/// `dir_count` directory fds. +#[cfg(feature = "oci")] +pub async fn copy_image( + conn_src: &mut zlink::unix::Connection, + conn_dest: &mut zlink::unix::Connection, + handle_src: u64, + handle_dest: u64, + image: &OciReference, + name: Option<&str>, + zerocopy: bool, +) -> Result { + use crate::varlink::layer_sync::LayerRef; + use crate::varlink::oci::OciError; + use crate::varlink::proxy::{GetLayerParams, OciProxy}; + use anyhow::ensure; + use zlink::futures_util::StreamExt as _; + + let image_str = image.to_string(); + let inspect = conn_src + .inspect(handle_src, &image_str) + .await + .context("zlink transport error calling Inspect")? + .map_err(|e: OciError| anyhow::anyhow!("Inspect failed: {e:?}"))?; + + ensure!( + !inspect.manifest.is_empty(), + "inspect returned empty manifest" + ); + ensure!(!inspect.config.is_empty(), "inspect returned empty config"); + + // Extract ordered layer identifiers via the shared helper that handles + // both container images (rootfs.diff_ids) and OCI artifacts (manifest + // layer digests). + let diff_ids_ordered = composefs_oci::extract_layer_ids(&inspect.manifest, &inspect.config) + .context("extracting layer identifiers")?; + + let mut layer_refs: Vec = Vec::with_capacity(diff_ids_ordered.len()); + + for diff_id in &diff_ids_ordered { + let has = conn_dest + .has_layer(handle_dest, diff_id) + .await + .context("zlink transport error calling HasLayer")? 
+ .map_err(|e: OciError| anyhow::anyhow!("HasLayer failed: {e:?}"))?; + + let layer_verity = if has.present { + has.layer_verity + .context("HasLayer returned present=true but no layer_verity")? + } else { + let get_params = GetLayerParams { + diff_id: Some(diff_id.to_string()), + storage: None, + }; + let mut get_stream = std::pin::pin!( + conn_src + .get_layer(handle_src, get_params) + .await + .context("zlink transport error calling GetLayer")? + ); + let mut all_fds: Vec = Vec::new(); + let mut get_reply = None; + while let Some(item) = get_stream.next().await { + let (result, fds) = item.context("GetLayer stream frame error")?; + let reply = + result.map_err(|e: OciError| anyhow::anyhow!("GetLayer failed: {e:?}"))?; + get_reply = Some(reply); + all_fds.extend(fds); + } + let get_reply = get_reply.context("GetLayer returned empty stream")?; + let dir_count = get_reply.dir_count as usize; + + // `Vec::split_off` panics when its index exceeds the length, so reject + // a reply that delivered fewer fds than it advertises. + ensure!( + all_fds.len() > dir_count, + "GetLayer returned {} fds, fewer than the pipe fd plus {dir_count} dir fds it advertised", + all_fds.len() + ); + let pipe_and_dirfds_len = 1 + dir_count; + let lifetime_fds = all_fds.split_off(pipe_and_dirfds_len); + + let put_reply = conn_dest + .put_layer(handle_dest, diff_id, zerocopy, all_fds) + .await + .context("zlink transport error calling PutLayer")? + .map_err(|e: OciError| anyhow::anyhow!("PutLayer failed: {e:?}"))?; + // Hold the lifetime fds until PutLayer has returned, then release them. + drop(lifetime_fds); + + put_reply.layer_verity + }; + + layer_refs.push(LayerRef { + diff_id: diff_id.clone(), + layer_verity, + }); + } + + let finalize = conn_dest + .finalize_image( + handle_dest, + &inspect.manifest, + &inspect.config, + layer_refs, + name, + ) + .await + .context("zlink transport error calling FinalizeImage")? + .map_err(|e: OciError| anyhow::anyhow!("FinalizeImage failed: {e:?}"))?; + + Ok(finalize) +} + /// Resolve an [`OciReference`] to an [`OciImage`]. #[cfg(feature = "oci")] pub(crate) fn resolve_oci_image( @@ -1350,6 +1505,8 @@ where ObjectID: FsVerityHashValue, { let repo = Arc::new(repo); + #[cfg(feature = "oci")] + let dest_path = resolve_repo_path(&args)?; match args.cmd { Command::Init { .. 
} => { // Handled in run_app before we get here @@ -1453,6 +1591,81 @@ where println!("Boot image: {}", image_verity.to_hex()); } } + OciCommand::Copy { + ref image, + ref from, + ref name, + zerocopy, + } => { + use crate::varlink::proxy::RepositoryProxy; + + let src_hash = resolve_hash_type(from, args.hash, !args.no_upgrade) + .with_context(|| format!("opening source repository {}", from.display()))?; + let dest_hash = resolve_hash_type(&dest_path, args.hash, !args.no_upgrade) + .with_context(|| { + format!("opening destination repository {}", dest_path.display()) + })?; + + if zerocopy && src_hash != dest_hash { + anyhow::bail!( + "--zerocopy requires matching hash algorithms; \ + source uses {src_hash:?} but destination uses {dest_hash:?}" + ); + } + + let from_str = from.to_str().context("source path is not valid UTF-8")?; + let dest_str = dest_path + .to_str() + .context("destination path is not valid UTF-8")?; + + let service_src = crate::varlink::CfsctlService::new(); + let service_dest = crate::varlink::CfsctlService::new(); + + let (mut conn_src, _srv_src) = crate::varlink::spawn_in_process(service_src) + .context("spawning source in-process service")?; + let (mut conn_dest, _srv_dest) = crate::varlink::spawn_in_process(service_dest) + .context("spawning destination in-process service")?; + + let handle_src = conn_src + .open_repository(Some(from_str), None, None) + .await + .context("zlink transport error calling OpenRepository on source")? + .map_err(|e| anyhow::anyhow!("OpenRepository failed on source: {e:?}"))? + .handle; + + let handle_dest = conn_dest + .open_repository(Some(dest_str), None, None) + .await + .context("zlink transport error calling OpenRepository on destination")? + .map_err(|e| anyhow::anyhow!("OpenRepository failed on destination: {e:?}"))? 
+ .handle; + + let finalize_reply = copy_image( + &mut conn_src, + &mut conn_dest, + handle_src, + handle_dest, + image, + name.as_deref(), + zerocopy, + ) + .await?; + + let tag_info = if let Some(n) = name { + format!(", tagged as {n}") + } else { + String::new() + }; + println!( + "Copied image {image} from {} to destination repo{}", + from.display(), + tag_info + ); + println!("Manifest digest: {}", finalize_reply.manifest_digest); + println!("Manifest verity: {}", finalize_reply.manifest_verity); + println!("Config digest: {}", finalize_reply.config_digest); + println!("Config verity: {}", finalize_reply.config_verity); + } OciCommand::ListImages { json } => { let images = composefs_oci::oci_image::list_images(&repo)?; diff --git a/crates/composefs-ctl/src/varlink.rs b/crates/composefs-ctl/src/varlink.rs index fedff358..ae610127 100644 --- a/crates/composefs-ctl/src/varlink.rs +++ b/crates/composefs-ctl/src/varlink.rs @@ -136,11 +136,44 @@ pub enum RepositoryError { }, } -/// Reply carrying an opaque repository handle. +/// Reply carrying an opaque repository handle and basic repository metadata. +/// +/// The `hash_algorithm` and `objects_device_id` fields let a client making a +/// cross-repository copy decide whether zero-copy (reflink / hardlink) +/// transfer is viable for a given source–destination pair: +/// +/// * **`hash_algorithm`** — `"sha256"` or `"sha512"`. Hardlink (zero-copy) +/// requires both repositories to use the same algorithm, because fs-verity +/// is enabled on the *shared* inode. Reflink and regular copy work across +/// algorithms (each produces a fresh inode re-digested under the +/// destination's algorithm). +/// +/// * **`objects_device_id`** — the `st_dev` of the repository's objects +/// directory. Both reflink (`FICLONE`) and hardlink (`linkat`) require +/// source and destination to reside on the same filesystem; comparing +/// `objects_device_id` from both sides lets the client detect this up front. 
+/// Note: `st_dev` is only meaningful when both servers share a mount +/// namespace (the typical same-host deployment). If they do not, the +/// worst case is a failed `PutLayer` (EXDEV), not silent data corruption. #[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] pub struct OpenRepositoryReply { /// The opaque handle to pass to subsequent repository methods. pub handle: u64, + + /// The fs-verity hash algorithm used by this repository (`"sha256"` or + /// `"sha512"`). + /// + /// `None` on old servers that do not report this field (serde default). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub hash_algorithm: Option, + + /// The `st_dev` of the repository's objects directory, as a decimal u64. + /// + /// Clients comparing two repositories should treat matching values as + /// "likely same filesystem" (and thus eligible for reflink/hardlink). + /// `None` on old servers. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub objects_device_id: Option, } /// Reply from initializing a repository. @@ -163,6 +196,27 @@ pub(crate) enum OpenRepo { Sha512(Arc>), } +impl OpenRepo { + /// The fs-verity hash algorithm name for this repository. + fn hash_algorithm(&self) -> &'static str { + match self { + OpenRepo::Sha256(_) => "sha256", + OpenRepo::Sha512(_) => "sha512", + } + } + + /// The `st_dev` of the repository's objects directory, if available. + fn objects_device_id(&self) -> Option { + let stat_it = |fd: &std::os::fd::OwnedFd| -> Option { + rustix::fs::fstat(fd).ok().map(|s| s.st_dev) + }; + match self { + OpenRepo::Sha256(r) => r.objects_dir().ok().and_then(stat_it), + OpenRepo::Sha512(r) => r.objects_dir().ok().and_then(stat_it), + } + } +} + /// A single entry in the service's open-repository table. 
#[derive(Debug)] struct HandleEntry { @@ -225,6 +279,12 @@ pub(crate) struct CfsctlService { open_opts: OpenOptions, } +impl Default for CfsctlService { + fn default() -> Self { + Self::new() + } +} + impl CfsctlService { /// Construct an empty service with the given repository open options. /// @@ -256,6 +316,24 @@ impl CfsctlService { Self::with_open_opts(OpenOptions::default()) } + /// Construct a service with default open options. + pub(crate) fn new() -> Self { + Self::with_open_opts(OpenOptions::default()) + } + + /// Construct an insecure service for in-process tests. + /// + /// The `insecure` flag disables fs-verity requirements so that tests can + /// use repositories created on tmpfs or without verity support. + #[cfg(test)] + pub(crate) fn insecure_for_test() -> Self { + Self::with_open_opts(OpenOptions { + insecure: true, + require_verity: false, + no_upgrade: false, + }) + } + /// Allocate a fresh, never-reused handle. Starts at `1` (`0` is "none"). fn next_handle(&mut self) -> u64 { self.next_handle += 1; @@ -285,7 +363,8 @@ impl CfsctlService { .ok_or(oci::OciError::InvalidHandle { handle }) } - /// Resolve, open and register a repository at `path`, returning its handle. + /// Resolve, open and register a repository at `path`, returning the reply + /// with the handle and repository metadata. 
/// /// The digest algorithm is detected from the repository metadata; both /// resolution and open failures are reported as @@ -294,7 +373,7 @@ impl CfsctlService { &mut self, path: &Path, owner: Option, - ) -> std::result::Result { + ) -> std::result::Result { let hash_type = resolve_hash_type(path, None, !self.open_opts.no_upgrade).map_err(|e| { RepositoryError::RepoNotFound { message: format!("{e:#}"), @@ -325,8 +404,14 @@ impl CfsctlService { )), }; let handle = self.next_handle(); + let hash_algorithm = Some(repo.hash_algorithm().to_string()); + let objects_device_id = repo.objects_device_id(); self.repos.insert(handle, HandleEntry { repo, owner }); - Ok(handle) + Ok(OpenRepositoryReply { + handle, + hash_algorithm, + objects_device_id, + }) } /// Resolve a repository selector (`path`/`user`/`system`) to a path. @@ -794,8 +879,7 @@ mod service_impl { #[zlink(connection)] conn: &mut zlink::Connection, ) -> std::result::Result { let selected = Self::resolve_selector(path, user, system)?; - let handle = self.do_open(&selected, Some(conn.id()))?; - Ok(OpenRepositoryReply { handle }) + self.do_open(&selected, Some(conn.id())) } /// Close a previously opened repository handle. 
@@ -893,6 +977,9 @@ mod service_impl { mod service_impl { #![allow(missing_docs)] + use super::layer_sync::{ + FinalizeImageReply, GetInfoReply, GetLayerReply, HasLayerReply, LayerRef, PutLayerReply, + }; use super::oci::{ ListImagesReply, OciComputeIdReply, OciError, OciFsckReply, OciInspectReply, PullProgress, parse_local_fetch, pull_stream, @@ -903,7 +990,10 @@ mod service_impl { run_gc, run_image_objects, run_init_repository, run_inspect, run_list_images, run_mount, run_oci_fsck, run_oci_mount, run_tag, run_untag, }; - use composefs::fsverity::{Algorithm, Sha256HashValue, Sha512HashValue}; + use composefs::fsverity::{Algorithm, FsVerityHashValue, Sha256HashValue, Sha512HashValue}; + use composefs_oci::layer_transport::{RepoLayerSource, serve_get_layer}; + use composefs_oci::varlink_types::GetLayerParams; + use composefs_splitdirfdstream::seed_from_id; #[zlink::service( interface = "org.composefs.Repository", @@ -952,8 +1042,7 @@ mod service_impl { #[zlink(connection)] conn: &mut zlink::Connection, ) -> std::result::Result { let selected = Self::resolve_selector(path, user, system)?; - let handle = self.do_open(&selected, Some(conn.id()))?; - Ok(OpenRepositoryReply { handle }) + self.do_open(&selected, Some(conn.id())) } /// Close a previously opened repository handle. @@ -1158,7 +1247,7 @@ mod service_impl { bootable: bool, ) -> impl zlink::futures_util::Stream< Item = std::result::Result, OciError>, - > + Send { + > { let lf = parse_local_fetch(&local_fetch); let sr = storage_root.map(std::path::PathBuf::from); // Resolve the handle synchronously and clone an owned Arc out so the @@ -1213,6 +1302,423 @@ mod service_impl { Err(e) => (Err(e), vec![]), } } + + // --- org.composefs.Oci (layer-sync methods) --- + // + // These methods were previously under org.composefs.LayerSync but have + // been folded into the Oci interface. Each carries an explicit `interface` + // annotation so the wire names land under the correct interface namespace. 
+ + /// Return the capability tokens supported by this service. + /// + /// Currently advertises `"splitdirfdstream-v0"`. + #[zlink(interface = "org.composefs.Oci")] + async fn get_info(&self) -> std::result::Result { + Ok(GetInfoReply { + features: vec!["splitdirfdstream-v0".into()], + }) + } + + /// Check whether the layer splitstream for `diff_id` is present. + /// + /// Returns `present = true` and the hex verity if found; `present = + /// false` and `layer_verity = None` if not. + #[zlink(interface = "org.composefs.Oci")] + async fn has_layer( + &self, + handle: u64, + diff_id: String, + ) -> std::result::Result { + let diff_id_parsed: composefs_oci::OciDigest = + diff_id.parse().map_err(|e| OciError::InvalidDigest { + message: format!("{e}"), + })?; + let content_id = composefs_oci::layer_content_id(&diff_id_parsed); + + fn check( + repo: &composefs::repository::Repository, + content_id: &str, + ) -> std::result::Result { + match repo + .has_stream(content_id) + .map_err(|e| OciError::InternalError { + message: format!("{e:#}"), + })? { + Some(verity) => Ok(HasLayerReply { + present: true, + layer_verity: Some(verity.to_hex()), + }), + None => Ok(HasLayerReply { + present: false, + layer_verity: None, + }), + } + } + + match self.lookup_oci(handle)? { + OpenRepo::Sha256(ref r) => check::(r, &content_id), + OpenRepo::Sha512(ref r) => check::(r, &content_id), + } + } + + /// Stream the layer as a `splitdirfdstream` over a pipe, with the full + /// hardened streaming fd-transport contract. + /// + /// This is a **streaming** method (`more`): it yields multiple frames, + /// each carrying a batch of FDs. The client must concatenate FD batches + /// from all frames to reconstruct the logical array: + /// + /// ```text + /// [ pipe_read | | ] + /// ``` + /// + /// The dirfds region uses sparse placement (hash-determined slot assignment); + /// lifetime fds are opaque tokens the client must hold until done reading. 
+ /// + /// **Non-streaming** (`more=false`): all fds in a single frame; returns + /// `FdLimitExceeded` if the total exceeds `MAX_FDS_PER_FRAME` (retry with + /// `more=true`). + /// + /// The producer runs on `spawn_blocking` so the async task is never blocked. + /// For the repo case there is no external lock to release, so `keepalive_read` + /// is moved into the producer closure and dropped when the producer finishes. + #[zlink(interface = "org.composefs.Oci", more, return_fds)] + async fn get_layer( + &self, + more: bool, + handle: u64, + params: GetLayerParams, + #[zlink(fds)] _fds: Vec, + ) -> impl zlink::futures_util::Stream< + Item = ( + std::result::Result, OciError>, + Vec, + ), + > + Unpin { + use zlink::futures_util::stream::{self, StreamExt as _}; + + type StreamItem = ( + std::result::Result, OciError>, + Vec, + ); + + macro_rules! err_stream { + ($e:expr) => { + return stream::iter(std::iter::once::((Err($e), vec![]))) + .left_stream() + }; + } + + // ── Extract diff_id from params (repo service requires it) ───────── + let diff_id = match params.diff_id { + Some(d) => d, + None => err_stream!(OciError::InvalidRequest { + message: "GetLayer: diff_id is required for the repo service".into(), + }), + }; + + // ── Parse diff_id ───────────────────────────────────────────────── + let diff_id_parsed: composefs_oci::OciDigest = match diff_id.parse() { + Ok(d) => d, + Err(e) => err_stream!(OciError::InvalidDigest { + message: format!("{e}"), + }), + }; + let content_id = composefs_oci::layer_content_id(&diff_id_parsed); + + // ── Drive serve_get_layer via the LayerSource trait ─────────────── + fn do_serve_get_layer( + repo: &std::sync::Arc>, + content_id: &str, + diff_id_str: &str, + more: bool, + ) -> std::result::Result + { + let verity = repo + .has_stream(content_id) + .map_err(|e| OciError::InternalError { + message: format!("{e:#}"), + })? 
+ .ok_or_else(|| OciError::NoSuchLayer { + diff_id: diff_id_str.to_string(), + })?; + + let seed = seed_from_id(content_id); + let source = RepoLayerSource { + repo: repo.clone(), + layer_verity: verity, + }; + + serve_get_layer(source, seed, more).map_err(|e| match e { + composefs_oci::layer_transport::ServeGetLayerError::FdLimitExceeded(e) => { + OciError::FdLimitExceeded { + fd_count: e.fd_count as u64, + max_per_frame: e.max_per_frame as u64, + } + } + composefs_oci::layer_transport::ServeGetLayerError::Other(e) => { + OciError::InternalError { + message: format!("{e:#}"), + } + } + }) + } + + let frames = match self.lookup_oci(handle) { + Ok(OpenRepo::Sha256(ref r)) => { + do_serve_get_layer::(r, &content_id, &diff_id, more) + } + Ok(OpenRepo::Sha512(ref r)) => { + do_serve_get_layer::(r, &content_id, &diff_id, more) + } + Err(e) => Err(e), + }; + + let frames = match frames { + Ok(f) => f, + Err(e) => err_stream!(e), + }; + + let dir_count = frames.dir_count; + let batches = frames.batches; + let n_frames = batches.len(); + let reply = GetLayerReply { dir_count }; + + stream::iter(batches.into_iter().enumerate().map(move |(i, batch)| { + let is_last = i == n_frames - 1; + ( + Ok(zlink::Reply::new(Some(reply.clone())).set_continues(Some(!is_last))), + batch, + ) + })) + .right_stream() + } + + /// Receive a layer as a `splitdirfdstream` from the client and import + /// it into the server's repository, verifying content integrity. + /// + /// The client supplies: + /// * `fds[0]` — read end of a pipe carrying the `splitdirfdstream` bytes. + /// * `fds[1..]` — source object directories (the splitdirfdstream's + /// `dirfd_index` selects among them; objects dir is index 0). + /// + /// The server runs the verified drain on a `spawn_blocking` thread so the + /// async task is not blocked while data flows through the pipe. 
The layer + /// content is only committed if its reconstructed sha256 matches `diff_id`; + /// on mismatch [`OciError::DiffIdMismatch`] is returned and no stream + /// is committed. + /// + /// The server always drains the pipe to avoid wedging the client's writer + /// even if the layer is already present — the import is idempotent. + #[zlink(interface = "org.composefs.Oci")] + async fn put_layer( + &self, + handle: u64, + diff_id: String, + zerocopy: bool, + #[zlink(fds)] fds: Vec, + ) -> std::result::Result { + // Validate the fd count: fds[0] = pipe read, fds[1..] = dir fds. + if fds.len() < 2 { + return Err(OciError::InvalidRequest { + message: format!( + "expected at least 2 fds (1 pipe + >=1 dir fd), got {}", + fds.len() + ), + }); + } + + let diff_id_parsed: composefs_oci::OciDigest = + diff_id.parse().map_err(|e| OciError::InvalidDigest { + message: format!("{e}"), + })?; + + let content_id = composefs_oci::layer_content_id(&diff_id_parsed); + + // Check whether the layer is already present (for the reply flag). + // We still proceed with the drain regardless to avoid wedging the + // client's writer if it is already producing. + let already_present = match self.lookup_oci(handle)? { + OpenRepo::Sha256(ref r) => r + .has_stream(&content_id) + .map_err(|e| OciError::InternalError { + message: format!("{e:#}"), + })? + .is_some(), + OpenRepo::Sha512(ref r) => r + .has_stream(&content_id) + .map_err(|e| OciError::InternalError { + message: format!("{e:#}"), + })? + .is_some(), + }; + + // Split fds: pipe_read + dir_fds. 
+ let mut fds = fds; + let pipe_read = fds.remove(0); + let dir_fds = fds; // remaining fds are the dir fds + + async fn run_put_layer( + repo: std::sync::Arc>, + pipe_read: std::os::fd::OwnedFd, + dir_fds: Vec, + diff_id: composefs_oci::OciDigest, + zerocopy: bool, + already_present: bool, + ) -> std::result::Result { + tokio::task::spawn_blocking(move || { + composefs_oci::layer_sync::drain_splitdirfdstream_verified( + repo, + pipe_read, + dir_fds, + &diff_id, + zerocopy, + composefs::repository::ImportContext::default(), + ) + }) + .await + .map_err(|e| OciError::InternalError { + message: format!("spawn_blocking panic: {e}"), + })? + .map(|(verity, stats, _ctx)| PutLayerReply { + layer_verity: verity.to_hex(), + already_present, + objects_reflinked: stats.objects_reflinked, + objects_hardlinked: stats.objects_hardlinked, + objects_copied: stats.objects_copied, + objects_already_present: stats.objects_already_present, + }) + .map_err(|e| match e { + composefs_oci::layer_sync::VerifiedDrainError::DiffIdMismatch { + expected, + actual, + } => OciError::DiffIdMismatch { expected, actual }, + composefs_oci::layer_sync::VerifiedDrainError::Other(err) => { + OciError::InternalError { + message: format!("{err:#}"), + } + } + }) + } + + match self.lookup_oci(handle)? { + OpenRepo::Sha256(ref r) => { + run_put_layer::( + r.clone(), + pipe_read, + dir_fds, + diff_id_parsed, + zerocopy, + already_present, + ) + .await + } + OpenRepo::Sha512(ref r) => { + run_put_layer::( + r.clone(), + pipe_read, + dir_fds, + diff_id_parsed, + zerocopy, + already_present, + ) + .await + } + } + } + + /// Finalize an OCI image after all layers have been imported. + /// + /// Given the raw manifest and config JSON bytes and the ordered list of + /// `(diff_id, layer_verity)` pairs (as returned by `PutLayer`), this + /// method writes the config and manifest splitstreams, generates the + /// composefs EROFS image, and optionally tags the manifest. Idempotent. 
+ /// + /// Returns the digest and verity strings for both the manifest and config + /// splitstreams. + #[zlink(interface = "org.composefs.Oci")] + async fn finalize_image( + &self, + handle: u64, + manifest_json: String, + config_json: String, + layers: Vec, + name: Option, + ) -> std::result::Result { + async fn run_finalize( + repo: std::sync::Arc>, + manifest_json: String, + config_json: String, + layers: Vec, + name: Option, + ) -> std::result::Result { + // Parse each LayerRef into (OciDigest, ObjectID). + let mut layer_refs: Vec<(composefs_oci::OciDigest, ObjectID)> = + Vec::with_capacity(layers.len()); + for lr in &layers { + let diff_id: composefs_oci::OciDigest = + lr.diff_id.parse().map_err(|e| OciError::InvalidDigest { + message: format!("diff_id {:?}: {e}", lr.diff_id), + })?; + let verity = ObjectID::from_hex(&lr.layer_verity).map_err(|e| { + OciError::InvalidDigest { + message: format!("layer_verity {:?}: {e}", lr.layer_verity), + } + })?; + layer_refs.push((diff_id, verity)); + } + + tokio::task::spawn_blocking(move || { + composefs_oci::layer_sync::finalize_oci_image( + &repo, + manifest_json.as_bytes(), + config_json.as_bytes(), + &layer_refs, + name.as_deref(), + ) + }) + .await + .map_err(|e| OciError::InternalError { + message: format!("spawn_blocking panic: {e}"), + })? + .map( + |((manifest_digest, manifest_verity), (config_digest, config_verity))| { + FinalizeImageReply { + manifest_digest: manifest_digest.to_string(), + manifest_verity: manifest_verity.to_hex(), + config_digest: config_digest.to_string(), + config_verity: config_verity.to_hex(), + } + }, + ) + .map_err(|e| OciError::InternalError { + message: format!("{e:#}"), + }) + } + + match self.lookup_oci(handle)? 
{ + OpenRepo::Sha256(ref r) => { + run_finalize::( + r.clone(), + manifest_json, + config_json, + layers, + name, + ) + .await + } + OpenRepo::Sha512(ref r) => { + run_finalize::( + r.clone(), + manifest_json, + config_json, + layers, + name, + ) + .await + } + } + } } } @@ -1239,14 +1745,31 @@ impl zlink::Listener for ActivatedListener { } } -/// Try to build an [`ActivatedListener`] from a socket-activated fd. +/// An inherited socket-activation fd, classified by its listening state. +pub(crate) enum ActivatedSocket { + /// A pre-connected stream (`varlinkctl exec:` transport): one connection + /// on fd 3. Served via [`ActivatedListener`]. + Connected(ActivatedListener), + /// A listening socket (systemd `.socket` with `Accept=no`, or the test + /// harness): served with a normal accept loop. + Listening(zlink::unix::Listener), +} + +/// Try to classify a socket-activation fd inherited from the service manager. +/// +/// Uses `libsystemd` to receive file descriptors (checks `LISTEN_FDS`/ +/// `LISTEN_PID` and clears the env vars). Returns `None` when the process +/// was not socket-activated. /// -/// Uses `libsystemd` to receive file descriptors passed by the service -/// manager (checks `LISTEN_FDS`/`LISTEN_PID` and clears the env vars). -/// Returns `None` when the process was not socket-activated. +/// When a fd is present its socket type is inspected via `SO_ACCEPTCONN`: +/// - **Listening**: the fd is a bound, listening socket (e.g. passed by the +/// test harness or a systemd `.socket` unit with `Accept=no`) — wrapped as +/// [`ActivatedSocket::Listening`]. +/// - **Connected**: the fd is an already-connected stream (e.g. `varlinkctl +/// exec:`) — wrapped as [`ActivatedSocket::Connected`]. 
#[allow(unsafe_code)] -pub(crate) fn try_activated_listener() -> Result> { - use std::os::fd::{FromRawFd as _, IntoRawFd as _}; +pub(crate) fn try_activated_listener() -> Result> { + use std::os::fd::{FromRawFd as _, IntoRawFd as _, OwnedFd}; let fds = libsystemd::activation::receive_descriptors(true) .map_err(|e| anyhow::anyhow!("Failed to receive activation fds: {e}"))?; @@ -1256,56 +1779,100 @@ pub(crate) fn try_activated_listener() -> Result> { None => return Ok(None), }; - // SAFETY: `libsystemd::activation::receive_descriptors(true)` validated - // the fd and transferred ownership. `into_raw_fd()` consumes the - // `FileDescriptor` wrapper, giving us sole ownership of a valid fd. - let std_stream = unsafe { std::os::unix::net::UnixStream::from_raw_fd(fd.into_raw_fd()) }; - std_stream - .set_nonblocking(true) - .context("setting systemd socket to non-blocking")?; - let tokio_stream = tokio::net::UnixStream::from_std(std_stream) - .context("converting systemd UnixStream to tokio")?; - let zlink_stream = zlink::unix::Stream::from(tokio_stream); - let conn = zlink::Connection::from(zlink_stream); - Ok(Some(ActivatedListener { conn: Some(conn) })) + // SAFETY: `receive_descriptors` validated the fd and transferred ownership + // via `IntoRawFd`. We immediately re-wrap the raw integer as an `OwnedFd` + // so that Rust's ownership rules track the fd from this point forward. + let owned: OwnedFd = unsafe { OwnedFd::from_raw_fd(fd.into_raw_fd()) }; + + // Query SO_ACCEPTCONN to distinguish a pre-connected stream (varlinkctl + // `exec:`) from a listening socket (systemd socket unit / test harness). + let is_listening = rustix::net::sockopt::socket_acceptconn(&owned) + .context("querying SO_ACCEPTCONN on activation fd")?; + + if is_listening { + // The fd is a bound, listening Unix socket. Hand it to zlink's + // Listener adapter, which calls set_nonblocking and wraps it in tokio. 
+ let listener = zlink::unix::Listener::try_from(owned) + .context("converting listening activation fd to zlink Listener")?; + Ok(Some(ActivatedSocket::Listening(listener))) + } else { + // The fd is an already-connected stream (e.g. varlinkctl exec:). + // `From` for `UnixStream` is safe — ownership is transferred. + let std_stream = std::os::unix::net::UnixStream::from(owned); + std_stream + .set_nonblocking(true) + .context("setting systemd socket to non-blocking")?; + let tokio_stream = tokio::net::UnixStream::from_std(std_stream) + .context("converting systemd UnixStream to tokio")?; + let zlink_stream = zlink::unix::Stream::from(tokio_stream); + let conn = zlink::Connection::from(zlink_stream); + Ok(Some(ActivatedSocket::Connected(ActivatedListener { + conn: Some(conn), + }))) + } } -/// Serve `service` on an already-obtained socket-activated listener. +/// Serve `service` on an already-obtained socket-activated connected listener. /// /// Status is logged, never written to stdout: under socket activation (e.g. /// varlinkctl's `exec:` transport) the parent may treat our stdout as part of /// the protocol handshake, and any stray bytes there reset the connection. +/// +/// The server loop runs inside a [`tokio::task::LocalSet`] so request handlers +/// can `spawn_local` `!Send` work (see [`pull_stream`]). Both serve paths +/// wrap exactly one `LocalSet`. pub(crate) async fn serve_activated(service: S, listener: ActivatedListener) -> Result<()> where S: zlink::Service, { log::info!("Listening on systemd-activated socket"); let server = zlink::Server::new(listener, service); - server - .run() + tokio::task::LocalSet::new() + .run_until(server.run()) .await .context("running varlink server (activated)") } -/// Serve `service` either on a systemd-activated connected socket or by -/// binding a fresh Unix socket at `address`. +/// Serve `service` on a listening [`zlink::unix::Listener`] inside a +/// [`tokio::task::LocalSet`]. 
+/// +/// Used for both the socket-activated listening fd path and the normal +/// `bind`-a-fresh-socket path (see [`serve`]). +pub(crate) async fn serve_on_listener(service: S, listener: zlink::unix::Listener) -> Result<()> +where + S: zlink::Service, +{ + let server = zlink::Server::new(listener, service); + tokio::task::LocalSet::new() + .run_until(server.run()) + .await + .context("running varlink server") +} + +/// Serve `service` on the appropriate socket, auto-detecting the source. /// -/// Socket activation takes precedence: if the process was started with a -/// connected activation socket, `address` is ignored. Otherwise `address` -/// must be `Some`, or this returns an error. +/// Resolution order: +/// 1. A socket-activation fd inherited from the service manager: +/// - If listening (`SO_ACCEPTCONN`): serve with a normal accept loop. +/// - If connected (`varlinkctl exec:`): serve single-shot. +/// 2. A freshly bound socket at `address` (which must be `Some`). pub(crate) async fn serve(service: S, address: Option<&Path>) -> Result<()> where S: zlink::Service, { - if let Some(activated) = try_activated_listener()? { - return serve_activated(service, activated).await; + match try_activated_listener()? { + Some(ActivatedSocket::Connected(l)) => return serve_activated(service, l).await, + Some(ActivatedSocket::Listening(listener)) => { + log::info!("Listening on systemd-activated socket"); + return serve_on_listener(service, listener).await; + } + None => {} } let address = address.context("no --address given and not socket-activated")?; let listener = zlink::unix::bind(address) .with_context(|| format!("binding varlink socket at {}", address.display()))?; log::info!("Listening on {}", address.display()); - let server = zlink::Server::new(listener, service); - server.run().await.context("running varlink server") + serve_on_listener(service, listener).await } /// Varlink support for the OCI interface (`org.composefs.Oci`). 
@@ -1689,6 +2256,10 @@ pub mod oci { /// When `more` is `false` the client asked for a single reply, so no progress /// reporter is attached and the stream yields only the terminal `completed` /// frame (or an error). + /// + /// The pull task uses [`tokio::task::spawn_local`], not [`tokio::spawn`]: + /// `composefs_oci::pull` is `!Send` (the `get_layer` zlink proxy returns a + /// `!Send` `ReplyStream`), and the server loop runs inside a `LocalSet`. #[allow(clippy::too_many_arguments)] pub(crate) fn pull_stream( repo: Arc>, @@ -1702,7 +2273,7 @@ pub mod oci { Box< dyn zlink::futures_util::Stream< Item = std::result::Result, OciError>, - > + Send, + >, >, > { use zlink::futures_util::stream; @@ -1720,7 +2291,7 @@ pub mod oci { // and sends it through the channel before the sender drops. Pull errors // are carried out via the task's `JoinHandle` return value. let task_tx = tx.clone(); - let handle = tokio::spawn(async move { + let handle = tokio::task::spawn_local(async move { let opts = composefs_oci::PullOptions { local_fetch, storage_root: storage_root.as_deref(), @@ -1848,6 +2419,164 @@ pub mod oci { /// Description of the failure. message: String, }, + /// The requested layer (by diff-id) is not present in the repository. + NoSuchLayer { + /// The diff-id that was not found. + diff_id: String, + }, + /// A supplied digest/diff-id string was malformed. + InvalidDigest { + /// Human-readable description of the parse failure. + message: String, + }, + /// Received layer content did not hash to the declared diff-id. + /// + /// The stream was NOT committed; the client must retry with correct data. + DiffIdMismatch { + /// The diff_id that was declared by the client. + expected: String, + /// The sha256 digest of the data that was actually received. + actual: String, + }, + /// The request was malformed (e.g. wrong fd count). + InvalidRequest { + /// Human-readable description of what was wrong. 
+ message: String, + }, + /// The total fd count exceeds [`MAX_FDS_PER_FRAME`] for a `more=false` call. + /// + /// The client must retry with `more=true` (streaming mode). + FdLimitExceeded { + /// Total number of fds that would be sent. + fd_count: u64, + /// The per-frame cap that was exceeded. + max_per_frame: u64, + }, + } +} + +/// Reply types for the layer-sync methods of the `org.composefs.Oci` interface, +/// gated behind the `oci` feature (they depend on [`composefs_oci::layer_sync`]). +/// +/// The layer-sync methods (`GetInfo`, `HasLayer`, `GetLayer`, `PutLayer`, `FinalizeImage`) +/// are part of `org.composefs.Oci`; this module merely collects their reply structs +/// (and the `LayerRef` request item) to keep them separate from the rest of the OCI wire types. +#[cfg(feature = "oci")] +pub mod layer_sync { + use super::*; + + /// Reply from `GetInfo`: capability tokens supported by this service. + #[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] + pub struct GetInfoReply { + /// Capability tokens advertised by this service instance. + /// + /// Currently only `"splitdirfdstream-v0"` is defined. + pub features: Vec, + } + + /// Reply from `HasLayer`: whether the layer is present in the repository. + #[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] + pub struct HasLayerReply { + /// Whether the layer splitstream for the given diff-id is present. + pub present: bool, + /// Hex-encoded fs-verity hash of the layer splitstream, if present. + pub layer_verity: Option, + } + + /// Reply from `GetLayer`: the number of diff-directory slots in the logical FD array. + /// + /// `GetLayer` is a **streaming** method (`more`): it yields multiple frames, + /// each carrying a batch of FDs. The client MUST concatenate the FD batches + /// from all frames (in arrival order) to reconstruct the full logical FD array: + /// + /// - `fds[0]` — data pipe read end (carries the `splitdirfdstream` bytes).
+ /// - `fds[1..=dir_count]` — the dirfds region (`dir_count` slots total). The + /// real objects-directory fd sits at a sparse, hash-determined index within + /// this region; the remaining (gap) slots hold inert dummy fds that + /// `reconstruct` never dereferences. The sparse placement is encoded in each + /// `FileBackedData` chunk's `dirfd_index`; the client passes the whole region + /// to `drain_splitdirfdstream` / `reconstruct` unchanged and must NOT assume + /// the dir is at a fixed index. + /// - `fds[dir_count+1..]` — opaque lifetime FDs. The client MUST hold every + /// one of these open until it has finished reading and processing all dir fds, + /// then close them all to signal completion to the server. The count of + /// trailing FDs is unspecified by contract; the client keeps open whatever it + /// does not otherwise recognise. This lifetime-FD convention is part of the + /// `splitdirfdstream-v0` feature. + /// + /// Each transport frame carries at most `MAX_FDS_PER_FRAME` (240) fds, safely + /// below the kernel `SCM_MAX_FD` (253) limit. Every frame carries the same + /// `dir_count`; the client should use the value from any frame (they are all + /// identical). The stream terminates when a frame with `continues=false` is + /// received. + /// + /// A non-streaming (`more=false`) call delivers all fds in a single frame; if + /// the layer requires more than `MAX_FDS_PER_FRAME` fds the call returns + /// `FdLimitExceeded` and the client must retry with `more=true`. + #[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] + pub struct GetLayerReply { + /// Number of diff-directory file descriptors in the full logical FD array + /// (i.e. `fds[1..=dir_count]` after concatenating all frames' batches). + pub dir_count: u32, + } + + /// Reply from `PutLayer`: the verity hash of the imported layer, whether + /// it was already present, and per-object transfer statistics. 
+ /// + /// The object-count fields let the client verify that zero-copy transfer + /// actually took place (e.g. assert `objects_reflinked > 0` in tests) and + /// accumulate aggregate stats for user-facing output. + #[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] + pub struct PutLayerReply { + /// Hex-encoded fs-verity hash of the committed layer splitstream. + pub layer_verity: String, + /// `true` if the layer was already present before this call. + /// + /// The server always drains the pipe regardless (to avoid wedging + /// the client's writer), so the stream is re-imported idempotently. + pub already_present: bool, + + /// Number of objects that were reflinked (FICLONE) into the + /// destination. Non-zero only when source and dest share a filesystem. + #[serde(default)] + pub objects_reflinked: u64, + /// Number of objects hardlinked into the destination (zerocopy mode). + #[serde(default)] + pub objects_hardlinked: u64, + /// Number of objects byte-copied into the destination. + #[serde(default)] + pub objects_copied: u64, + /// Number of objects already present in the destination (skipped). + #[serde(default)] + pub objects_already_present: u64, + } + + /// A single (diff_id, layer_verity) pair passed to `FinalizeImage`. + /// + /// The client builds this list from the `PutLayer` replies it received while + /// copying layers to the destination repository. The order must match the + /// manifest layer order. + #[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] + pub struct LayerRef { + /// OCI diff-id of the layer (e.g. `"sha256:abcd..."`). + pub diff_id: String, + /// Hex-encoded fs-verity hash of the layer splitstream in the destination + /// repository, as returned by `PutLayer`. + pub layer_verity: String, + } + + /// Reply from `FinalizeImage`: digest and verity strings for the manifest + /// and config splitstreams that were written (or already existed). 
+ #[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] + pub struct FinalizeImageReply { + /// OCI digest of the manifest (e.g. `"sha256:abcd..."`). + pub manifest_digest: String, + /// Hex-encoded fs-verity hash of the manifest splitstream. + pub manifest_verity: String, + /// OCI digest of the config (e.g. `"sha256:abcd..."`). + pub config_digest: String, + /// Hex-encoded fs-verity hash of the config splitstream. + pub config_verity: String, } } @@ -1859,6 +2588,10 @@ pub mod oci { pub mod proxy { #![allow(missing_docs)] + #[cfg(feature = "oci")] + use super::layer_sync::{ + FinalizeImageReply, GetInfoReply, GetLayerReply, HasLayerReply, LayerRef, PutLayerReply, + }; #[cfg(feature = "oci")] use super::oci::{ ListImagesReply, OciComputeIdReply, OciError, OciFsckReply, OciInspectReply, PullProgress, @@ -1868,6 +2601,8 @@ pub mod proxy { RepositoryError, }; #[cfg(feature = "oci")] + pub use composefs_oci::varlink_types::GetLayerParams; + #[cfg(feature = "oci")] use zlink::futures_util::Stream; /// Typed client for the `org.composefs.Repository` interface. @@ -1975,8 +2710,755 @@ pub mod proxy { storage_root: Option<&str>, bootable: bool, ) -> zlink::Result>>>; + + /// Query capability tokens supported by the service. + async fn get_info(&mut self) -> zlink::Result>; + + /// Check whether a layer is present in the repository. + async fn has_layer( + &mut self, + handle: u64, + diff_id: &str, + ) -> zlink::Result>; + + /// Stream the layer as a `splitdirfdstream` with full hardened fd-transport + /// contract (sparse dirfds, keepalive, lifetime fds, multi-frame). + /// + /// Drive the returned stream to completion (until `continues=false`), + /// concatenating each frame's fd batch in order to reconstruct the full + /// logical FD array `[pipe_read, dirfds.., lifetime_fds..]`. 
+ #[zlink(more, return_fds)] + async fn get_layer( + &mut self, + handle: u64, + params: GetLayerParams, + ) -> zlink::Result< + impl zlink::futures_util::Stream< + Item = zlink::Result<(Result, Vec)>, + >, + >; + + /// Receive a layer as a `splitdirfdstream` from the client and import + /// it into the server's repository with diff_id verification. + /// + /// `fds[0]` is the pipe read end; `fds[1..]` are source object dirs. + async fn put_layer( + &mut self, + handle: u64, + diff_id: &str, + zerocopy: bool, + #[zlink(fds)] fds: Vec, + ) -> zlink::Result>; + + /// Finalize an OCI image after all layers have been imported. + /// + /// `layers` must be in manifest layer order; each entry pairs the layer's + /// OCI diff-id with the hex verity returned by `PutLayer`. `name` is the + /// tag to assign (optional). Idempotent. + async fn finalize_image( + &mut self, + handle: u64, + manifest_json: &str, + config_json: &str, + layers: Vec, + name: Option<&str>, + ) -> zlink::Result>; } } #[cfg(feature = "oci")] pub(crate) use oci::*; + +/// Spawn a `CfsctlService` in-process over a Unix socket pair for testing. +/// +/// Returns a connected client [`zlink::unix::Connection`] and a +/// [`std::thread::JoinHandle`] for the server thread. +/// +/// Mirrors the pattern in `composefs-storage`'s `spawn_in_process`: the zlink +/// server is `!Send` so it runs on a dedicated OS thread with its own +/// current-thread Tokio runtime and [`tokio::task::LocalSet`]. +/// +/// The server thread exits when the client connection is closed. 
+#[cfg(feature = "oci")] +pub(crate) fn spawn_in_process( + service: CfsctlService, +) -> std::io::Result<(zlink::unix::Connection, std::thread::JoinHandle<()>)> { + let (client_std, server_std) = std::os::unix::net::UnixStream::pair()?; + client_std.set_nonblocking(true)?; + server_std.set_nonblocking(true)?; + + let client_stream = tokio::net::UnixStream::from_std(client_std)?; + let client_zlink = zlink::unix::Stream::from(client_stream); + let client_conn = zlink::Connection::from(client_zlink); + + let handle = std::thread::Builder::new() + .name("cfsctl-service-server".into()) + .spawn(move || { + let rt = match tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + { + Ok(rt) => rt, + Err(e) => { + log::error!("CfsctlService server runtime build failed: {e:#?}"); + return; + } + }; + let local = tokio::task::LocalSet::new(); + local.block_on(&rt, async move { + let server_stream = match tokio::net::UnixStream::from_std(server_std) { + Ok(s) => s, + Err(e) => { + log::error!("CfsctlService server stream conversion failed: {e:#?}"); + return; + } + }; + let server_zlink = zlink::unix::Stream::from(server_stream); + let listener = zlink::ReadyListener::new(server_zlink); + let server = zlink::Server::new(listener, service); + if let Err(e) = server.run().await { + log::warn!("CfsctlService in-process server error: {e:#?}"); + } + }); + })?; + + Ok((client_conn, handle)) +} + +#[cfg(all(test, feature = "oci"))] +mod layer_sync_tests { + //! In-process round-trip tests for the layer-sync methods of the + //! `org.composefs.Oci` interface. + //! + //! These mirror the in-process transport test in + //! `composefs-storage`'s `cstor_service.rs`. 
+ + use std::io::Read as _; + use std::os::fd::AsFd as _; + use std::sync::Arc; + + use composefs::fsverity::{FsVerityHashValue as _, Sha256HashValue}; + use composefs::repository::{Repository, RepositoryConfig}; + use composefs_splitdirfdstream::reconstruct; + + use super::layer_sync::GetLayerReply; + use super::oci::OciError; + use super::proxy::{OciProxy, RepositoryProxy as _}; + use super::{CfsctlService, spawn_in_process}; + use composefs_oci::varlink_types::GetLayerParams; + + /// Drive a streaming `get_layer` call to completion, collecting all FDs. + /// + /// Returns `(reply, all_fds)` where `all_fds` is the concatenated FD vector + /// from all frames in arrival order: + /// ```text + /// [ pipe_read | dirfds region (dir_count) | lifetime fds ] + /// ``` + async fn collect_get_layer( + client: &mut C, + handle: u64, + diff_id: &str, + ) -> Result<(GetLayerReply, Vec), OciError> + where + C: OciProxy, + { + use zlink::futures_util::StreamExt as _; + + let params = GetLayerParams { + diff_id: Some(diff_id.to_owned()), + storage: None, + }; + let mut stream = std::pin::pin!( + client + .get_layer(handle, params) + .await + .expect("get_layer transport error") + ); + + let mut all_fds: Vec = Vec::new(); + let mut last_reply: Option = None; + + while let Some(item) = stream.next().await { + let (result, fds) = item.expect("get_layer stream error"); + match result { + Ok(reply) => { + last_reply = Some(reply); + } + Err(e) => return Err(e), + } + all_fds.extend(fds); + } + + Ok((last_reply.expect("get_layer stream was empty"), all_fds)) + } + + /// Like `collect_get_layer` but splits the fd array into: + /// - `pipe_and_dirfds`: `fds[0..=dir_count]` (pipe + dirfds region) + /// - `lifetime_fds`: `fds[dir_count+1..]` (keepalive + extras) + /// + /// Returns `(reply, pipe_and_dirfds, lifetime_fds)`; the count is `reply.dir_count`.
+ async fn collect_get_layer_split( + client: &mut C, + handle: u64, + diff_id: &str, + ) -> ( + GetLayerReply, + Vec, + Vec, + ) + where + C: OciProxy, + { + let (reply, mut all_fds) = collect_get_layer(client, handle, diff_id) + .await + .expect("get_layer failed"); + let dir_count = reply.dir_count as usize; + // pipe_and_dirfds = fds[0..=dir_count] (1 + dir_count) + let pipe_and_dirfds_len = 1 + dir_count; + assert!( + all_fds.len() >= pipe_and_dirfds_len, + "expected at least {pipe_and_dirfds_len} fds, got {}", + all_fds.len() + ); + let lifetime_fds = all_fds.split_off(pipe_and_dirfds_len); + (reply, all_fds, lifetime_fds) + } + + /// Build a trivial tar stream with one file of `file_size` bytes and return the + /// raw bytes. Content is deterministic (repeating `i % 251`). + fn build_tar_layer(file_size: usize) -> Vec { + let content: Vec = (0..file_size).map(|i| (i % 251) as u8).collect(); + let mut builder = ::tar::Builder::new(vec![]); + let mut header = ::tar::Header::new_ustar(); + header.set_uid(0); + header.set_gid(0); + header.set_mode(0o644); + header.set_entry_type(::tar::EntryType::Regular); + header.set_size(file_size as u64); + builder + .append_data(&mut header, format!("file_{file_size}"), &content[..]) + .unwrap(); + builder.into_inner().unwrap() + } + + /// Create an insecure test repo. + fn create_test_repo() -> (Arc>, tempfile::TempDir) { + let tempdir = tempfile::TempDir::new().unwrap(); + let (repo, _) = Repository::init_path( + rustix::fs::CWD, + tempdir.path().join("repo"), + RepositoryConfig::default().set_insecure(), + ) + .unwrap(); + (Arc::new(repo), tempdir) + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_layer_sync_in_process() { + // --- set up a repo and import a synthetic layer --- + let (repo, _tempdir) = create_test_repo(); + + // Build a tar layer that has one large (>64-byte = external) file.
+ let tar_bytes = build_tar_layer(128 * 1024); // 128 KiB — external object + let diff_id = composefs_oci::sha256_content_digest(&tar_bytes); + let (verity, _stats) = + composefs_oci::import_layer(&repo, &diff_id, None, tar_bytes.as_slice()) + .await + .expect("import_layer"); + + // Record expected cat() output for comparison later. + let mut expected = Vec::::new(); + { + let mut reader = repo + .open_stream("", Some(&verity), Some(composefs_oci::LAYER_CONTENT_TYPE)) + .expect("open_stream for cat"); + reader.cat(&repo, &mut expected).expect("cat"); + } + + let repo_path = _tempdir.path().join("repo").to_str().unwrap().to_string(); + + // --- build and start the in-process service --- + let service = CfsctlService::insecure_for_test(); + let (mut client, _server_handle) = spawn_in_process(service).unwrap(); + + // OpenRepository to get a handle + metadata. + let open_reply = client + .open_repository(Some(&repo_path), None, None) + .await + .unwrap() + .expect("open_repository"); + let handle = open_reply.handle; + + // Validate the new metadata fields. 
+ assert_eq!( + open_reply.hash_algorithm.as_deref(), + Some("sha256"), + "hash_algorithm must be sha256 for a Sha256HashValue repo" + ); + assert!( + open_reply.objects_device_id.is_some(), + "objects_device_id must be reported" + ); + + // --- GetInfo --- + let info = client.get_info().await.unwrap().expect("get_info"); + assert!( + info.features.contains(&"splitdirfdstream-v0".to_string()), + "expected splitdirfdstream-v0 in features" + ); + + // --- HasLayer: present --- + let has = client + .has_layer(handle, diff_id.as_ref()) + .await + .unwrap() + .expect("has_layer"); + assert!(has.present, "layer must be present"); + assert_eq!( + has.layer_verity.as_deref(), + Some(verity.to_hex().as_str()), + "verity mismatch" + ); + + // --- HasLayer: absent --- + let fake_digest = "sha256:0000000000000000000000000000000000000000000000000000000000000000"; + let has_absent = client + .has_layer(handle, fake_digest) + .await + .unwrap() + .expect("has_layer absent"); + assert!(!has_absent.present, "absent layer must not be present"); + assert!(has_absent.layer_verity.is_none()); + + // --- GetLayer: e2e round-trip --- + // The new streaming form: collect all frames' fds, then split into + // pipe+dirfds (wire positions 0..=dir_count) and lifetime fds (rest). + let (get_reply, pipe_and_dirfds, lifetime_fds) = + collect_get_layer_split(&mut client, handle, diff_id.as_ref()).await; + let dir_count = get_reply.dir_count as usize; + + // Keep lifetime fds alive until we are done reading the stream. + let _lifetime_fds = lifetime_fds; + + // fds[0] = pipe read; fds[1..=dir_count] = dirfds region (sparse). + let pipe_fd = pipe_and_dirfds[0].as_fd(); + let dir_fds: Vec<_> = pipe_and_dirfds[1..=dir_count] + .iter() + .map(|f| f.as_fd()) + .collect(); + + // Read the splitdirfdstream from the pipe to EOF. 
+ let pipe_owned = rustix::io::dup(pipe_fd).expect("dup pipe read"); + let mut pipe_file = std::fs::File::from(pipe_owned); + let mut stream_bytes = Vec::new(); + pipe_file.read_to_end(&mut stream_bytes).unwrap(); + assert!(!stream_bytes.is_empty(), "stream must be non-empty"); + + // Reconstruct via the sparse dirfds region. + let mut actual = Vec::new(); + reconstruct(stream_bytes.as_slice(), &dir_fds, &mut actual) + .expect("reconstruct splitdirfdstream"); + + similar_asserts::assert_eq!( + actual, + expected, + "reconstructed layer must equal cat() output" + ); + + // --- GetLayer: unknown diff-id --- + let err = collect_get_layer(&mut client, handle, fake_digest).await; + match err { + Err(super::oci::OciError::NoSuchLayer { .. }) => {} + other => panic!("expected NoSuchLayer, got {other:?}"), + } + } + + /// Full GetLayer→PutLayer relay: serve repo A via one in-process server, + /// call `get_layer` to obtain the stream fds, then relay them to a second + /// in-process server hosting repo B via `put_layer`. + /// + /// Asserts: + /// - `put_layer` succeeds with `already_present = false`. + /// - repo B has the layer committed and its `cat` output matches repo A. + /// - A second `put_layer` with the same data returns `already_present = true`. 
+ #[tokio::test(flavor = "multi_thread")] + async fn test_put_layer_relay() { + // --- set up repo A with a layer containing an external object --- + let (repo_a, _td_a) = create_test_repo(); + let tar_bytes = build_tar_layer(128 * 1024); // 128 KiB — external object + let diff_id = composefs_oci::sha256_content_digest(&tar_bytes); + let (verity_a, _) = + composefs_oci::import_layer(&repo_a, &diff_id, None, tar_bytes.as_slice()) + .await + .expect("import_layer into repo_a"); + + // expected cat() output for later comparison + let mut expected = Vec::::new(); + { + let mut reader = repo_a + .open_stream("", Some(&verity_a), Some(composefs_oci::LAYER_CONTENT_TYPE)) + .expect("open_stream for cat"); + reader.cat(&repo_a, &mut expected).expect("cat"); + } + let repo_a_path = _td_a.path().join("repo").to_str().unwrap().to_string(); + + // --- set up repo B (empty) --- + let (repo_b, _td_b) = create_test_repo(); + let repo_b_path = _td_b.path().join("repo").to_str().unwrap().to_string(); + + // --- start two in-process services --- + let service_a = CfsctlService::insecure_for_test(); + let (mut client_a, _srv_a) = spawn_in_process(service_a).unwrap(); + + let service_b = CfsctlService::insecure_for_test(); + let (mut client_b, _srv_b) = spawn_in_process(service_b).unwrap(); + + // Open repos via each service. + let handle_a = client_a + .open_repository(Some(&repo_a_path), None, None) + .await + .unwrap() + .expect("open_repository A") + .handle; + let handle_b = client_b + .open_repository(Some(&repo_b_path), None, None) + .await + .unwrap() + .expect("open_repository B") + .handle; + + // --- GetLayer from service A --- + // Collect all frames; split into pipe+dirfds and lifetime fds. 
+ let (get_reply, pipe_and_dirfds, lifetime_fds) = + collect_get_layer_split(&mut client_a, handle_a, diff_id.as_ref()).await; + let dir_count = get_reply.dir_count as usize; + + // --- PutLayer into service B (first time) --- + // PutLayer receives fds[0..=dir_count] (pipe + dirfds region). + // We hold lifetime_fds open until put_layer returns. + let put_fds = pipe_and_dirfds; // fds[0] = pipe, fds[1..=dir_count] = dirs + let put_reply = client_b + .put_layer(handle_b, diff_id.as_ref(), false, put_fds) + .await + .unwrap() + .expect("put_layer"); + // Drop lifetime fds after put_layer completes. + drop(lifetime_fds); + + assert!( + !put_reply.already_present, + "first put_layer must report already_present = false" + ); + assert!( + dir_count > 0, + "dir_count must be > 0 (dirfds region has at least one slot)" + ); + + // The layer has one large external object; at least one object must + // have been stored via copy (same-host in-process, but tmpfs may not + // support reflink). Verify the stats are populated. + let total_stored = + put_reply.objects_reflinked + put_reply.objects_hardlinked + put_reply.objects_copied; + assert!( + total_stored + put_reply.objects_already_present > 0, + "put_layer must report at least one object stored, got {put_reply:?}" + ); + + // Verify repo B now has the layer. + let content_id = composefs_oci::layer_content_id(&diff_id); + assert!( + repo_b + .has_stream(&content_id) + .expect("has_stream B") + .is_some(), + "repo B must have the layer after put_layer" + ); + + // Verify the cat() output matches. 
+ let verity_b: Sha256HashValue = + Sha256HashValue::from_hex(&put_reply.layer_verity).expect("parse layer_verity hex"); + let mut actual = Vec::::new(); + { + let mut reader = repo_b + .open_stream("", Some(&verity_b), Some(composefs_oci::LAYER_CONTENT_TYPE)) + .expect("open_stream B for cat"); + reader.cat(&repo_b, &mut actual).expect("cat B"); + } + similar_asserts::assert_eq!(actual, expected, "repo B cat must equal repo A cat"); + + // --- PutLayer a second time (idempotent): already_present = true --- + let (get_reply2, pipe_and_dirfds2, lifetime_fds2) = + collect_get_layer_split(&mut client_a, handle_a, diff_id.as_ref()).await; + let _ = get_reply2; + + let put_reply2 = client_b + .put_layer(handle_b, diff_id.as_ref(), false, pipe_and_dirfds2) + .await + .unwrap() + .expect("put_layer 2nd"); + drop(lifetime_fds2); + + assert!( + put_reply2.already_present, + "second put_layer must report already_present = true" + ); + } + + /// Negative: `put_layer` with a wrong diff_id must return `DiffIdMismatch` + /// and repo B must NOT have the stream committed. 
+ #[tokio::test(flavor = "multi_thread")] + async fn test_put_layer_wrong_diff_id() { + let (repo_a, _td_a) = create_test_repo(); + let tar_bytes = build_tar_layer(128 * 1024); + let correct_diff_id = composefs_oci::sha256_content_digest(&tar_bytes); + let (_verity_a, _) = + composefs_oci::import_layer(&repo_a, &correct_diff_id, None, tar_bytes.as_slice()) + .await + .expect("import_layer"); + let repo_a_path = _td_a.path().join("repo").to_str().unwrap().to_string(); + + let (_repo_b, _td_b) = create_test_repo(); + let repo_b_path = _td_b.path().join("repo").to_str().unwrap().to_string(); + + let service_a = CfsctlService::insecure_for_test(); + let (mut client_a, _srv_a) = spawn_in_process(service_a).unwrap(); + let service_b = CfsctlService::insecure_for_test(); + let (mut client_b, _srv_b) = spawn_in_process(service_b).unwrap(); + + let handle_a = client_a + .open_repository(Some(&repo_a_path), None, None) + .await + .unwrap() + .expect("open_repository A") + .handle; + let handle_b = client_b + .open_repository(Some(&repo_b_path), None, None) + .await + .unwrap() + .expect("open_repository B") + .handle; + + // Get layer fds from A (collect streaming frames, split off lifetime fds). + let (_get_reply, pipe_and_dirfds, lifetime_fds) = + collect_get_layer_split(&mut client_a, handle_a, correct_diff_id.as_ref()).await; + + // Deliberately supply the wrong diff_id to service B. + let wrong_diff_id = + "sha256:0000000000000000000000000000000000000000000000000000000000000000"; + let put_err = client_b + .put_layer(handle_b, wrong_diff_id, false, pipe_and_dirfds) + .await + .unwrap(); + drop(lifetime_fds); + + match put_err { + Err(super::oci::OciError::DiffIdMismatch { expected, actual }) => { + assert_eq!(expected, wrong_diff_id); + assert_eq!(actual, correct_diff_id.to_string()); + } + other => panic!("expected DiffIdMismatch, got {other:?}"), + } + + // The wrong stream must NOT be committed in repo B. 
+ let wrong_content_id = composefs_oci::layer_content_id( + &wrong_diff_id.parse::().unwrap(), + ); + assert!( + _repo_b + .has_stream(&wrong_content_id) + .expect("has_stream B") + .is_none(), + "repo B must NOT have a stream for the wrong diff_id" + ); + } + + // ------------------------------------------------------------------------- + // Helpers shared by the finalize_image test + // ------------------------------------------------------------------------- + + /// Build a minimal tar layer with a valid OCI directory structure. + /// + /// Creates `./`, `./usr/`, `./usr/share/`, and one data file of `payload_size` + /// bytes at `./usr/share/data_`. + fn build_oci_tar_layer(payload_size: usize) -> Vec { + let mut builder = ::tar::Builder::new(vec![]); + + for (path, is_dir) in &[("./", true), ("./usr/", true), ("./usr/share/", true)] { + let mut hdr = ::tar::Header::new_ustar(); + hdr.set_entry_type(::tar::EntryType::Directory); + hdr.set_uid(0); + hdr.set_gid(0); + hdr.set_mode(0o755); + hdr.set_size(0); + let _ = is_dir; // suppress unused warning + builder + .append_data(&mut hdr, path, std::io::empty()) + .unwrap(); + } + + let content: Vec = (0..payload_size).map(|i| (i % 251) as u8).collect(); + let mut file_hdr = ::tar::Header::new_ustar(); + file_hdr.set_entry_type(::tar::EntryType::Regular); + file_hdr.set_uid(0); + file_hdr.set_gid(0); + file_hdr.set_mode(0o644); + file_hdr.set_size(payload_size as u64); + builder + .append_data( + &mut file_hdr, + format!("./usr/share/data_{payload_size}"), + content.as_slice(), + ) + .unwrap(); + + builder.into_inner().unwrap() + } + + /// Build a minimal OCI config JSON with the given diff-id strings. + /// + /// Produces a JSON that `oci_spec::image::ImageConfiguration` would accept, + /// without pulling in the `oci_spec` builders (not available in composefs-ctl). 
+ fn make_config_json(diff_ids: &[String]) -> String { + let ids: Vec = diff_ids.iter().map(|d| format!("\"{d}\"")).collect(); + format!( + r#"{{"architecture":"amd64","os":"linux","rootfs":{{"type":"layers","diff_ids":[{}]}},"config":{{}}}}"#, + ids.join(",") + ) + } + + /// Build a minimal OCI manifest JSON referencing `config_digest_str`. + fn make_manifest_json( + config_json: &str, + config_digest_str: &str, + diff_ids: &[String], + ) -> String { + let layer_entries: Vec = diff_ids + .iter() + .map(|d| { + format!( + r#"{{"mediaType":"application/vnd.oci.image.layer.v1.tar+gzip","digest":"{d}","size":1}}"# + ) + }) + .collect(); + format!( + r#"{{"schemaVersion":2,"mediaType":"application/vnd.oci.image.manifest.v1+json","config":{{"mediaType":"application/vnd.oci.image.config.v1+json","digest":"{config_digest_str}","size":{}}},"layers":[{}]}}"#, + config_json.len(), + layer_entries.join(",") + ) + } + + /// Round-trip test for the `FinalizeImage` varlink method. + /// + /// Imports a layer directly into repo B, then calls `finalize_image` via + /// the in-process varlink service on B, and asserts: + /// - The reply digests are non-empty. + /// - The manifest and config splitstreams now exist in repo B. + /// - The composefs EROFS image was generated. + #[tokio::test(flavor = "multi_thread")] + async fn test_finalize_image_roundtrip() { + use composefs_oci::OciDigest; + + let (repo_b, _td_b) = create_test_repo(); + let repo_b_path = _td_b.path().join("repo").to_str().unwrap().to_string(); + + // Import a layer directly into repo_b. + let tar_bytes = build_oci_tar_layer(128 * 1024); + let diff_id = composefs_oci::sha256_content_digest(&tar_bytes); + let (layer_verity, _) = + composefs_oci::import_layer(&repo_b, &diff_id, None, tar_bytes.as_slice()) + .await + .expect("import_layer into repo_b"); + + // Build config + manifest JSON. 
+ let diff_ids = vec![diff_id.to_string()]; + let config_json = make_config_json(&diff_ids); + let config_digest = composefs_oci::sha256_content_digest(config_json.as_bytes()); + let manifest_json = make_manifest_json(&config_json, config_digest.as_ref(), &diff_ids); + + // Start the in-process service on repo B. + let service_b = CfsctlService::insecure_for_test(); + let (mut client_b, _srv_b) = spawn_in_process(service_b).unwrap(); + + let handle_b = client_b + .open_repository(Some(&repo_b_path), None, None) + .await + .unwrap() + .expect("open_repository B") + .handle; + + // Build the LayerRef list. + let layers = vec![super::layer_sync::LayerRef { + diff_id: diff_id.to_string(), + layer_verity: layer_verity.to_hex(), + }]; + + // Call finalize_image. + let reply = client_b + .finalize_image( + handle_b, + &manifest_json, + &config_json, + layers, + Some("finalize-test:v1"), + ) + .await + .unwrap() + .expect("finalize_image"); + + // Digests must be non-empty strings. + assert!( + !reply.manifest_digest.is_empty(), + "manifest_digest must be non-empty" + ); + assert!( + !reply.manifest_verity.is_empty(), + "manifest_verity must be non-empty" + ); + assert!( + !reply.config_digest.is_empty(), + "config_digest must be non-empty" + ); + assert!( + !reply.config_verity.is_empty(), + "config_verity must be non-empty" + ); + + // Manifest and config splitstreams must now exist in repo_b. + let manifest_digest: OciDigest = reply.manifest_digest.parse().unwrap(); + let config_digest2: OciDigest = reply.config_digest.parse().unwrap(); + + let manifest_id = composefs_oci::oci_image::manifest_identifier(&manifest_digest); + assert!( + repo_b + .has_stream(&manifest_id) + .expect("has_stream manifest") + .is_some(), + "manifest splitstream must exist in repo_b" + ); + + // The config stream key follows the pattern "oci-config-". 
+ let config_id2 = format!("oci-config-{config_digest2}"); + assert!( + repo_b + .has_stream(&config_id2) + .expect("has_stream config") + .is_some(), + "config splitstream must exist in repo_b" + ); + + // EROFS must have been generated. + let manifest_verity = + Sha256HashValue::from_hex(&reply.manifest_verity).expect("parse manifest_verity"); + let erofs = composefs_oci::composefs_erofs_for_manifest( + &repo_b, + &manifest_digest, + Some(&manifest_verity), + repo_b.erofs_version(), + ) + .expect("composefs_erofs_for_manifest"); + assert!( + erofs.is_some(), + "EROFS image must exist after finalize_image" + ); + } +} diff --git a/crates/composefs-integration-tests/src/main.rs b/crates/composefs-integration-tests/src/main.rs index dc6dbef1..be1dc2a2 100644 --- a/crates/composefs-integration-tests/src/main.rs +++ b/crates/composefs-integration-tests/src/main.rs @@ -14,7 +14,7 @@ use std::fs; use std::path::{Path, PathBuf}; -use anyhow::{Result, bail}; +use anyhow::{Context as _, Result, bail}; use libtest_mimic::{Arguments, Trial}; pub(crate) use composefs_integration_tests::{INTEGRATION_TESTS, integration_test}; @@ -57,6 +57,45 @@ pub(crate) fn cfsctl() -> Result { ) } +/// Bind a listening Unix socket at a fresh tempdir path and spawn `cfsctl` +/// against it via the systemd socket-activation protocol (`LISTEN_FDS=1`, the +/// listening socket on fd 3, `LISTEN_PID` set in the child). The socket is +/// already listening before the child starts, so callers may connect +/// immediately — no polling needed. The path exists on disk, so `varlinkctl` +/// (which connects by path) works unchanged. +/// +/// Returns the spawned child, the tempdir (keep alive for the socket's +/// lifetime), and the socket path. 
+pub(crate) fn spawn_activated_cfsctl() -> Result<(std::process::Child, tempfile::TempDir, PathBuf)> +{ + use std::os::fd::OwnedFd; + use std::os::unix::net::UnixListener; + use std::sync::Arc; + + use cap_std_ext::cmdext::{CapStdExtCommandExt, CmdFds, SystemdFdName}; + + let cfsctl = cfsctl()?; + let socket_dir = tempfile::tempdir()?; + let socket = socket_dir.path().join("varlink.sock"); + + let listener = UnixListener::bind(&socket) + .with_context(|| format!("binding varlink listener at {}", socket.display()))?; + let listener_fd: Arc = Arc::new(listener.into()); + + let mut cmd = std::process::Command::new(&cfsctl); + // Bare invocation — no subcommand, no --address. cfsctl serves on the + // inherited listening fd via run_if_socket_activated(). + // + // IMPORTANT: do NOT call cmd.env()/.envs() here — take_fds() sets the + // LISTEN_* vars via setenv in pre_exec, and Command::env would clobber + // them by building a separate envp array. + let fds = CmdFds::new_systemd_fds([(listener_fd, SystemdFdName::new("varlink"))]); + cmd.take_fds(fds); + let child = cmd.spawn().context("spawning socket-activated cfsctl")?; + + Ok((child, socket_dir, socket)) +} + /// Create a test rootfs fixture inside `parent` and return its path. /// /// Includes a file large enough (128 KiB) to avoid erofs inlining so that diff --git a/crates/composefs-integration-tests/src/tests/copy_image.rs b/crates/composefs-integration-tests/src/tests/copy_image.rs new file mode 100644 index 00000000..b65c7960 --- /dev/null +++ b/crates/composefs-integration-tests/src/tests/copy_image.rs @@ -0,0 +1,687 @@ +//! Integration tests for the `oci copy` CLI command and `copy_image` API. 
+ +use std::path::Path; +use std::sync::Arc; + +use anyhow::{Context, Result, bail, ensure}; +use xshell::{Shell, cmd}; + +use composefs_oci::OciDigest; +use composefs_oci::composefs::fsverity::{Sha256HashValue, Sha512HashValue}; +use composefs_oci::composefs::repository::{Repository, RepositoryConfig}; +use composefs_oci::layer_sync::finalize_oci_image; +use composefs_oci::test_util::{build_oci_tar_layer, make_config_json, make_manifest_json}; +use composefs_oci::{import_layer, layer_content_id, sha256_content_digest}; + +use composefs_ctl::varlink::RepositoryError; +use composefs_ctl::varlink::proxy::RepositoryProxy; + +use crate::{cfsctl, integration_test}; + +// ── Shared helpers ───────────────────────────────────────────────────────── + +/// Initialise an insecure SHA-256 repository at `path`. +fn init_sha256_repo(path: &std::path::Path) -> Result>> { + std::fs::create_dir_all(path)?; + let fd = rustix::fs::open( + path, + rustix::fs::OFlags::CLOEXEC | rustix::fs::OFlags::RDONLY, + 0.into(), + )?; + let (repo, _) = Repository::::init_path( + &fd, + ".", + RepositoryConfig::default().set_insecure(), + )?; + Ok(Arc::new(repo)) +} + +/// Build a two-layer synthetic image in `repo` and tag it `name`. +/// +/// Layer 1: 10 bytes — small enough to be fully inlined. +/// Layer 2: 256 KiB — large enough to produce an external object that +/// exercises the reflink / hardlink path. +/// +/// Returns `(manifest_json_bytes, config_json_bytes, diff_id_layer2_string)`. 
+fn build_and_import_test_image( + repo: &Arc>, + name: &str, +) -> Result<(Vec, Vec, String)> { + let rt = tokio::runtime::Runtime::new()?; + + let tar1 = build_oci_tar_layer(10); + let tar2 = build_oci_tar_layer(256 * 1024); + + let diff_id1 = sha256_content_digest(&tar1); + let diff_id2 = sha256_content_digest(&tar2); + let diff_id2_str = diff_id2.to_string(); + + let (verity1, verity2) = rt.block_on(async { + let (v1, _) = import_layer(repo, &diff_id1, None, tar1.as_slice()) + .await + .context("import layer 1")?; + let (v2, _) = import_layer(repo, &diff_id2, None, tar2.as_slice()) + .await + .context("import layer 2")?; + Ok::<_, anyhow::Error>((v1, v2)) + })?; + + let diff_ids = vec![diff_id1.to_string(), diff_id2_str.clone()]; + let config_json = make_config_json(&diff_ids); + let config_digest = sha256_content_digest(&config_json); + let manifest_json = make_manifest_json(&config_json, config_digest.as_ref(), &diff_ids); + + let layer_refs = vec![(diff_id1, verity1), (diff_id2, verity2)]; + finalize_oci_image(repo, &manifest_json, &config_json, &layer_refs, Some(name)) + .context("finalize_oci_image")?; + + Ok((manifest_json, config_json, diff_id2_str)) +} + +// ── Test A: correctness (unprivileged) ───────────────────────────────────── + +fn test_copy_image_correctness() -> Result<()> { + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + + // Both repos under the same parent tempdir (same filesystem). + let parent = tempfile::tempdir()?; + let repo_a_path = parent.path().join("repo-a"); + let repo_b_path = parent.path().join("repo-b"); + + // Initialise repo A and synthesise a two-layer image. + let repo_a = init_sha256_repo(&repo_a_path)?; + let (manifest_json_a, config_json_a, diff_id2_str) = + build_and_import_test_image(&repo_a, "copyme:v1")?; + + // Compute digests for repo A for later comparison. 
+ let manifest_digest_a = sha256_content_digest(&manifest_json_a); + let config_digest_a = sha256_content_digest(&config_json_a); + + // Initialise repo B (empty). + let _repo_b_init = init_sha256_repo(&repo_b_path)?; + drop(_repo_b_init); // close — cfsctl will open it + + // Run `cfsctl oci copy` as a subprocess. This is a sanity check of the + // CLI: it must succeed and land the image in the destination. Structured + // transfer stats are intentionally not exposed by the CLI (use varlink for + // that); we verify the outcome by inspecting the destination repo below. + let repo_a_str = repo_a_path.to_str().unwrap(); + let repo_b_str = repo_b_path.to_str().unwrap(); + let tag = "copyme:v1"; + cmd!( + sh, + "{cfsctl} --repo {repo_b_str} --insecure oci copy {tag} --from {repo_a_str} --name {tag}" + ) + .run() + .context("cfsctl oci copy failed")?; + + // ── Assert: manifest and config digests are identical in repo B ─────── + // Open repo B in-process and compare. + let repo_b = Repository::::open_path(rustix::fs::CWD, &repo_b_path)?; + let img_b = composefs_oci::oci_image::OciImage::open_ref(&repo_b, "copyme:v1") + .context("opening copyme:v1 in repo B")?; + + ensure!( + img_b.manifest_digest() == &manifest_digest_a, + "manifest digest mismatch: src={manifest_digest_a}, dest={}", + img_b.manifest_digest() + ); + ensure!( + img_b.config_digest() == &config_digest_a, + "config digest mismatch: src={config_digest_a}, dest={}", + img_b.config_digest() + ); + + // ── Assert: the 256 KiB external object exists in B and is byte-identical ── + // Find the object path using the diff_id of layer 2. + let diff_id2: composefs_oci::OciDigest = diff_id2_str + .parse() + .context("parsing diff_id2 as OciDigest")?; + let content_id = composefs_oci::layer_content_id(&diff_id2); + + let verity_a = repo_a + .has_stream(&content_id)? + .context("repo_a missing layer 2 stream")?; + let verity_b = repo_b + .has_stream(&content_id)? 
+ .context("repo_b missing layer 2 stream after copy")?; + + // Both repos should have resolved to the same verity hash (same content). + ensure!( + verity_a == verity_b, + "verity hashes differ: src={verity_a:?}, dest={verity_b:?}" + ); + + // Read the external object from both repos and compare bytes. + let obj_bytes_a = repo_a + .read_object(&verity_a) + .context("read_object from repo_a")?; + let obj_bytes_b = repo_b + .read_object(&verity_b) + .context("read_object from repo_b")?; + ensure!( + obj_bytes_a == obj_bytes_b, + "external object content differs between repos" + ); + + Ok(()) +} +integration_test!(test_copy_image_correctness); + +// ── Test B: deterministic reflink on XFS (privileged) ────────────────────── + +fn privileged_copy_image_reflink() -> Result<()> { + use crate::tests::privileged::{LoopTempDir, require_privileged}; + + if require_privileged("privileged_copy_image_reflink")?.is_some() { + return Ok(()); + } + + // Skip if mkfs.xfs is not available. + if !std::path::Path::new("/usr/sbin/mkfs.xfs").exists() + && !std::path::Path::new("/sbin/mkfs.xfs").exists() + { + println!("SKIP: mkfs.xfs not available"); + return Ok(()); + } + + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + + // Both repos on the same XFS reflink-capable loop mount. + let fs = LoopTempDir::xfs_reflink()?; + let repo_a_path = fs.path().join("repo-a"); + let repo_b_path = fs.path().join("repo-b"); + + let repo_a = init_sha256_repo(&repo_a_path)?; + build_and_import_test_image(&repo_a, "copyme:v1")?; + + let _repo_b = init_sha256_repo(&repo_b_path)?; + + // Run `cfsctl oci copy --zerocopy` via the CLI. + let repo_a_str = repo_a_path.to_str().unwrap(); + let repo_b_str = repo_b_path.to_str().unwrap(); + let tag = "copyme:v1"; + cmd!( + sh, + "{cfsctl} --repo {repo_b_str} --insecure oci copy {tag} --from {repo_a_str} --name {tag} --zerocopy" + ) + .run() + .context("cfsctl oci copy --zerocopy failed")?; + + // Verify correctness: manifest + config digests match. 
+ let repo_b = Repository::::open_path(rustix::fs::CWD, &repo_b_path)?; + let img_a = composefs_oci::oci_image::OciImage::open_ref(&repo_a, "copyme:v1") + .context("open copyme:v1 in repo_a")?; + let img_b = composefs_oci::oci_image::OciImage::open_ref(&repo_b, "copyme:v1") + .context("open copyme:v1 in repo_b")?; + + ensure!( + img_a.manifest_digest() == img_b.manifest_digest(), + "manifest digest mismatch after reflink copy" + ); + ensure!( + img_a.config_digest() == img_b.config_digest(), + "config digest mismatch after reflink copy" + ); + + // On XFS with reflinks, the objects should share extents. That is NOT + // asserted here: a reflinked (FICLONE) copy is a separate inode sharing + // data extents with its source, whereas a hardlink is the same inode + // with nlink > 1, so proving either needs FIEMAP or stat on the objects. + // A lighter check: the copy succeeded with --zerocopy (which is expected + // to fail if reflink/hardlink was not possible), and the digests match. + + Ok(()) +} +integration_test!(privileged_copy_image_reflink); + +// ── Test C: varlink cross-process fd-passing copy ─────────────────────────── + +/// A running socket-activated `cfsctl` subprocess serving a Unix socket. +/// +/// Kills the child process on drop so a test panic does not leak it. +struct VarlinkProc { + child: std::process::Child, + socket: std::path::PathBuf, + /// Keep the tempdir holding the socket alive for the process's lifetime. + _socket_dir: tempfile::TempDir, +} + +impl Drop for VarlinkProc { + fn drop(&mut self) { + let _ = self.child.kill(); + let _ = self.child.wait(); + } +} + +impl VarlinkProc { + /// Spawn `cfsctl` via systemd socket-activation with a pre-bound listening + /// socket. The socket is already listening before the child starts, so + /// callers may connect immediately — no polling needed. 
+ fn spawn() -> Result { + let (child, socket_dir, socket) = crate::spawn_activated_cfsctl()?; + Ok(VarlinkProc { + child, + socket, + _socket_dir: socket_dir, + }) + } + + fn socket(&self) -> &Path { + &self.socket + } +} + +/// Open `repo_path` on a varlink server and return its handle, via the typed +/// zlink proxy. No new client is connected: the `OpenRepository` call is +/// issued over the caller-supplied `conn`. +/// +/// The handle is reused for all subsequent calls on the same connection. +async fn open_repo(conn: &mut zlink::unix::Connection, repo_path: &Path) -> Result { + let path_str = repo_path.to_str().context("repo path is not valid UTF-8")?; + let reply = conn + .open_repository(Some(path_str), None, None) + .await + .context("zlink transport error calling OpenRepository")?; + match reply { + Ok(r) => Ok(r.handle), + Err(RepositoryError::InvalidSpec { message }) => { + bail!("OpenRepository: InvalidSpec: {message}") + } + Err(e) => bail!("OpenRepository failed: {e:?}"), + } +} + +/// Copy an image between two varlink subprocess servers using `copy_image`. +/// +/// This exercises the cross-process SCM_RIGHTS fd passing path that +/// in-process tests cannot cover. 
+fn test_copy_image_via_varlink() -> Result<()> { + let parent = tempfile::tempdir()?; + let repo_a_path = parent.path().join("repo-a"); + let repo_b_path = parent.path().join("repo-b"); + + let repo_a = init_sha256_repo(&repo_a_path)?; + let (manifest_json_a, config_json_a, _diff_id2_str) = + build_and_import_test_image(&repo_a, "copyme:v1")?; + + let manifest_digest_a = sha256_content_digest(&manifest_json_a); + let config_digest_a = sha256_content_digest(&config_json_a); + + { + let _repo_b_init = init_sha256_repo(&repo_b_path)?; + } + drop(repo_a); + + let svc_a = VarlinkProc::spawn().context("spawning varlink server A")?; + let svc_b = VarlinkProc::spawn().context("spawning varlink server B")?; + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .context("building tokio runtime")?; + + let image: composefs_ctl::OciReference = "copyme:v1".parse()?; + + let finalize_reply = rt.block_on(async { + let mut conn_a = zlink::unix::connect(svc_a.socket()) + .await + .context("connecting to server A")?; + let mut conn_b = zlink::unix::connect(svc_b.socket()) + .await + .context("connecting to server B")?; + + let handle_a = open_repo(&mut conn_a, &repo_a_path).await?; + let handle_b = open_repo(&mut conn_b, &repo_b_path).await?; + + let finalize = composefs_ctl::copy_image( + &mut conn_a, + &mut conn_b, + handle_a, + handle_b, + &image, + Some("copyme:v1"), + false, + ) + .await?; + + // Fsck the destination over the wire. + let fsck = conn_b + .fsck(handle_b, None) + .await + .context("Fsck transport error")? 
+ .map_err(|e: RepositoryError| anyhow::anyhow!("Fsck failed: {e:?}"))?; + ensure!(fsck.ok, "fsck failed after copy: {fsck:?}"); + + Ok::<_, anyhow::Error>(finalize) + })?; + + ensure!( + finalize_reply.manifest_digest == manifest_digest_a.to_string(), + "manifest_digest mismatch" + ); + ensure!( + finalize_reply.config_digest == config_digest_a.to_string(), + "config_digest mismatch" + ); + + // Open repo B in-process and verify the image landed correctly. + let repo_b = Repository::::open_path(rustix::fs::CWD, &repo_b_path)?; + let img_b = composefs_oci::oci_image::OciImage::open_ref(&repo_b, "copyme:v1") + .context("opening copyme:v1 in repo B")?; + + ensure!( + img_b.manifest_digest() == &manifest_digest_a, + "manifest digest mismatch in-process" + ); + ensure!( + img_b.config_digest() == &config_digest_a, + "config digest mismatch in-process" + ); + + Ok(()) +} +integration_test!(test_copy_image_via_varlink); + +// ── Test D: cross-algorithm copy via varlink (sha256 → sha512) ──────────── + +/// Initialise an insecure SHA-512 repository at `path`. +fn init_sha512_repo(path: &std::path::Path) -> Result<()> { + use composefs_oci::composefs::fsverity::Algorithm; + std::fs::create_dir_all(path)?; + let fd = rustix::fs::open( + path, + rustix::fs::OFlags::CLOEXEC | rustix::fs::OFlags::RDONLY, + 0.into(), + )?; + let config = RepositoryConfig::new(Algorithm::SHA512).set_insecure(); + Repository::::init_path(&fd, ".", config)?; + Ok(()) +} + +/// Copy an image from a sha256 repository to a sha512 repository over varlink. +/// +/// This proves that cross-algorithm copy works end-to-end: the +/// splitdirfdstream carries only algorithm-independent data (inline bytes + +/// transient source-object locators) and the destination re-computes every +/// object's fs-verity digest under its own (sha512) algorithm on import. 
+/// +/// The test also validates the new `OpenRepositoryReply` metadata fields +/// (`hash_algorithm`, `objects_device_id`) introduced for explicit zerocopy +/// negotiation between client and servers. +fn test_copy_image_cross_algorithm() -> Result<()> { + let parent = tempfile::tempdir()?; + let repo_a_path = parent.path().join("repo-sha256"); + let repo_b_path = parent.path().join("repo-sha512"); + + let repo_a = init_sha256_repo(&repo_a_path)?; + let (manifest_json_a, _config_json_a, _) = build_and_import_test_image(&repo_a, "copyme:v1")?; + let manifest_digest_a = sha256_content_digest(&manifest_json_a); + drop(repo_a); + + init_sha512_repo(&repo_b_path)?; + + let svc_a = VarlinkProc::spawn().context("spawning server A (sha256)")?; + let svc_b = VarlinkProc::spawn().context("spawning server B (sha512)")?; + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?; + + let image: composefs_ctl::OciReference = "copyme:v1".parse()?; + + let finalize_reply = rt.block_on(async { + let mut conn_a = zlink::unix::connect(svc_a.socket()).await?; + let mut conn_b = zlink::unix::connect(svc_b.socket()).await?; + + let handle_a = open_repo(&mut conn_a, &repo_a_path).await?; + let handle_b = open_repo(&mut conn_b, &repo_b_path).await?; + + let finalize = composefs_ctl::copy_image( + &mut conn_a, + &mut conn_b, + handle_a, + handle_b, + &image, + Some("copyme:v1"), + false, + ) + .await?; + + // Fsck the sha512 destination. + let fsck = conn_b + .fsck(handle_b, None) + .await + .context("Fsck transport error")? + .map_err(|e: RepositoryError| anyhow::anyhow!("Fsck failed: {e:?}"))?; + ensure!(fsck.ok, "fsck failed on sha512 dest: {fsck:?}"); + + Ok::<_, anyhow::Error>(finalize) + })?; + + // OCI content digests must match regardless of fs-verity algorithm. + ensure!( + finalize_reply.manifest_digest == manifest_digest_a.to_string(), + "manifest_digest mismatch", + ); + + // The verity strings should be sha512 (128 hex chars). 
+ ensure!( + finalize_reply.manifest_verity.len() == 128, + "sha512 manifest_verity should be 128 hex chars, got {} chars", + finalize_reply.manifest_verity.len(), + ); + + // Verify the image exists in the sha512 repo. + let repo_b = Repository::::open_path(rustix::fs::CWD, &repo_b_path)?; + let img_b = composefs_oci::oci_image::OciImage::open_ref(&repo_b, "copyme:v1") + .context("opening copyme:v1 in sha512 repo B")?; + ensure!( + img_b.manifest_digest() == &manifest_digest_a, + "manifest digest mismatch in sha512 repo", + ); + + Ok(()) +} +integration_test!(test_copy_image_cross_algorithm); + +// ── Test E: CLI cross-algorithm copy (sha256 → sha512) ──────────────────── + +/// Run `cfsctl oci copy` from a sha256 repo to a sha512 repo via the CLI. +/// +/// Proves that the CLI dispatch correctly opens the destination with its own +/// algorithm (2×2 monomorphisation) and that non-zerocopy cross-algorithm +/// copy succeeds end-to-end. +fn test_copy_image_cross_algorithm_cli() -> Result<()> { + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + + let parent = tempfile::tempdir()?; + let repo_a_path = parent.path().join("repo-sha256"); + let repo_b_path = parent.path().join("repo-sha512"); + + let repo_a = init_sha256_repo(&repo_a_path)?; + let (manifest_json_a, _config_json_a, _) = build_and_import_test_image(&repo_a, "copyme:v1")?; + let manifest_digest_a = sha256_content_digest(&manifest_json_a); + drop(repo_a); + + init_sha512_repo(&repo_b_path)?; + + let repo_a_str = repo_a_path.to_str().unwrap(); + let repo_b_str = repo_b_path.to_str().unwrap(); + let tag = "copyme:v1"; + cmd!( + sh, + "{cfsctl} --repo {repo_b_str} --insecure oci copy {tag} --from {repo_a_str} --name {tag}" + ) + .run() + .context("cfsctl oci copy (sha256→sha512) should succeed without --zerocopy")?; + + // Verify the image landed in the sha512 repo. 
+ let repo_b = Repository::::open_path(rustix::fs::CWD, &repo_b_path)?; + let img_b = composefs_oci::oci_image::OciImage::open_ref(&repo_b, "copyme:v1") + .context("opening copyme:v1 in sha512 repo B")?; + ensure!( + img_b.manifest_digest() == &manifest_digest_a, + "manifest_digest mismatch: expected={manifest_digest_a}, got={}", + img_b.manifest_digest(), + ); + + Ok(()) +} +integration_test!(test_copy_image_cross_algorithm_cli); + +// ── Test F: zerocopy + algorithm mismatch → error ───────────────────────── + +/// `cfsctl oci copy --zerocopy` from sha256 to sha512 must fail with a clear +/// error rather than silently degrading. +fn test_copy_image_zerocopy_algo_mismatch() -> Result<()> { + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + + let parent = tempfile::tempdir()?; + let repo_a_path = parent.path().join("repo-sha256"); + let repo_b_path = parent.path().join("repo-sha512"); + + let repo_a = init_sha256_repo(&repo_a_path)?; + let _ = build_and_import_test_image(&repo_a, "copyme:v1")?; + drop(repo_a); + + init_sha512_repo(&repo_b_path)?; + + let repo_a_str = repo_a_path.to_str().unwrap(); + let repo_b_str = repo_b_path.to_str().unwrap(); + let tag = "copyme:v1"; + let result = cmd!( + sh, + "{cfsctl} --repo {repo_b_str} --insecure oci copy {tag} --from {repo_a_str} --name {tag} --zerocopy" + ) + .ignore_status() + .read_stderr() + .context("running cfsctl oci copy --zerocopy")?; + + ensure!( + result.contains("zerocopy") && result.contains("hash algorithm"), + "expected error about zerocopy + hash algorithm mismatch, got: {result}" + ); + + Ok(()) +} +integration_test!(test_copy_image_zerocopy_algo_mismatch); + +// ── Test G: OCI artifact copy (no rootfs.diff_ids) ──────────────────────── + +/// Build a synthetic OCI artifact (not a container image) in repo A, then +/// copy it to repo B via `cfsctl oci copy`. Artifacts have no +/// `rootfs.diff_ids` in their config, so this exercises the manifest-layer +/// fallback in `copy_image`. 
+fn test_copy_artifact() -> Result<()> { + use composefs_oci::composefs::repository::Repository; + use containers_image_proxy::oci_spec::image::{ + DescriptorBuilder, ImageManifestBuilder, MediaType, + }; + + let sh = Shell::new()?; + let cfsctl = cfsctl()?; + + let parent = tempfile::tempdir()?; + let repo_a_path = parent.path().join("repo-a"); + let repo_b_path = parent.path().join("repo-b"); + + // ── Build an artifact in repo A ─────────────────────────────────────── + let repo_a = init_sha256_repo(&repo_a_path)?; + + // Artifact payload: a small SBOM-like blob (not a tar). + let blob_data = br#"{"spdxVersion":"SPDX-2.3","name":"test-artifact"}"#; + let blob_digest = sha256_content_digest(blob_data); + + // Store the blob as an object + layer splitstream. + let blob_object_id = repo_a.ensure_object(blob_data)?; + let layer_id = composefs_oci::layer_identifier(&blob_digest); + // Use OCI_BLOB_CONTENT_TYPE for non-tar artifact layers. + let mut layer_stream = repo_a.create_stream(composefs_oci::skopeo::OCI_BLOB_CONTENT_TYPE)?; + layer_stream.add_external_size(blob_data.len() as u64); + layer_stream.write_reference(blob_object_id)?; + let layer_verity = repo_a.write_stream(layer_stream, &layer_id, None)?; + + // Empty config (the OCI 1.1 artifact config pattern). + let config_json = b"{}"; + let config_digest = sha256_content_digest(config_json); + + // Build the manifest with EmptyJSON config media type (not ImageConfig). 
+ let config_desc = DescriptorBuilder::default() + .media_type(MediaType::EmptyJSON) + .digest(config_digest.as_ref().to_string()) + .size(config_json.len() as i64) + .build() + .context("building config descriptor")?; + + let layer_desc = DescriptorBuilder::default() + .media_type(MediaType::Other("text/spdx+json".to_string())) + .digest(blob_digest.as_ref().to_string()) + .size(blob_data.len() as i64) + .build() + .context("building layer descriptor")?; + + let manifest = ImageManifestBuilder::default() + .schema_version(2u32) + .media_type(MediaType::ImageManifest) + .config(config_desc) + .layers(vec![layer_desc]) + .build() + .context("building manifest")?; + + let manifest_json = serde_json::to_vec(&manifest).context("serializing manifest")?; + let manifest_digest = sha256_content_digest(&manifest_json); + + let layer_refs = vec![(blob_digest.clone(), layer_verity)]; + finalize_oci_image( + &repo_a, + &manifest_json, + config_json, + &layer_refs, + Some("artifact:v1"), + ) + .context("finalize artifact")?; + + // Verify it's not a container image. 
+ let img = composefs_oci::oci_image::OciImage::open_ref(&repo_a, "artifact:v1")?; + ensure!( + !img.is_container_image(), + "expected artifact, got container image" + ); + drop(repo_a); + + // ── Init repo B and copy via CLI ────────────────────────────────────── + { + let _repo_b = init_sha256_repo(&repo_b_path)?; + } + + let repo_a_str = repo_a_path.to_str().unwrap(); + let repo_b_str = repo_b_path.to_str().unwrap(); + cmd!( + sh, + "{cfsctl} --repo {repo_b_str} --insecure oci copy artifact:v1 --from {repo_a_str} --name artifact:v1" + ) + .run() + .context("cfsctl oci copy failed for artifact")?; + + // ── Verify the artifact landed in repo B ────────────────────────────── + let repo_b = Repository::::open_path(rustix::fs::CWD, &repo_b_path)?; + let img_b = composefs_oci::oci_image::OciImage::open_ref(&repo_b, "artifact:v1") + .context("opening artifact:v1 in repo B")?; + + ensure!( + !img_b.is_container_image(), + "copied artifact became a container image" + ); + ensure!( + img_b.manifest_digest() == &manifest_digest, + "manifest digest mismatch after artifact copy" + ); + + Ok(()) +} +integration_test!(test_copy_artifact); diff --git a/crates/composefs-integration-tests/src/tests/mod.rs b/crates/composefs-integration-tests/src/tests/mod.rs index 6a89a406..54242ec3 100644 --- a/crates/composefs-integration-tests/src/tests/mod.rs +++ b/crates/composefs-integration-tests/src/tests/mod.rs @@ -1,6 +1,7 @@ //! Integration test modules, organized by execution environment. 
pub mod cli; +pub mod copy_image; pub mod cstor; pub mod digest_stability; pub mod oci_compat; diff --git a/crates/composefs-integration-tests/src/tests/privileged.rs b/crates/composefs-integration-tests/src/tests/privileged.rs index a5c5f922..fd6c673d 100644 --- a/crates/composefs-integration-tests/src/tests/privileged.rs +++ b/crates/composefs-integration-tests/src/tests/privileged.rs @@ -639,19 +639,19 @@ integration_test!(privileged_pull_readonly_repo); /// Supports ext4 (with verity) and XFS (with reflinks). The backing sparse /// file is 512 MB — large enough for a synthetic OCI image in /// containers-storage plus a composefs repo. -struct LoopTempDir { +pub(crate) struct LoopTempDir { mountpoint: PathBuf, _backing: tempfile::TempDir, } impl LoopTempDir { /// Create a loop-mounted ext4 filesystem with verity support. - fn ext4_verity() -> Result { + pub(crate) fn ext4_verity() -> Result { Self::create("mkfs.ext4", &["-q", "-O", "verity", "-b", "4096"]) } /// Create a loop-mounted XFS filesystem with reflink support. - fn xfs_reflink() -> Result { + pub(crate) fn xfs_reflink() -> Result { Self::create("mkfs.xfs", &["-q", "-m", "reflink=1"]) } @@ -672,7 +672,7 @@ impl LoopTempDir { }) } - fn path(&self) -> &Path { + pub(crate) fn path(&self) -> &Path { &self.mountpoint } } diff --git a/crates/composefs-integration-tests/src/tests/varlink.rs b/crates/composefs-integration-tests/src/tests/varlink.rs index aaa279b8..b73bcb28 100644 --- a/crates/composefs-integration-tests/src/tests/varlink.rs +++ b/crates/composefs-integration-tests/src/tests/varlink.rs @@ -1,8 +1,8 @@ //! Integration tests for the `cfsctl` varlink RPC API. //! //! These drive the service the same way an external consumer would: they spawn -//! `cfsctl varlink --address ` (or `cfsctl oci varlink ...`) as a -//! separate process and talk to it over the Unix socket. +//! `cfsctl` as a separate process via systemd socket-activation and talk to it +//! over a pre-bound Unix socket. //! //! 
## Two clients, on purpose //! @@ -28,9 +28,9 @@ //! typed error variant. Avoid converting the whole suite to either side. //! //! A single service answers both the `org.composefs.Repository` and -//! `org.composefs.Oci` interfaces on one socket, so the `repository()` and -//! `oci()` spawn helpers below differ only in which CLI subcommand starts the -//! (identical) combined service. +//! `org.composefs.Oci` interfaces on one socket. The `repository()` and +//! `oci()` spawn helpers are retained for historical reasons — both now use +//! socket activation and serve the identical combined interface set. //! //! ## Handle-based API //! @@ -43,7 +43,6 @@ use std::path::Path; use std::process::Command; -use std::time::{Duration, Instant}; use anyhow::{Context, Result, bail}; use composefs_ctl::varlink::oci::{OciError, OciInspectReply, PullProgress}; @@ -79,31 +78,24 @@ impl Drop for VarlinkService { } impl VarlinkService { - /// Spawn `cfsctl --address `, wait for the socket to - /// appear, then open `repo` via `OpenRepository` and cache its handle. + /// Spawn `cfsctl` via systemd socket-activation with a pre-bound listening + /// socket, then open `repo` via `OpenRepository` and cache its handle. + /// + /// The socket is already listening before the child starts, so no polling + /// is needed — the child reaches the socket via the inherited fd 3. The + /// socket path also exists on disk, so `varlinkctl` (which connects by + /// path) works unchanged. /// /// The varlink service opens no repository at startup, so the test repo is /// opened explicitly here; the returned handle is auto-injected into /// subsequent calls. The repository's insecure mode is auto-detected from /// its metadata, so no open flags are needed. 
- fn spawn(repo: &Path, service_args: &[&str]) -> Result { - let cfsctl = cfsctl()?; - let socket_dir = tempfile::tempdir()?; - let socket = socket_dir.path().join("varlink.sock"); - - let mut cmd = Command::new(&cfsctl); - cmd.args(service_args); - cmd.arg("--address").arg(&socket); - let child = cmd.spawn().context("spawning cfsctl varlink server")?; - - // Wait (briefly) for the service to bind the socket. - let deadline = Instant::now() + Duration::from_secs(10); - while !socket.exists() { - if Instant::now() > deadline { - bail!("timed out waiting for varlink socket {}", socket.display()); - } - std::thread::sleep(Duration::from_millis(20)); - } + /// + /// Socket activation serves the full combined interface set + /// (`org.composefs.Repository` + `org.composefs.Oci`) regardless of which + /// of the historical CLI entry points would have been used. + fn spawn(repo: &Path) -> Result { + let (child, socket_dir, socket) = crate::spawn_activated_cfsctl()?; let handle = Self::open_repository(&socket, repo)?; @@ -154,16 +146,19 @@ impl VarlinkService { .context("OpenRepository reply missing numeric 'handle'") } - /// Spawn the combined service via the top-level `varlink` subcommand. + /// Spawn the combined service (historically the `varlink` subcommand entry + /// point). Both `repository` and `oci` now use socket activation, which + /// serves the full combined interface set on one socket. fn repository(repo: &Path) -> Result { - Self::spawn(repo, &["varlink"]) + Self::spawn(repo) } - /// Spawn the combined service via the `oci varlink` subcommand. Serves the - /// same interfaces as [`Self::repository`]; kept to exercise both CLI entry - /// points. + /// Spawn the combined service (historically the `oci varlink` subcommand + /// entry point). Kept so tests that call this entry point continue to + /// exercise a real service spawn; identical to [`Self::repository`] under + /// socket activation. 
fn oci(repo: &Path) -> Result { - Self::spawn(repo, &["oci", "varlink"]) + Self::spawn(repo) } fn socket_str(&self) -> String { diff --git a/crates/composefs-oci/Cargo.toml b/crates/composefs-oci/Cargo.toml index 317ff197..513b5b0d 100644 --- a/crates/composefs-oci/Cargo.toml +++ b/crates/composefs-oci/Cargo.toml @@ -14,16 +14,18 @@ version.workspace = true default = ["containers-storage"] test = ["tar", "rand", "composefs/test"] boot = ["composefs-boot"] -containers-storage = ["dep:cstorage", "dep:base64", "cstorage/userns-helper"] -varlink = ['dep:zlink-core', 'composefs/varlink'] +containers-storage = ["dep:cstorage", "dep:base64", "varlink", "cstorage/layer-transfer"] +varlink = ['dep:zlink', 'dep:zlink-core', 'composefs/varlink'] [dependencies] anyhow = { version = "1.0.87", default-features = false } fn-error-context = "0.2" async-compression = { version = "0.4.0", default-features = false, features = ["tokio", "zstd", "gzip"] } base64 = { version = "0.22", default-features = false, features = ["std"], optional = true } +composefs-splitdirfdstream = { path = "../composefs-splitdirfdstream", version = "0.7.0", features = ["tokio"] } bytes = { version = "1", default-features = false } composefs = { workspace = true } +zlink = { workspace = true, optional = true } zlink-core = { workspace = true, optional = true } composefs-boot = { workspace = true, optional = true } flate2 = { version = "1.0", default-features = false, features = ["zlib"] } diff --git a/crates/composefs-oci/src/cstor.rs b/crates/composefs-oci/src/cstor.rs index 33004f5f..e3ba45f5 100644 --- a/crates/composefs-oci/src/cstor.rs +++ b/crates/composefs-oci/src/cstor.rs @@ -14,7 +14,7 @@ //! //! When importing from containers-storage, we: //! 1. Open the storage and locate the image -//! 2. For each layer, iterate through the tar-split metadata +//! 2. For each layer, stream it via the `org.composefs.Oci` zlink service //! 3. 
For large files (> INLINE_CONTENT_MAX_V0), reflink directly to objects/ //! 4. For small files, embed inline in the splitstream //! 5. Handle overlay whiteouts properly @@ -38,7 +38,6 @@ //! println!("Stats: {:?}", stats); //! ``` -use std::os::unix::fs::FileExt; use std::os::unix::io::OwnedFd; use std::sync::Arc; @@ -46,30 +45,26 @@ use anyhow::{Context, Result}; use base64::Engine; use composefs::{ - INLINE_CONTENT_MAX_V0, fsverity::FsVerityHashValue, - repository::{ImportContext, ObjectStoreMethod, Repository}, + repository::{ImportContext, Repository}, }; use cstorage::{ - Image, Layer, ProxiedTarSplitItem, Storage, StorageProxy, TarSplitFdStream, TarSplitItem, - can_bypass_file_permissions, + CstorLayerService, Image, Layer, Storage, StorageProxy, can_bypass_file_permissions, + spawn_cstor_in_process, }; +use crate::varlink_types::{GetLayerParams, OciProxy as _, StorageLocator}; + // Re-export init_if_helper for consumers that need userns helper support pub use cstorage::init_if_helper; -use crate::oci_image::manifest_identifier; use crate::progress::{ComponentId, ProgressEvent, ProgressUnit, SharedReporter}; -use crate::skopeo::{OCI_CONFIG_CONTENT_TYPE, OCI_MANIFEST_CONTENT_TYPE, TAR_LAYER_CONTENT_TYPE}; -use crate::{ContentAndVerity, ImportStats, OciDigest, config_identifier, layer_identifier}; +use crate::{ContentAndVerity, ImportStats, OciDigest, layer_identifier}; /// Full result of a cstor import: manifest and config digests + verities. type CstorImportResult = (ContentAndVerity, ContentAndVerity); -/// Zero padding buffer for tar block alignment (512 bytes max needed). -const ZERO_PADDING: [u8; 512] = [0u8; 512]; - /// Import a container image from containers-storage into the composefs repository. 
/// /// This function reads an image from the local containers-storage (podman/buildah) @@ -102,29 +97,23 @@ pub async fn import_from_containers_storage( ) -> Result<(CstorImportResult, ImportStats)> { // Check if we can access files directly or need a proxy if can_bypass_file_permissions() { - // Direct access - use blocking implementation - let repo = Arc::clone(repo); - let image_id = image_id.to_owned(); - let reference = reference.map(|s| s.to_owned()); + // Direct access via in-process CstorLayerService. let storage_root = storage_root.map(|p| p.to_path_buf()); let additional_image_stores: Vec = additional_image_stores .iter() .map(|p| p.to_path_buf()) .collect(); - tokio::task::spawn_blocking(move || { - import_from_containers_storage_direct( - &repo, - &image_id, - reference.as_deref(), - zerocopy, - storage_root.as_deref(), - &additional_image_stores, - reporter, - ) - }) + import_from_containers_storage_direct( + repo, + image_id, + reference, + zerocopy, + storage_root.as_deref(), + &additional_image_stores, + reporter, + ) .await - .context("spawn_blocking failed")? } else { // The proxied (rootless) path uses a userns helper process that does // its own storage discovery. Explicit storage paths are not yet @@ -138,71 +127,73 @@ pub async fn import_from_containers_storage( } } -/// Direct (privileged) implementation of containers-storage import. +/// Resolved image metadata needed by the async layer-import loop. +struct ResolvedImageLayers { + /// The `Image` handle (for finalize_import). + image: Image, + /// Per-layer: (storage_root_path_string, storage_layer_id, diff_id). + layers: Vec<(String, String, OciDigest)>, +} + +/// Synchronous helper: open stores, find image, resolve layer IDs + diff_ids. /// -/// All file I/O operations in this function are blocking, so it must be called -/// from a blocking context (e.g., via `spawn_blocking`). -fn import_from_containers_storage_direct( - repo: &Arc>, +/// Runs entirely in blocking context. 
We open the stores with explicit paths +/// so that we can pass the path string to the `CstorLayerService` per layer. +fn resolve_image_layers( image_id: &str, - reference: Option<&str>, - zerocopy: bool, - storage_root: Option<&std::path::Path>, - additional_image_stores: &[std::path::PathBuf], - reporter: SharedReporter, -) -> Result<(CstorImportResult, ImportStats)> { - let mut stats = ImportStats::default(); - let mut ctx = ImportContext::default(); + storage_root: Option, + additional_image_stores: Vec, +) -> Result { + // Build an ordered list of store root paths to search. An explicit + // `storage_root` skips auto-discovery; the additional image stores are + // appended after the primary store(s) in either case. + let mut store_paths: Vec = match &storage_root { + Some(root) => vec![root.to_string_lossy().into_owned()], + None => storage_search_paths(), + }; + for p in &additional_image_stores { + store_paths.push(p.to_string_lossy().into_owned()); + } - // Build the list of stores to search. When an explicit root is given, - // skip auto-discovery entirely; otherwise discover from the standard - // locations and $STORAGE_OPTS. - let mut stores = if let Some(root) = storage_root { - vec![ - Storage::open(root) - .with_context(|| format!("Failed to open storage root at {}", root.display()))?, - ] - } else { - // Auto-discover; tolerate failure only if extra stores are provided. - match Storage::discover_all() { - Ok(s) => s, - Err(e) if !additional_image_stores.is_empty() => { - tracing::warn!( - "containers-storage auto-discovery failed ({e:#}), \ - using additional image stores only" - ); - Vec::new() - } - Err(e) => return Err(e).context("Failed to discover containers-storage"), + // Open all stores. 
+ let mut stores: Vec<(String, Storage)> = Vec::with_capacity(store_paths.len()); + let mut open_errors: Vec = Vec::new(); + for path in &store_paths { + match Storage::open(path) { + Ok(s) => stores.push((path.clone(), s)), + Err(e) => open_errors.push(format!("{path}: {e:#}")), } - }; - for path in additional_image_stores { - stores.push(Storage::open(path).with_context(|| { - format!( - "Failed to open additional image store at {}", - path.display() - ) - })?); + } + if stores.is_empty() { + anyhow::bail!( + "Could not open any containers-storage root: {}", + open_errors.join("; ") + ); } - // Search all stores for the image (primary first, then additional image stores) - let (_image_store, image) = stores + // Search all stores for the image. + let image = stores .iter() - .find_map(|s| { + .find_map(|(_path, s)| { Image::open(s, image_id) .or_else(|_| s.find_image_by_name(image_id)) .ok() - .map(|img| (s, img)) }) .with_context(|| format!("Failed to find image {image_id} in any storage"))?; - // Get the storage layer IDs — layers may span stores (e.g. base layers - // in an additional image store, new layers in the primary) + // storage_layer_ids() takes &[Storage]; collect owned Storage values by + // re-opening the already-open stores (cheap: just another fd dup via cap-std). + // We re-open so that storage_layer_ids can have an owned &[Storage] slice. + let storage_vec: Vec = stores + .iter() + .map(|(path, _s)| Storage::open(path)) + .collect::, _>>() + .context("Failed to re-open stores for storage_layer_ids")?; let storage_layer_ids = image - .storage_layer_ids(&stores) + .storage_layer_ids(&storage_vec) .context("Failed to get storage layer IDs from image")?; - // Get the config to access diff_ids + // Get the config to access diff_ids. 
let config = image.config().context("Failed to read image config")?; let diff_ids: Vec = config .rootfs() @@ -211,7 +202,6 @@ fn import_from_containers_storage_direct( .map(|s| s.parse::().context("parsing diff_id")) .collect::>()?; - // Ensure layer count matches anyhow::ensure!( storage_layer_ids.len() == diff_ids.len(), "Layer count mismatch: {} layers in storage, {} diff_ids in config", @@ -219,10 +209,88 @@ fn import_from_containers_storage_direct( diff_ids.len() ); - stats.layers = storage_layer_ids.len() as u64; + // For each layer, find which store path it lives in. + let mut layers = Vec::with_capacity(storage_layer_ids.len()); + for (storage_layer_id, diff_id) in storage_layer_ids.into_iter().zip(diff_ids) { + let store_path = stores + .iter() + .find_map(|(path, s)| Layer::open(s, &storage_layer_id).ok().map(|_| path.clone())) + .with_context(|| format!("Could not find store containing layer {storage_layer_id}"))?; + layers.push((store_path, storage_layer_id, diff_id)); + } + + Ok(ResolvedImageLayers { image, layers }) +} + +/// Return the ordered list of storage root path strings to search, mirroring +/// `Storage::discover_all()` internals so we can pair each store with a path. +fn storage_search_paths() -> Vec { + // TODO: Read graphroot and additional image dirs from storage.conf, + // and respect the CONTAINERS_STORAGE_CONF environment variable. 
+ let mut paths = Vec::new(); + + if let Ok(root) = std::env::var("CONTAINERS_STORAGE_ROOT") { + paths.push(root); + } + + if let Ok(home) = std::env::var("HOME") { + if let Ok(xdg) = std::env::var("XDG_DATA_HOME") { + paths.push(format!("{xdg}/containers/storage")); + } + paths.push(format!("{home}/.local/share/containers/storage")); + } + + paths.push("/var/lib/containers/storage".to_string()); + + if let Ok(opts) = std::env::var("STORAGE_OPTS") { + for item in opts.split(',') { + let item = item.trim(); + if let Some(p) = item.strip_prefix("additionalimagestore=") { + paths.push(p.to_string()); + } + } + } - let mut layer_refs = Vec::with_capacity(storage_layer_ids.len()); - for (storage_layer_id, diff_id) in storage_layer_ids.iter().zip(diff_ids.iter()) { + paths +} + +/// Direct (privileged) async implementation of containers-storage import. +/// +/// Layer discovery is done synchronously via `spawn_blocking`, then each +/// layer is imported asynchronously through the in-process `CstorLayerService` +/// (`org.composefs.Oci` zlink interface). +async fn import_from_containers_storage_direct( + repo: &Arc>, + image_id: &str, + reference: Option<&str>, + zerocopy: bool, + storage_root: Option<&std::path::Path>, + additional_image_stores: &[std::path::PathBuf], + reporter: SharedReporter, +) -> Result<(CstorImportResult, ImportStats)> { + let mut stats = ImportStats::default(); + let mut ctx = ImportContext::default(); + + // Resolve image layers synchronously (filesystem + JSON work). 
+ let image_id_owned = image_id.to_owned(); + let storage_root_owned = storage_root.map(|p| p.to_path_buf()); + let additional_owned: Vec = additional_image_stores.to_vec(); + let resolved = tokio::task::spawn_blocking(move || { + resolve_image_layers(&image_id_owned, storage_root_owned, additional_owned) + }) + .await + .context("spawn_blocking(resolve_image_layers) failed")??; + + stats.layers = resolved.layers.len() as u64; + + // Spawn the in-process CstorLayerService server (synchronous call, no .await). + // `server` keeps the server's dedicated thread alive for the whole layer loop; + // it is shut down explicitly after the layer loop. + let (mut client, server) = + spawn_cstor_in_process(CstorLayerService).context("Failed to spawn CstorLayerService")?; + + let mut layer_refs = Vec::with_capacity(resolved.layers.len()); + for (store_path, storage_layer_id, diff_id) in &resolved.layers { let content_id = layer_identifier(diff_id); let id = ComponentId::from(diff_id.to_string()); @@ -236,12 +304,16 @@ fn import_from_containers_storage_direct( total: None, unit: ProgressUnit::Bytes, }); - let (layer_store, layer) = stores - .iter() - .find_map(|s| Layer::open(s, storage_layer_id).ok().map(|l| (s, l))) - .with_context(|| format!("Failed to open layer {}", storage_layer_id))?; - let (verity, layer_stats) = - import_layer_direct(repo, layer_store, &layer, diff_id, zerocopy, &mut ctx)?; + let (verity, layer_stats) = import_layer_via_transfer( + repo, + &mut client, + store_path, + storage_layer_id, + diff_id, + zerocopy, + &mut ctx, + ) + .await?; let bytes = layer_stats.new_bytes(); stats.merge(&layer_stats); reporter.report(ProgressEvent::Done { @@ -254,14 +326,138 @@ fn import_from_containers_storage_direct( layer_refs.push((diff_id.clone(), layer_verity)); } + // Drop the client so the server connection closes, then shut the server + // down. 
The server's `run()` never returns on its own (the in-process + // listener pends forever after the first connection), so `shutdown` sends an + // explicit stop signal and reaps the thread on a blocking task — the latter + // is important so we don't park the caller's reactor, which must stay live + // to deliver that very stop signal to the server thread. + drop(client); + server.shutdown().await; + reporter.report(ProgressEvent::Message("Layers imported".to_string())); - finalize_import(repo, &image, &layer_refs, reference, &reporter, stats) + + // finalize_import does blocking repo work; run it on spawn_blocking. + let repo2 = Arc::clone(repo); + let image = resolved.image; + let reference_owned = reference.map(|s| s.to_owned()); + let reporter2 = reporter.clone(); + tokio::task::spawn_blocking(move || { + finalize_import( + &repo2, + &image, + &layer_refs, + reference_owned.as_deref(), + &reporter2, + stats, + ) + }) + .await + .context("spawn_blocking(finalize_import) failed")? +} + +/// Import a single layer via the in-process `org.composefs.Oci` zlink service. +/// +/// Calls `get_layer(0, GetLayerParams{storage:Some(StorageLocator{...})})`, +/// collects all frames, then drains the `splitdirfdstream` pipe in a +/// `spawn_blocking` closure while the server-side producer fills the pipe +/// concurrently on its own thread. +async fn import_layer_via_transfer( + repo: &Arc>, + client: &mut zlink::unix::Connection, + storage_path: &str, + storage_layer_id: &str, + diff_id: &OciDigest, + zerocopy: bool, + ctx: &mut ImportContext, +) -> Result<(ObjectID, ImportStats)> { + // Call get_layer with the storage locator (handle=0; cstor service ignores it). 
+ let params = GetLayerParams { + diff_id: None, + storage: Some(StorageLocator { + storage_path: storage_path.to_owned(), + layer_id: storage_layer_id.to_owned(), + }), + }; + let stream = client + .get_layer(0, params) + .await + .with_context(|| format!("Oci.GetLayer RPC failed for {storage_layer_id}"))?; + + // Collect all frames. Each frame carries a batch of FDs; concatenate them + // in arrival order to reconstruct the full logical FD array: + // index 0 : pipe read end + // index 1..=dir_count : dirfds region (sparse — may include dummy slots) + // index dir_count+1.. : opaque lifetime FDs (keepalive + optional extras) + let mut fds: Vec = Vec::new(); + let mut reply_opt: Option = None; + { + use zlink::futures_util::StreamExt as _; + let mut stream = std::pin::pin!(stream); + while let Some(item) = stream.next().await { + let (result, frame_fds) = + item.with_context(|| format!("Oci.GetLayer stream error for {storage_layer_id}"))?; + let frame_reply = result.map_err(|e| anyhow::anyhow!("Oci.GetLayer error: {e:?}"))?; + reply_opt = Some(frame_reply); + fds.extend(frame_fds); + } + } + let reply = reply_opt.ok_or_else(|| anyhow::anyhow!("Oci.GetLayer yielded no frames"))?; + + // At minimum: 1 pipe fd + dir_count slots + 1 keepalive fd. + anyhow::ensure!( + fds.len() >= reply.dir_count as usize + 2, + "Oci.GetLayer: expected at least {} fds (1 pipe + {} dirfd slots + 1 keepalive), got {}", + 2 + reply.dir_count, + reply.dir_count, + fds.len() + ); + + // Split into: pipe_read | dir_fds[dir_count] | lifetime_fds... + let mut it = fds.into_iter(); + let pipe_read: OwnedFd = it.next().expect("checked above: fds.len() >= 1"); + let dir_fds: Vec = it.by_ref().take(reply.dir_count as usize).collect(); + // Opaque lifetime tokens — hold open until drain completes, then drop to + // signal the server's producer that we are done consuming the layer. 
+ let lifetime_fds: Vec = it.collect(); + + // Drain the pipe in a blocking task (producer runs concurrently on the + // server's spawn_blocking thread). ImportContext is threaded through. + let repo_clone = Arc::clone(repo); + let diff_id_owned = diff_id.clone(); + let ctx_moved = std::mem::take(ctx); + + let (verity, layer_stats, ctx_returned) = tokio::task::spawn_blocking(move || { + // Hold all lifetime FDs for the full duration of the drain. + let _lifetime_fds = lifetime_fds; + crate::layer_sync::drain_splitdirfdstream( + repo_clone, + pipe_read, + dir_fds, + &diff_id_owned, + zerocopy, + ctx_moved, + ) + }) + .await + .context("spawn_blocking(drain_splitdirfdstream) failed")??; + + *ctx = ctx_returned; + + Ok((verity, layer_stats)) } /// Proxied (rootless) implementation of containers-storage import. /// -/// This spawns a helper process via `podman unshare` that can read all files -/// in containers-storage, and communicates with it via Unix socket + fd passing. +/// Mirrors `import_from_containers_storage_direct` exactly, but acquires the +/// `org.composefs.Oci` zlink client from the userns helper subprocess instead +/// of an in-process server. Metadata resolution (layer IDs, diff_ids, config) +/// runs in `spawn_blocking` using the same `resolve_image_layers` path as the +/// direct path, so the two paths remain behaviorally identical. +/// +/// Note: explicit `storage_root` / `additional_image_stores` are not supported +/// here; the caller (`import_from_containers_storage`) already guards against +/// that and bails before reaching this function. async fn import_from_containers_storage_proxied( repo: &Arc>, image_id: &str, @@ -272,49 +468,27 @@ async fn import_from_containers_storage_proxied( let mut stats = ImportStats::default(); let mut ctx = ImportContext::default(); - // Spawn the proxy helper - let mut proxy = StorageProxy::spawn() - .await - .context("Failed to spawn userns helper")? 
- .context("Expected proxy but got None")?; - - // Discover storage paths (primary + additional from $STORAGE_OPTS) - let storage_paths = discover_storage_paths()?; - - // Search all storage paths for the image - let mut image_info = None; - let mut found_storage_path = String::new(); - for path in &storage_paths { - match proxy.get_image(path, image_id).await { - Ok(info) => { - found_storage_path = path.clone(); - image_info = Some(info); - break; - } - Err(_) => continue, - } - } - let image_info = - image_info.with_context(|| format!("Failed to find image {} in any storage", image_id))?; - let storage_path = found_storage_path; - - // Ensure layer count matches - anyhow::ensure!( - image_info.storage_layer_ids.len() == image_info.layer_diff_ids.len(), - "Layer count mismatch: {} layers in storage, {} diff_ids in config", - image_info.storage_layer_ids.len(), - image_info.layer_diff_ids.len() - ); + // Resolve image layers synchronously (filesystem + JSON work). + // Pass None / empty for storage_root / additional: the caller already + // rejected those for rootless mode, so this is always auto-discovery. + let image_id_owned = image_id.to_owned(); + let resolved = + tokio::task::spawn_blocking(move || resolve_image_layers(&image_id_owned, None, vec![])) + .await + .context("spawn_blocking(resolve_image_layers) failed")??; - stats.layers = image_info.storage_layer_ids.len() as u64; + stats.layers = resolved.layers.len() as u64; - let mut layer_refs = Vec::with_capacity(image_info.storage_layer_ids.len()); + // Spawn the userns helper; it serves the org.composefs.Oci zlink service + // (CstorLayerService) over a Unix socket. We drive it through + // `proxy.connection()` exactly as the direct path drives the in-process client. + let mut proxy = StorageProxy::spawn() + .await + .context("spawn userns helper")? 
+ .context("expected helper but got None (can_bypass_file_permissions returned true?)")?; - for (storage_layer_id, diff_id) in image_info - .storage_layer_ids - .iter() - .zip(image_info.layer_diff_ids.iter()) - { + let mut layer_refs = Vec::with_capacity(resolved.layers.len()); + for (store_path, storage_layer_id, diff_id) in &resolved.layers { let content_id = layer_identifier(diff_id); let id = ComponentId::from(diff_id.to_string()); @@ -328,10 +502,10 @@ async fn import_from_containers_storage_proxied( total: None, unit: ProgressUnit::Bytes, }); - let (verity, layer_stats) = import_layer_proxied( + let (verity, layer_stats) = import_layer_via_transfer( repo, - &mut proxy, - &storage_path, + proxy.connection(), + store_path, storage_layer_id, diff_id, zerocopy, @@ -350,30 +524,40 @@ async fn import_from_containers_storage_proxied( layer_refs.push((diff_id.clone(), layer_verity)); } - reporter.report(ProgressEvent::Message("Layers imported".to_string())); - - // Config and manifest metadata don't have restrictive file permissions, - // so we can read them directly without the proxy. - let stores = Storage::discover_all().context("Failed to discover containers-storage")?; - let (_, image) = stores - .iter() - .find_map(|s| Image::open(s, &image_info.id).ok().map(|img| (s, img))) - .with_context(|| format!("Failed to open image {}", image_info.id))?; + // Shut down the helper before finalize_import: finalize reads config and + // manifest JSON directly (no restrictive permissions), so the helper is + // no longer needed. + proxy.shutdown().await.context("shutdown userns helper")?; - // Shutdown the proxy before the blocking finalization - proxy.shutdown().await.context("Failed to shutdown proxy")?; + reporter.report(ProgressEvent::Message("Layers imported".to_string())); - finalize_import(repo, &image, &layer_refs, reference, &reporter, stats) + // finalize_import does blocking repo work; run it on spawn_blocking, + // mirroring _direct exactly. 
+ let repo2 = Arc::clone(repo); + let image = resolved.image; + let reference_owned = reference.map(|s| s.to_owned()); + let reporter2 = reporter.clone(); + tokio::task::spawn_blocking(move || { + finalize_import( + &repo2, + &image, + &layer_refs, + reference_owned.as_deref(), + &reporter2, + stats, + ) + }) + .await + .context("spawn_blocking(finalize_import) failed")? } /// Create config + manifest splitstreams, generate the EROFS image, and tag. /// /// This is the shared finalization step for both direct and proxied import /// paths. By this point all layers are already imported; this function: -/// 1. Creates the config splitstream with layer references -/// 2. Creates the manifest splitstream -/// 3. Generates the composefs EROFS image and links it to the config -/// 4. Tags the manifest if a reference was provided +/// 1. Reads config JSON and manifest JSON from the containers-storage `Image` +/// 2. Delegates to [`crate::layer_sync::finalize_oci_image`] for the repo work +/// 3. Re-attaches the `ImportStats` for the caller fn finalize_import( repo: &Arc>, image: &Image, @@ -388,325 +572,32 @@ fn finalize_import( let config_json = image .read_metadata(&encoded_key) .context("Failed to read config bytes")?; - let config_digest = crate::sha256_content_digest(&config_json); - let content_id = config_identifier(&config_digest); - - let config_verity = if let Some(existing) = repo.has_stream(&content_id)? { - reporter.report(ProgressEvent::Message(format!( - "Already have config {config_digest}" - ))); - existing - } else { - reporter.report(ProgressEvent::Message(format!( - "Creating config splitstream {config_digest}" - ))); - let mut writer = repo.create_stream(OCI_CONFIG_CONTENT_TYPE)?; - - for (diff_id, verity) in layer_refs { - let key: &str = diff_id.as_ref(); - writer.add_named_stream_ref(key, verity); - } - writer.write_external(&config_json)?; - repo.write_stream(writer, &content_id, None)? 
- }; + reporter.report(ProgressEvent::Message(format!( + "Read config ({} bytes)", + config_json.len() + ))); - // Create the manifest splitstream (matching the skopeo path) + // Read the raw manifest JSON bytes let manifest_json = image .read_manifest_raw() .context("Failed to read manifest bytes")?; - let manifest_digest = crate::sha256_content_digest(&manifest_json); - - let manifest_content_id = manifest_identifier(&manifest_digest); - let manifest_verity = if let Some(existing) = repo.has_stream(&manifest_content_id)? { - reporter.report(ProgressEvent::Message(format!( - "Already have manifest {manifest_digest}" - ))); - existing - } else { - reporter.report(ProgressEvent::Message(format!( - "Creating manifest splitstream {manifest_digest}" - ))); - let mut writer = repo.create_stream(OCI_MANIFEST_CONTENT_TYPE)?; - - let config_ref_key = format!("config:{config_digest}"); - writer.add_named_stream_ref(&config_ref_key, &config_verity); - - for (diff_id, verity) in layer_refs { - let key: &str = diff_id.as_ref(); - writer.add_named_stream_ref(key, verity); - } - writer.write_external(&manifest_json)?; - repo.write_stream(writer, &manifest_content_id, None)? - }; + reporter.report(ProgressEvent::Message(format!( + "Read manifest ({} bytes)", + manifest_json.len() + ))); - // Generate the composefs EROFS image and tag the manifest. - // Skip if the image already has an EROFS ref (idempotent re-import). 
- let existing_erofs = crate::composefs_erofs_for_manifest( + let result = crate::layer_sync::finalize_oci_image( repo, - &manifest_digest, - Some(&manifest_verity), - repo.erofs_version(), - )?; - if existing_erofs.is_none() { - let erofs = crate::ensure_oci_composefs_erofs( - repo, - &manifest_digest, - Some(&manifest_verity), - reference, - )?; - if erofs.is_none() { - // Not a container image (unlikely for cstor, but handle consistently) - if let Some(name) = reference { - crate::oci_image::tag_image(repo, &manifest_digest, name)?; - } - } - } else if let Some(name) = reference { - crate::oci_image::tag_image(repo, &manifest_digest, name)?; - } - - // Re-read verities: ensure_oci_composefs_erofs rewrites config and - // manifest splitstreams (adding the EROFS ref), so the verities captured - // above may be stale. - let config_verity = repo - .has_stream(&content_id)? - .context("config splitstream missing after finalization")?; - let manifest_verity = repo - .has_stream(&manifest_content_id)? - .context("manifest splitstream missing after finalization")?; - - Ok(( - ( - (manifest_digest, manifest_verity), - (config_digest, config_verity), - ), - stats, - )) -} - -/// Import a single layer directly (privileged mode). 
-fn import_layer_direct( - repo: &Arc>, - storage: &Storage, - layer: &Layer, - diff_id: &OciDigest, - zerocopy: bool, - ctx: &mut ImportContext, -) -> Result<(ObjectID, ImportStats)> { - let mut stats = ImportStats::default(); - let mut inline_buf = Vec::new(); - - let mut stream = TarSplitFdStream::new(storage, layer) - .with_context(|| format!("Failed to create tar-split stream for layer {}", layer.id()))?; - - let mut writer = repo.create_stream(TAR_LAYER_CONTENT_TYPE)?; - let content_id = layer_identifier(diff_id); - - // Track padding from previous file - tar-split bundles padding with the NEXT - // file's header in Segment entries, but we need to write padding immediately - // after file content (like tar.rs does) for consistent splitstream output. - let mut prev_file_padding: usize = 0; - - while let Some(item) = stream.next()? { - match item { - TarSplitItem::Segment(bytes) => { - // Skip the leading padding bytes (we already wrote them after prev file) - let header_bytes = &bytes[prev_file_padding..]; - stats.bytes_inlined += header_bytes.len() as u64; - writer.write_inline(header_bytes); - prev_file_padding = 0; - } - TarSplitItem::FileContent { fd, size, name } => { - process_file_content( - repo, - &mut writer, - &mut stats, - ctx, - fd, - size, - &name, - zerocopy, - &mut inline_buf, - )?; - - // Write padding inline immediately after file content - let padding_size = (size as usize).next_multiple_of(512) - size as usize; - if padding_size > 0 { - stats.bytes_inlined += padding_size as u64; - writer.write_inline(&ZERO_PADDING[..padding_size]); - } - prev_file_padding = padding_size; - } - } - } - - // Write the stream with the content identifier - let verity = repo.write_stream(writer, &content_id, None)?; - Ok((verity, stats)) -} - -/// Import a single layer via the proxy (rootless mode). 
-async fn import_layer_proxied( - repo: &Arc>, - proxy: &mut StorageProxy, - storage_path: &str, - layer_id: &str, - diff_id: &OciDigest, - zerocopy: bool, - ctx: &mut ImportContext, -) -> Result<(ObjectID, ImportStats)> { - let mut stats = ImportStats::default(); - let mut inline_buf = Vec::new(); - - let mut writer = repo.create_stream(TAR_LAYER_CONTENT_TYPE)?; - let content_id = layer_identifier(diff_id); - - // Track padding from previous file - tar-split bundles padding with the NEXT - // file's header in Segment entries, but we need to write padding immediately - // after file content (like tar.rs does) for consistent splitstream output. - let mut prev_file_padding: usize = 0; - - // Stream the layer via the proxy - let mut stream = proxy - .stream_layer(storage_path, layer_id) - .await - .with_context(|| format!("Failed to start streaming layer {}", layer_id))?; - - while let Some(item) = stream - .next() - .await - .with_context(|| format!("Failed to receive stream item for layer {}", layer_id))? 
- { - match item { - ProxiedTarSplitItem::Segment(bytes) => { - // Skip the leading padding bytes (we already wrote them after prev file) - let header_bytes = &bytes[prev_file_padding..]; - stats.bytes_inlined += header_bytes.len() as u64; - writer.write_inline(header_bytes); - prev_file_padding = 0; - } - ProxiedTarSplitItem::FileContent { fd, size, name } => { - process_file_content( - repo, - &mut writer, - &mut stats, - ctx, - fd, - size, - &name, - zerocopy, - &mut inline_buf, - )?; - - // Write padding inline immediately after file content - let padding_size = (size as usize).next_multiple_of(512) - size as usize; - if padding_size > 0 { - stats.bytes_inlined += padding_size as u64; - writer.write_inline(&ZERO_PADDING[..padding_size]); - } - prev_file_padding = padding_size; - } - } - } - - // Write the stream with the content identifier - let verity = repo.write_stream(writer, &content_id, None)?; - Ok((verity, stats)) -} - -/// Process file content (shared between direct and proxied modes). 
-#[allow(clippy::too_many_arguments)] -fn process_file_content( - repo: &Arc>, - writer: &mut composefs::splitstream::SplitStreamWriter, - stats: &mut ImportStats, - ctx: &mut ImportContext, - fd: OwnedFd, - size: u64, - name: &str, - zerocopy: bool, - inline_buf: &mut Vec, -) -> Result<()> { - // Convert fd to File for operations - let file = std::fs::File::from(fd); - - if size as usize > INLINE_CONTENT_MAX_V0 { - // Large file: store as external object - let (object_id, method) = if zerocopy { - repo.ensure_object_from_file_zerocopy(&file, size, ctx) - } else { - repo.ensure_object_from_file(&file, size, ctx) - } - .with_context(|| format!("Failed to store object for {}", name))?; - - match method { - ObjectStoreMethod::Reflinked => { - stats.objects_reflinked += 1; - stats.bytes_reflinked += size; - } - ObjectStoreMethod::Hardlinked => { - stats.objects_hardlinked += 1; - stats.bytes_hardlinked += size; - } - ObjectStoreMethod::Copied => { - stats.objects_copied += 1; - stats.bytes_copied += size; - } - ObjectStoreMethod::AlreadyPresent => { - stats.objects_already_present += 1; - } - } - - writer.add_external_size(size); - writer.write_reference(object_id)?; - } else { - // Small file: read and embed inline (reuse buffer across calls) - inline_buf.resize(size as usize, 0); - file.read_exact_at(inline_buf, 0)?; - stats.bytes_inlined += size; - writer.write_inline(inline_buf); - } - - Ok(()) -} - -/// Discover storage paths: the primary store plus any additional image stores -/// from `$STORAGE_OPTS`. 
-fn discover_storage_paths() -> Result> { - let mut paths = Vec::new(); - - // Try user storage first (rootless podman) - if let Ok(home) = std::env::var("HOME") { - let user_path = format!("{}/.local/share/containers/storage", home); - if std::path::Path::new(&user_path).exists() { - paths.push(user_path); - } - } - - // Fall back to system storage - let system_path = "/var/lib/containers/storage"; - if std::path::Path::new(system_path).exists() { - paths.push(system_path.to_string()); - } - - // Also check $STORAGE_OPTS for additional image stores - if let Ok(opts) = std::env::var("STORAGE_OPTS") { - for item in opts.split(',') { - let item = item.trim(); - if let Some(path) = item.strip_prefix("additionalimagestore=") - && std::path::Path::new(path).exists() - { - paths.push(path.to_string()); - } - } - } - - anyhow::ensure!( - !paths.is_empty(), - "Could not find containers-storage at standard locations" - ); - Ok(paths) + &manifest_json, + &config_json, + layer_refs, + reference, + ) + .context("finalize_oci_image")?; + + Ok((result, stats)) } /// Check if an image reference uses the containers-storage transport. diff --git a/crates/composefs-oci/src/layer_sync.rs b/crates/composefs-oci/src/layer_sync.rs new file mode 100644 index 00000000..09cc8e9e --- /dev/null +++ b/crates/composefs-oci/src/layer_sync.rs @@ -0,0 +1,1206 @@ +//! Shared, content-agnostic splitdirfdstream producer and consumer for layer +//! synchronisation. +//! +//! This module holds the blocking logic for producing a `splitdirfdstream` from a +//! stored layer and for draining one back into a repository. It depends only on +//! the (tiny) `composefs-splitdirfdstream` crate, not the containers-storage / +//! podman stack, so it is always compiled — including in builds that do not +//! enable the `containers-storage` feature. +//! +//! Callers: +//! * `crates/composefs-oci/src/cstor.rs` — the containers-storage import path +//! (only built with the `containers-storage` feature). +//! 
* `composefs-ctl` varlink server — repo-to-repo layer synchronisation via +//! `GetLayer`/`PutLayer`, and the `oci copy` command. + +use std::os::unix::fs::FileExt; +use std::os::unix::io::OwnedFd; +use std::sync::Arc; + +use anyhow::{Context, Result}; +use cap_std_ext::cap_std; +use composefs_splitdirfdstream::{Chunk, SplitdirfdstreamReader, SplitdirfdstreamWriter}; +use rustix::fs::{MemfdFlags, fstat, memfd_create}; +use sha2::{Digest as _, Sha256}; + +use composefs::{ + INLINE_CONTENT_MAX_V0, + fsverity::FsVerityHashValue, + repository::{ImportContext, ObjectStoreMethod, Repository}, + splitstream::{SplitStreamData, SplitStreamWriter}, +}; + +use crate::skopeo::TAR_LAYER_CONTENT_TYPE; +use crate::{ImportStats, OciDigest, layer_identifier, sha256_output_to_digest}; + +/// Errors returned by the diff-id-verifying drain path. +/// +/// Separates the integrity-check failure (wrong content) from I/O or +/// repository errors so that callers can surface a clear diagnostic. +#[derive(Debug, thiserror::Error)] +pub enum VerifiedDrainError { + /// The reconstructed layer content does not match the declared `diff_id`. + #[error("layer content does not match declared diff_id: expected {expected}, got {actual}")] + DiffIdMismatch { + /// The diff_id that was declared by the sender. + expected: String, + /// The sha256 of the data that was actually received. + actual: String, + }, + /// Any other I/O or repository error. + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +/// Verify that `fd` is a directory before using it as a base for `openat`. +/// +/// If it is not a directory, returns a detailed error showing the fd index, +/// file type bits, and size — making it easy to distinguish real diff-dirs +/// from dummy `/dev/null` or `memfd` fds that ended up at the wrong slot. 
+fn assert_is_dir(fd: &impl rustix::fd::AsFd, slot: u32, name: &str) -> anyhow::Result<()> {
+    use rustix::fs::FileType;
+    // Classify the fd from the file-type bits of its mode.
+    let st = fstat(fd).with_context(|| format!("fstat overlay_dir[{slot}]"))?;
+    let ft = FileType::from_raw_mode(st.st_mode);
+    if ft != FileType::Directory {
+        anyhow::bail!(
+            "overlay_dir[{slot}] is not a directory (file type: {ft:?}, \
+             mode={:#o}, size={}) — cannot open {name:?}; \
+             this is likely a bug where a dummy fd (e.g. /dev/null or memfd) \
+             was stored at a slot that should hold a real diff-dir fd",
+            st.st_mode,
+            st.st_size
+        );
+    }
+    Ok(())
+}
+
+/// Drain the `splitdirfdstream` pipe and write the resulting layer splitstream.
+///
+/// This is the blocking half of a layer-import operation — it should be run on a
+/// `spawn_blocking` thread so the pipe can fill concurrently with the producer.
+///
+/// # Arguments
+/// * `repo` — Repository to import into.
+/// * `pipe_read` — Read end of the splitdirfdstream pipe.
+/// * `dir_fds` — Sparse dirfds region passed wholesale by the transport layer.
+///   `dir_fds[dirfd_index]` resolves the real diff-directory fd for each
+///   [`Chunk::FileBackedData`] chunk. Dummy fds at gap slots are never opened.
+/// * `diff_id` — OCI diff-id of the layer (used as the stream content identifier).
+/// * `zerocopy` — If `true`, use reflink/zerocopy when storing large objects.
+/// * `ctx` — Import context (passed back to the caller on success).
+///
+/// # Returns
+/// A tuple of `(verity_hash, import_stats, import_context)` on success.
+pub fn drain_splitdirfdstream(
+    repo: Arc>,
+    pipe_read: OwnedFd,
+    dir_fds: Vec,
+    diff_id: &OciDigest,
+    zerocopy: bool,
+    mut ctx: ImportContext,
+) -> Result<(ObjectID, ImportStats, ImportContext)> {
+    // Wrap each fd as a cap_std Dir. Dummy fds (at sparse gap slots) are never
+    // opened by the logic below; only the slot named by dirfd_index is accessed.
+    let overlay_dirs: Vec =
+        dir_fds.into_iter().map(cap_std::fs::Dir::from).collect();
+
+    let mut writer = repo.create_stream(TAR_LAYER_CONTENT_TYPE)?;
+    let content_id = layer_identifier(diff_id);
+    let mut reader = SplitdirfdstreamReader::new(std::fs::File::from(pipe_read));
+    let mut inline_buf = Vec::new();
+    let mut stats = ImportStats::default();
+
+    // No hasher: this entry point does not verify the content against diff_id.
+    drain_splitdirfdstream_inner(
+        &repo,
+        &mut writer,
+        &mut reader,
+        &overlay_dirs,
+        zerocopy,
+        &mut stats,
+        &mut ctx,
+        &mut inline_buf,
+        None,
+    )?;
+
+    let verity = repo.write_stream(writer, &content_id, None)?;
+    Ok((verity, stats, ctx))
+}
+
+/// Chunk loop shared by [`drain_splitdirfdstream`] and
+/// [`drain_splitdirfdstream_verified`].
+///
+/// Reads chunks from `reader` until `next_chunk` yields `None` and appends
+/// each one to `writer`:
+/// * `Chunk::Metadata` — written inline as-is.
+/// * `Chunk::InlineData` — written inline when [`should_inline`] accepts its
+///   length; otherwise copied into a memfd and handed to
+///   [`process_file_content`].
+/// * `Chunk::FileBackedData` — opened by name relative to
+///   `overlay_dirs[dirfd_index]`, required to be a regular file, then handed
+///   to [`process_file_content`].
+///
+/// When `hasher` is `Some`, the bytes of every chunk are also fed into it; for
+/// file-backed chunks this additionally requires the declared `length` to
+/// equal the file's size.
+#[allow(clippy::too_many_arguments)]
+fn drain_splitdirfdstream_inner(
+    repo: &Arc>,
+    writer: &mut SplitStreamWriter,
+    reader: &mut SplitdirfdstreamReader,
+    overlay_dirs: &[cap_std::fs::Dir],
+    zerocopy: bool,
+    stats: &mut ImportStats,
+    ctx: &mut ImportContext,
+    inline_buf: &mut Vec,
+    mut hasher: Option<&mut Sha256>,
+) -> Result<()> {
+    while let Some(chunk) = reader.next_chunk().context("splitdirfdstream read error")? {
+        match chunk {
+            Chunk::Metadata(data) => {
+                if let Some(ref mut h) = hasher {
+                    h.update(data);
+                }
+                stats.bytes_inlined += data.len() as u64;
+                writer.write_inline(data);
+            }
+            Chunk::InlineData(data) => {
+                // Non-world-readable file transported inline by the producer.
+                // We already have the bytes — no fd needed for the small case.
+                if let Some(ref mut h) = hasher {
+                    h.update(data);
+                }
+                let length = data.len() as u64;
+                if should_inline(length) {
+                    stats.bytes_inlined += length;
+                    writer.write_inline(data);
+                } else {
+                    // Large file: store as an external object. A memfd is the
+                    // lightest way to hand an fd to process_file_content.
+                    process_file_content(
+                        repo,
+                        writer,
+                        stats,
+                        ctx,
+                        file_content_to_memfd(data)?,
+                        length,
+                        "",
+                        zerocopy,
+                        inline_buf,
+                    )?;
+                }
+            }
+            Chunk::FileBackedData {
+                dirfd_index,
+                length,
+                filename,
+            } => {
+                let name = std::str::from_utf8(filename).with_context(|| {
+                    format!("non-utf8 filename in splitdirfdstream: {filename:?}")
+                })?;
+                let dir = overlay_dirs.get(dirfd_index as usize).with_context(|| {
+                    format!(
+                        "dirfd_index {dirfd_index} out of range (dir_fds.len={})",
+                        overlay_dirs.len()
+                    )
+                })?;
+                assert_is_dir(dir, dirfd_index, name)?;
+                let fd = dir
+                    .open(name)
+                    .map(OwnedFd::from)
+                    .with_context(|| format!("open {name:?} in overlay dir[{dirfd_index}]"))?;
+
+                use rustix::fs::FileType;
+                let st = fstat(&fd).with_context(|| format!("fstat {name:?}"))?;
+                let ft = FileType::from_raw_mode(st.st_mode);
+                if ft != FileType::RegularFile {
+                    anyhow::bail!(
+                        "object {name:?} in overlay dir[{dirfd_index}] is not a regular file (file type: {ft:?}, mode={:#o})",
+                        st.st_mode
+                    );
+                }
+
+                let fd_to_process = if let Some(ref mut h) = hasher {
+                    // Defend against a producer that declares a `length` that
+                    // disagrees with the actual object size. The reflink store
+                    // path already rejects a mismatch, but the copy fallback would
+                    // store the whole file while we only hashed `length` bytes,
+                    // so the committed object could differ from the verified
+                    // bytes. Pin the two together by requiring length == size.
+                    let obj_file = std::fs::File::from(fd);
+                    let actual_size = st.st_size as u64;
+                    if actual_size != length {
+                        anyhow::bail!(
+                            "object {name}: declared length {length} != actual size {actual_size}"
+                        );
+                    }
+
+                    // Hash the object file content using positioned reads. This
+                    // does not disturb the fd cursor, so process_file_content can
+                    // independently read/reflink the same fd afterwards.
+                    hash_fd_contents(&obj_file, length, h)
+                        .with_context(|| format!("hashing object {name}"))?;
+
+                    obj_file.into()
+                } else {
+                    fd
+                };
+
+                process_file_content(
+                    repo,
+                    writer,
+                    stats,
+                    ctx,
+                    fd_to_process,
+                    length,
+                    name,
+                    zerocopy,
+                    inline_buf,
+                )?;
+                // NOTE: padding after the file content is already emitted by the
+                // producer as a following Inline chunk; do NOT add extra padding here.
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Materialise `data` into an anonymous `memfd` for use with
+/// [`process_file_content`].
+///
+/// The memfd is created `CLOEXEC` and the data written at offset 0. The whole
+/// slice is written with `write_all`, because a single `write(2)` may transfer
+/// fewer bytes than requested and a short write would silently truncate the
+/// stored object.
+/// `process_file_content` uses positioned reads (`pread`/`read_at`) so the
+/// write cursor position does not matter.
+fn file_content_to_memfd(data: &[u8]) -> Result {
+    let memfd = memfd_create(c"composefs-filecontent", MemfdFlags::CLOEXEC)
+        .context("memfd_create for FileContent chunk")?;
+    // Go through `File` to get `write_all`, which loops over short writes and
+    // retries on EINTR; the fd is converted back without being closed.
+    let mut file = std::fs::File::from(memfd);
+    std::io::Write::write_all(&mut file, data).context("writing FileContent to memfd")?;
+    Ok(file.into())
+}
+
+/// Hash `len` bytes of `fd` starting at offset 0 into `hasher`, using
+/// positioned reads so the fd cursor is not disturbed.
+///
+/// Uses [`FileExt::read_at`] in a loop with a 64 KiB stack buffer to
+/// avoid allocating for large objects.
+fn hash_fd_contents(fd: &std::fs::File, len: u64, hasher: &mut Sha256) -> Result<()> {
+    const BUF_SIZE: usize = 65536;
+    let mut buf = [0u8; BUF_SIZE];
+    let mut remaining = len;
+    let mut offset = 0u64;
+
+    while remaining > 0 {
+        // Never request more than is left, so exactly `len` bytes are hashed.
+        let to_read = remaining.min(BUF_SIZE as u64) as usize;
+        let n = match fd.read_at(&mut buf[..to_read], offset) {
+            Ok(n) => n,
+            // An interrupted read transferred nothing; retry at the same offset
+            // instead of failing the whole import.
+            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
+            Err(e) => {
+                return Err(anyhow::Error::new(e).context("read_at while hashing fd contents"));
+            }
+        };
+        if n == 0 {
+            anyhow::bail!(
+                "unexpected EOF at offset {offset} hashing fd (expected {len} bytes total)"
+            );
+        }
+        // Short reads are fine: hash what arrived and advance by that amount.
+        hasher.update(&buf[..n]);
+        offset += n as u64;
+        remaining -= n as u64;
+    }
+    Ok(())
+}
+
+/// Like [`drain_splitdirfdstream`], but additionally verifies that the
+/// reconstructed (uncompressed tar) content hashes to `diff_id`, and only
+/// commits the layer splitstream to the repository if it matches.
+///
+/// On mismatch the partially-written stream is discarded and
+/// [`VerifiedDrainError::DiffIdMismatch`] is returned.
+///
+/// # Integrity model
+///
+/// The diff_id is the sha256 of the uncompressed tar bytes — the same byte
+/// stream that `cat` on the splitstream produces. As chunks are processed,
+/// every logical byte is fed into a running SHA-256 hasher:
+/// * **Inline** chunks (`Chunk::Metadata` and `Chunk::InlineData`): hash the
+///   raw bytes verbatim.
+/// * **External** chunks (`Chunk::FileBackedData`): hash the full object file
+///   (all `length` bytes) via positioned reads, then pass the same fd to
+///   [`process_file_content`].
+///   Using positioned reads (`read_at`) means the fd cursor is not consumed,
+///   so `process_file_content` can read the file independently.
+///
+/// # Note on partial objects
+///
+/// If verification fails, large external objects that were already written to
+/// the `objects/` directory of the destination repo are left in place. They
+/// are orphaned (not referenced by any committed splitstream) and will be
+/// reclaimed by the next GC run. We do NOT attempt to clean them up here
+/// because doing so correctly (without racing with concurrent imports) would
+/// be complex and the GC already handles this case.
+pub fn drain_splitdirfdstream_verified(
+    repo: Arc>,
+    pipe_read: OwnedFd,
+    dir_fds: Vec,
+    diff_id: &OciDigest,
+    zerocopy: bool,
+    mut ctx: ImportContext,
+) -> Result<(ObjectID, ImportStats, ImportContext), VerifiedDrainError> {
+    // We hash with SHA-256, which is the only algorithm OCI diff-ids use in
+    // practice (and the only one the rest of this crate assumes). Reject other
+    // algorithms up front with a clear error rather than producing a confusing
+    // "mismatch" between a sha256 hash and e.g. a sha512 diff-id string.
+    let algorithm = diff_id.algorithm().as_ref();
+    if algorithm != "sha256" {
+        return Err(VerifiedDrainError::Other(anyhow::anyhow!(
+            "unsupported diff_id algorithm {algorithm:?}: only sha256 is supported"
+        )));
+    }
+
+    // Wrap each fd as a cap_std Dir. Dummy fds at sparse gap slots are never
+    // opened by the logic below; only the slot named by dirfd_index is accessed.
+    let overlay_dirs: Vec =
+        dir_fds.into_iter().map(cap_std::fs::Dir::from).collect();
+
+    let mut writer = repo
+        .create_stream(TAR_LAYER_CONTENT_TYPE)
+        .context("create_stream")?;
+    let content_id = layer_identifier(diff_id);
+    let mut reader = SplitdirfdstreamReader::new(std::fs::File::from(pipe_read));
+    let mut inline_buf = Vec::new();
+    let mut stats = ImportStats::default();
+    let mut hasher = Sha256::new();
+
+    // Errors from the drain loop are `anyhow` errors; `?` wraps them in
+    // `VerifiedDrainError::Other` through its `#[from]` conversion.
+    drain_splitdirfdstream_inner(
+        &repo,
+        &mut writer,
+        &mut reader,
+        &overlay_dirs,
+        zerocopy,
+        &mut stats,
+        &mut ctx,
+        &mut inline_buf,
+        Some(&mut hasher),
+    )?;
+
+    // Verify the accumulated hash against the declared diff_id.
+    let actual_digest = sha256_output_to_digest(hasher.finalize());
+    let actual_str = actual_digest.to_string();
+    let expected_str = diff_id.to_string();
+    if actual_str != expected_str {
+        // Drop `writer` without calling write_stream: the stream is not
+        // committed. Large objects already written to objects/ are orphaned
+        // but GC will reclaim them.
+        return Err(VerifiedDrainError::DiffIdMismatch {
+            expected: expected_str,
+            actual: actual_str,
+        });
+    }
+
+    let verity = repo
+        .write_stream(writer, &content_id, None)
+        .context("write_stream")?;
+    Ok((verity, stats, ctx))
+}
+
+/// Decide whether a file of the given `size` should be stored inline in the
+/// splitstream rather than as a separate object in the object store.
+///
+/// Files with `size <= INLINE_CONTENT_MAX_V0` are embedded directly in the
+/// splitstream; larger files become external objects. A `size` that does not
+/// fit in `usize` is never inlined.
+pub(crate) fn should_inline(size: u64) -> bool {
+    // Checked conversion: a plain `size as usize` truncates on 32-bit targets,
+    // which could make a multi-GiB size look small enough to inline.
+    usize::try_from(size).is_ok_and(|size| size <= INLINE_CONTENT_MAX_V0)
+}
+
+/// Store the content of one file fd into the splitstream, choosing inline vs
+/// external storage based on the file size.
+///
+/// For files at or below [`composefs::INLINE_CONTENT_MAX_V0`] bytes the
+/// content is read into `inline_buf` and embedded directly in `writer`.
+/// Larger files are stored as external objects in `repo` and referenced
+/// by hash.
+#[allow(clippy::too_many_arguments)]
+pub fn process_file_content(
+    repo: &Arc>,
+    writer: &mut SplitStreamWriter,
+    stats: &mut ImportStats,
+    ctx: &mut ImportContext,
+    fd: OwnedFd,
+    size: u64,
+    name: &str,
+    zerocopy: bool,
+    inline_buf: &mut Vec,
+) -> Result<()> {
+    // Take ownership of the fd as a `File`; it is closed when `file` drops.
+    let file = std::fs::File::from(fd);
+
+    if should_inline(size) {
+        // Small file: read and embed inline (reuse buffer across calls).
+        // The positioned read leaves the fd cursor untouched.
+        inline_buf.resize(size as usize, 0);
+        file.read_exact_at(inline_buf, 0)
+            .with_context(|| format!("Failed to read inline content for {}", name))?;
+        stats.bytes_inlined += size;
+        writer.write_inline(inline_buf);
+        return Ok(());
+    }
+
+    // Large file: store as external object
+    let (object_id, method) = if zerocopy {
+        repo.ensure_object_from_file_zerocopy(&file, size, ctx)
+    } else {
+        repo.ensure_object_from_file(&file, size, ctx)
+    }
+    .with_context(|| format!("Failed to store object for {}", name))?;
+
+    // Account for how the object ended up in the store. An object that was
+    // already present contributes to the count only, not to any byte total.
+    match method {
+        ObjectStoreMethod::Reflinked => {
+            stats.objects_reflinked += 1;
+            stats.bytes_reflinked += size;
+        }
+        ObjectStoreMethod::Hardlinked => {
+            stats.objects_hardlinked += 1;
+            stats.bytes_hardlinked += size;
+        }
+        ObjectStoreMethod::Copied => {
+            stats.objects_copied += 1;
+            stats.bytes_copied += size;
+        }
+        ObjectStoreMethod::AlreadyPresent => {
+            stats.objects_already_present += 1;
+        }
+    }
+
+    // Record the external size first, then the reference to the stored object.
+    writer.add_external_size(size);
+    writer.write_reference(object_id)?;
+
+    Ok(())
+}
+
+/// Return the on-disk size of the object identified by `id` in `repo`.
+///
+/// Opens the object file and `fstat`s it to obtain the size without reading
+/// the full content. This is used by [`produce_layer_splitdirfdstream`] to
+/// populate the `length` field of external chunks in the output stream.
+fn object_size(
+    repo: &Repository,
+    id: &ObjectID,
+) -> Result {
+    // Only the fd's metadata is needed; the object content is never read.
+    let object_fd = repo
+        .open_object(id)
+        .context("Opening object for size query")?;
+    let size = fstat(&object_fd).context("fstat on object fd")?.st_size;
+    Ok(size as u64)
+}
+
+/// Produce a `splitdirfdstream` for the layer splitstream identified by
+/// `layer_verity`, writing it to `out`.
+///
+/// External objects are emitted as references of the form `"xx/yyyy"` beneath
+/// the repository's `objects/` directory. The consumer must supply the objects
+/// directory at `dirfd_index` within the sparse dirfds region (obtained from
+/// [`composefs_splitdirfdstream::build_layer_fd_layout`]'s `real_indices[0]`).
+///
+/// This is the read/serve side that mirrors [`drain_splitdirfdstream`]: given
+/// the same repo, a stream produced here and then passed through
+/// [`composefs_splitdirfdstream::reconstruct`] with the repo's objects dir will
+/// yield byte-identical output to calling
+/// [`composefs::splitstream::SplitStreamReader::cat`] on the same layer.
+///
+/// # Arguments
+/// * `repo` — Repository that holds the layer and its objects.
+/// * `layer_verity` — fs-verity hash of the layer splitstream to serve.
+/// * `objects_dirfd_index` — Slot index within the sparse dirfds region where
+///   the repository objects directory fd is placed (from `real_indices[0]`).
+/// * `out` — Destination writer for the produced `splitdirfdstream`.
+pub fn produce_layer_splitdirfdstream(
+    repo: &Repository,
+    layer_verity: &ObjectID,
+    objects_dirfd_index: u32,
+    out: W,
+) -> Result<()> {
+    // Open the layer by verity hash, with an empty name and the tar-layer
+    // content type as the expected type.
+    let mut reader = repo
+        .open_stream("", Some(layer_verity), Some(TAR_LAYER_CONTENT_TYPE))
+        .context("Opening layer splitstream")?;
+    let mut writer = SplitdirfdstreamWriter::new(out);
+
+    // Translate each splitstream chunk into its splitdirfdstream counterpart.
+    reader
+        .for_each_chunk(|chunk| {
+            match chunk {
+                // Inline splitstream bytes are forwarded as a metadata chunk.
+                SplitStreamData::Inline(data) => {
+                    writer.write_metadata(&data).map_err(anyhow::Error::from)?;
+                }
+                // External objects are not copied into the stream: only the
+                // dirfd slot, the object's size and its pathname are emitted.
+                SplitStreamData::External(id) => {
+                    // This opens and fstats the object once per external chunk.
+                    let size = object_size(repo, &id)?;
+                    let pathname = id.to_object_pathname();
+                    writer
+                        .write_file_backed_data(objects_dirfd_index, size, pathname.as_bytes())
+                        .map_err(anyhow::Error::from)?;
+                }
+            }
+            Ok(())
+        })
+        .context("Walking layer splitstream chunks")?;
+
+    // NOTE(review): presumably `finish` terminates/flushes the stream — confirm
+    // against the composefs-splitdirfdstream writer.
+    writer.finish().map_err(anyhow::Error::from)?;
+    Ok(())
+}
+
+/// A type alias for the (manifest, config) digest-and-verity pair returned by
+/// [`finalize_oci_image`].
+///
+/// The first element is `(manifest_digest, manifest_verity)`;
+/// the second is `(config_digest, config_verity)`.
+pub type FinalizeResult = (
+    crate::ContentAndVerity,
+    crate::ContentAndVerity,
+);
+
+/// Finalize an OCI image whose layers are already imported into `repo`.
+///
+/// Given the raw manifest and config JSON (exact bytes, so their sha256
+/// digests are preserved) and the ordered (diff_id, layer_verity) pairs,
+/// this writes the config and manifest splitstreams, generates the composefs
+/// EROFS image, and optionally tags the manifest under `name`. Idempotent.
+///
+/// Returns `((manifest_digest, manifest_verity), (config_digest, config_verity))`.
+pub fn finalize_oci_image(
+    repo: &Arc>,
+    manifest_json: &[u8],
+    config_json: &[u8],
+    layer_refs: &[(OciDigest, ObjectID)],
+    name: Option<&str>,
+) -> anyhow::Result> {
+    use crate::oci_image::manifest_identifier;
+    use crate::skopeo::{OCI_CONFIG_CONTENT_TYPE, OCI_MANIFEST_CONTENT_TYPE};
+    use crate::{config_identifier, sha256_content_digest};
+
+    // The config stream is keyed by the sha256 of the exact config bytes.
+    let config_digest = sha256_content_digest(config_json);
+    let content_id = config_identifier(&config_digest);
+
+    // Reuse an existing config splitstream if there is one; otherwise write it
+    // with one named reference per layer, keyed by the layer's diff_id.
+    let config_verity = if let Some(existing) = repo.has_stream(&content_id)? {
+        existing
+    } else {
+        let mut writer = repo.create_stream(OCI_CONFIG_CONTENT_TYPE)?;
+
+        for (diff_id, verity) in layer_refs {
+            let key: &str = diff_id.as_ref();
+            writer.add_named_stream_ref(key, verity);
+        }
+
+        writer.write_external(config_json)?;
+        repo.write_stream(writer, &content_id, None)?
+    };
+
+    let manifest_digest = sha256_content_digest(manifest_json);
+
+    // Same reuse-or-create pattern for the manifest. It references the config
+    // stream under a "config:<digest>" key in addition to every layer.
+    let manifest_content_id = manifest_identifier(&manifest_digest);
+    let manifest_verity = if let Some(existing) = repo.has_stream(&manifest_content_id)? {
+        existing
+    } else {
+        let mut writer = repo.create_stream(OCI_MANIFEST_CONTENT_TYPE)?;
+
+        let config_ref_key = format!("config:{config_digest}");
+        writer.add_named_stream_ref(&config_ref_key, &config_verity);
+
+        for (diff_id, verity) in layer_refs {
+            let key: &str = diff_id.as_ref();
+            writer.add_named_stream_ref(key, verity);
+        }
+
+        writer.write_external(manifest_json)?;
+        repo.write_stream(writer, &manifest_content_id, None)?
+    };
+
+    // Generate the composefs EROFS image and tag the manifest.
+    // Skip if the image already has an EROFS ref (idempotent re-finalize).
+    let existing_erofs = crate::composefs_erofs_for_manifest(
+        repo,
+        &manifest_digest,
+        Some(&manifest_verity),
+        repo.erofs_version(),
+    )?;
+    if existing_erofs.is_none() {
+        // NOTE(review): `name` is passed through here and no explicit tag_image
+        // call follows when an image is produced — presumably
+        // ensure_oci_composefs_erofs applies the tag itself in that case; confirm.
+        let erofs = crate::ensure_oci_composefs_erofs(
+            repo,
+            &manifest_digest,
+            Some(&manifest_verity),
+            name,
+        )?;
+        if erofs.is_none() {
+            // Not a container image (e.g. an artifact) — tag directly.
+            if let Some(n) = name {
+                crate::oci_image::tag_image(repo, &manifest_digest, n)?;
+            }
+        }
+    } else if let Some(n) = name {
+        // EROFS image already present: only the tag needs to be applied.
+        crate::oci_image::tag_image(repo, &manifest_digest, n)?;
+    }
+
+    // Re-read verities: ensure_oci_composefs_erofs rewrites config and
+    // manifest splitstreams (adding the EROFS ref), so the verities captured
+    // above may be stale.
+    let config_verity = repo
+        .has_stream(&content_id)?
+        .context("config splitstream missing after finalization")?;
+    let manifest_verity = repo
+        .has_stream(&manifest_content_id)?
+        .context("manifest splitstream missing after finalization")?;
+
+    Ok((
+        (manifest_digest, manifest_verity),
+        (config_digest, config_verity),
+    ))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::io::Write as _;
+
+    use composefs::fsverity::Sha256HashValue;
+    use composefs::repository::RepositoryConfig;
+    use composefs_splitdirfdstream::reconstruct;
+
+    /// The pure inline-vs-external predicate, tested exhaustively around the
+    /// boundary. This is the exact expression branched on inside
+    /// [`process_file_content`], so locking it down here guards the decision
+    /// independently of the (heavier) end-to-end test below.
+ #[test] + fn test_should_inline_boundary() { + assert!(should_inline(0), "size 0 should be inlined"); + assert!(should_inline(1), "size 1 should be inlined"); + assert!( + should_inline(INLINE_CONTENT_MAX_V0 as u64), + "size == INLINE_CONTENT_MAX_V0 ({INLINE_CONTENT_MAX_V0}) should be inlined" + ); + assert!( + !should_inline(INLINE_CONTENT_MAX_V0 as u64 + 1), + "size INLINE_CONTENT_MAX_V0+1 ({}) should NOT be inlined", + INLINE_CONTENT_MAX_V0 + 1 + ); + for size in [128u64, 4096, 65536, 1024 * 1024] { + assert!( + !should_inline(size), + "size {size} should NOT be inlined (well above threshold)" + ); + } + } + + /// Create an insecure (no fs-verity) repository in a fresh tempdir. + /// + /// Returns the repo alongside the `TempDir`, which the caller must keep + /// alive for the duration of the test. + fn create_test_repo() -> (Arc>, tempfile::TempDir) { + let tempdir = tempfile::TempDir::new().unwrap(); + let (repo, _) = Repository::init_path( + rustix::fs::CWD, + &tempdir.path().join("repo"), + RepositoryConfig::default().set_insecure(), + ) + .unwrap(); + (Arc::new(repo), tempdir) + } + + /// A file fd backed by `len` bytes of deterministic content. + fn tmpfile_of(len: usize) -> OwnedFd { + let mut f = tempfile::tempfile().unwrap(); + let data: Vec = (0..len).map(|i| (i % 251) as u8).collect(); + f.write_all(&data).unwrap(); + f.into() + } + + /// Data-driven end-to-end check of `process_file_content`'s storage + /// decision: small files land inline (no object written, `bytes_inlined` + /// grows), large files become external objects (`objects_*` grows, + /// `bytes_inlined` unchanged). 
+ #[test] + fn test_process_file_content_inline_vs_external() { + let (repo, _tempdir) = create_test_repo(); + + let cases = [ + (0usize, false), + (1, false), + (INLINE_CONTENT_MAX_V0, false), + (INLINE_CONTENT_MAX_V0 + 1, true), + (4096, true), + (256 * 1024, true), + ]; + + for (size, expect_external) in cases { + let mut writer = repo.create_stream(TAR_LAYER_CONTENT_TYPE).unwrap(); + let mut stats = ImportStats::default(); + let mut ctx = ImportContext::default(); + let mut inline_buf = Vec::new(); + + let before_inlined = stats.bytes_inlined; + process_file_content( + &repo, + &mut writer, + &mut stats, + &mut ctx, + tmpfile_of(size), + size as u64, + "test-file", + false, + &mut inline_buf, + ) + .unwrap(); + + let objects_written = stats.objects_reflinked + + stats.objects_hardlinked + + stats.objects_copied + + stats.objects_already_present; + + if expect_external { + assert_eq!( + stats.bytes_inlined, before_inlined, + "size {size}: external file must not change bytes_inlined" + ); + assert_eq!( + objects_written, 1, + "size {size}: exactly one external object expected" + ); + } else { + assert_eq!( + stats.bytes_inlined, + before_inlined + size as u64, + "size {size}: inline file must add its bytes to bytes_inlined" + ); + assert_eq!( + objects_written, 0, + "size {size}: inline file must not write an object" + ); + } + } + } + + // ------------------------------------------------------------------------- + // Helpers for the produce->reconstruct==cat round-trip tests + // ------------------------------------------------------------------------- + + /// Build a tar layer where each file has the specified size (bytes). + /// + /// File names are derived from the size so each case is identifiable + /// in assertion output. Content is deterministic: repeating bytes + /// `i % 251`. 
+ fn build_tar_layer(file_sizes: &[usize]) -> Vec { + let mut builder = ::tar::Builder::new(vec![]); + for &size in file_sizes { + let content: Vec = (0..size).map(|i| (i % 251) as u8).collect(); + let mut header = ::tar::Header::new_ustar(); + header.set_uid(0); + header.set_gid(0); + header.set_mode(0o644); + header.set_entry_type(::tar::EntryType::Regular); + header.set_size(size as u64); + builder + .append_data( + &mut header, + format!("file_{size}_{:08x}", size), + &content[..], + ) + .unwrap(); + } + builder.into_inner().unwrap() + } + + /// Assert that produce -> reconstruct yields the same bytes as `cat`. + /// + /// This is the determinism contract: given a layer already imported into + /// `repo` with verity hash `verity`, the two paths must be identical. + async fn assert_produce_eq_cat( + repo: &Arc>, + verity: &Sha256HashValue, + ) { + use std::os::fd::AsFd as _; + // --- expected: what cat() produces --- + let mut expected = Vec::::new(); + let mut reader = repo + .open_stream("", Some(verity), Some(TAR_LAYER_CONTENT_TYPE)) + .expect("open_stream for cat"); + reader + .cat(repo, &mut expected) + .expect("cat on layer splitstream"); + + // --- actual: produce -> reconstruct --- + // For the single-repo test path the objects dir is always at index 0 + // (one real dir, seed-determined slot may vary, but for this helper we + // just use dirfd_index=0 directly to keep the test simple). 
+ let mut stream_buf = Vec::::new(); + produce_layer_splitdirfdstream(repo, verity, 0, &mut stream_buf) + .expect("produce_layer_splitdirfdstream"); + + let objects_dir_fd = repo.objects_dir().expect("objects_dir"); + let dirfds = [objects_dir_fd.as_fd()]; + let mut actual = Vec::::new(); + reconstruct(stream_buf.as_slice(), &dirfds, &mut actual) + .expect("reconstruct splitdirfdstream"); + + similar_asserts::assert_eq!( + actual, + expected, + "produce->reconstruct must equal cat for verity={verity:?}" + ); + } + + /// Data-driven round-trip test: produce -> reconstruct == cat for each + /// layer shape. + /// + /// The test cases cover: + /// - all-inline (only files <= INLINE_CONTENT_MAX_V0) + /// - only-external (one file larger than INLINE_CONTENT_MAX_V0) + /// - mixed (various sizes including crossing the threshold) + #[tokio::test] + async fn test_produce_reconstruct_eq_cat() { + let (repo, _tempdir) = create_test_repo(); + + // (label, file sizes in bytes) + let cases: &[(&str, &[usize])] = &[ + // All inline: empty layer (no files) + ("empty", &[]), + // All inline: only small files (all <= INLINE_CONTENT_MAX_V0 = 64) + ("all_inline", &[0, 1, 10, 64]), + // Single external object (> 64 bytes) + ("single_external", &[65]), + // Larger external objects + ("large_external", &[4096, 200_000]), + // Mixed: small + large + various sizes + ("mixed", &[0, 10, 64, 65, 4096, 200_000]), + ]; + + for (label, sizes) in cases { + let tar_bytes = build_tar_layer(sizes); + let diff_id = crate::sha256_content_digest(&tar_bytes); + let (verity, _stats) = crate::import_layer(&repo, &diff_id, None, tar_bytes.as_slice()) + .await + .unwrap_or_else(|e| panic!("import_layer failed for {label}: {e}")); + + // Run the determinism check for this layer shape. 
+ assert_produce_eq_cat(&repo, &verity).await; + } + } + + // ------------------------------------------------------------------------- + // Tests for drain_splitdirfdstream_verified + // ------------------------------------------------------------------------- + + /// Produce a splitdirfdstream from `repo_a` for `verity` into a pipe, + /// returning the pipe read end, repo_a's objects dir fd, and a join handle + /// for the producer task. + /// + /// The producer runs on a `spawn_blocking` task so the pipe can fill + /// concurrently with the consumer. The caller MUST await the returned + /// handle (after draining the pipe, to avoid a full-pipe deadlock) and + /// assert the producer succeeded — the task is tracked, not detached, so + /// producer errors are surfaced rather than swallowed. + fn produce_to_pipe( + repo_a: Arc>, + verity: Sha256HashValue, + ) -> ( + OwnedFd, + Vec, + tokio::task::JoinHandle>, + ) { + use std::os::fd::AsFd as _; + + let (pipe_read, pipe_write) = + rustix::pipe::pipe_with(rustix::pipe::PipeFlags::CLOEXEC).expect("pipe"); + + let objects_dir = repo_a.objects_dir().expect("objects_dir"); + let objects_owned = rustix::io::dup(objects_dir.as_fd()).expect("dup objects_dir"); + + let handle = tokio::task::spawn_blocking(move || { + let wf = std::fs::File::from(pipe_write); + // dirfd_index=0 for single-repo tests (objects dir at slot 0). + produce_layer_splitdirfdstream(&repo_a, &verity, 0, wf) + }); + + (pipe_read, vec![objects_owned], handle) + } + + /// Await a producer handle from [`produce_to_pipe`] and assert success. + async fn join_producer(handle: tokio::task::JoinHandle>) { + handle + .await + .expect("producer task panicked") + .expect("producer must succeed"); + } + + /// Await a producer handle without asserting its outcome. + /// + /// Used when the consumer rejects the stream early (before reading it): + /// the producer then sees a broken pipe and errors, which is expected. 
We + /// still join it so no task is left untracked. + async fn drain_producer(handle: tokio::task::JoinHandle>) { + let _ = handle.await.expect("producer task panicked"); + } + + /// Positive test: produce → verified drain with the CORRECT diff_id. + /// + /// repo_a imports a layer (with a large external object), then produces + /// it as a splitdirfdstream into repo_b via `drain_splitdirfdstream_verified`. + /// Asserts: + /// - the call succeeds, + /// - repo_b now has the layer stream committed. + #[tokio::test] + async fn test_verified_drain_correct_diff_id() { + let (repo_a, _td_a) = create_test_repo(); + let (repo_b, _td_b) = create_test_repo(); + + // Build a layer with both inline and external content. + let tar_bytes = build_tar_layer(&[10, 128 * 1024]); // 10 B inline + 128 KiB external + let diff_id = crate::sha256_content_digest(&tar_bytes); + + let (verity_a, _) = crate::import_layer(&repo_a, &diff_id, None, tar_bytes.as_slice()) + .await + .expect("import_layer into repo_a"); + + let (pipe_read, dir_fds, producer) = produce_to_pipe(repo_a, verity_a.clone()); + + let repo_b_clone = repo_b.clone(); + let diff_id_clone = diff_id.clone(); + let result = tokio::task::spawn_blocking(move || { + drain_splitdirfdstream_verified( + repo_b_clone, + pipe_read, + dir_fds, + &diff_id_clone, + false, + composefs::repository::ImportContext::default(), + ) + }) + .await + .expect("spawn_blocking"); + + let (verity_b, _stats, _ctx) = result.expect("verified drain must succeed"); + + // Producer must have completed cleanly (drain succeeded, so it did). + join_producer(producer).await; + + // The layer must now be committed in repo_b. + let content_id = crate::layer_content_id(&diff_id); + assert!( + repo_b + .has_stream(&content_id) + .expect("has_stream") + .is_some(), + "repo_b must have the layer stream after verified drain" + ); + + // Both repos resolved to the same verity hash. 
+ assert_eq!( + verity_a, verity_b, + "verity hash must be identical across repos" + ); + } + + /// Negative test: produce → verified drain with a WRONG diff_id. + /// + /// The drain must return `DiffIdMismatch` and repo_b must NOT have the + /// stream committed. + #[tokio::test] + async fn test_verified_drain_wrong_diff_id() { + let (repo_a, _td_a) = create_test_repo(); + let (repo_b, _td_b) = create_test_repo(); + + let tar_bytes = build_tar_layer(&[10, 128 * 1024]); + let correct_diff_id = crate::sha256_content_digest(&tar_bytes); + + let (verity_a, _) = + crate::import_layer(&repo_a, &correct_diff_id, None, tar_bytes.as_slice()) + .await + .expect("import_layer into repo_a"); + + // Construct a diff_id that is a valid sha256 digest but definitely wrong. + let wrong_diff_id: crate::OciDigest = + "sha256:0000000000000000000000000000000000000000000000000000000000000000" + .parse() + .unwrap(); + + let (pipe_read, dir_fds, producer) = produce_to_pipe(repo_a, verity_a); + + let repo_b_clone = repo_b.clone(); + let wrong_diff_id_clone = wrong_diff_id.clone(); + let result = tokio::task::spawn_blocking(move || { + drain_splitdirfdstream_verified( + repo_b_clone, + pipe_read, + dir_fds, + &wrong_diff_id_clone, + false, + composefs::repository::ImportContext::default(), + ) + }) + .await + .expect("spawn_blocking"); + + // The drain hashes the whole stream before rejecting, so it consumes + // the entire pipe and the producer completes cleanly. + join_producer(producer).await; + + // Must fail with DiffIdMismatch. + match result { + Err(VerifiedDrainError::DiffIdMismatch { expected, actual }) => { + assert_eq!( + expected, + wrong_diff_id.to_string(), + "expected field must be the wrong diff_id" + ); + assert_eq!( + actual, + correct_diff_id.to_string(), + "actual field must be the real content hash" + ); + } + other => panic!("expected DiffIdMismatch, got {other:?}"), + } + + // The stream must NOT be committed in repo_b. 
+ let wrong_content_id = crate::layer_content_id(&wrong_diff_id); + assert!( + repo_b + .has_stream(&wrong_content_id) + .expect("has_stream") + .is_none(), + "repo_b must NOT have a committed stream for the wrong diff_id" + ); + } + + /// A non-sha256 diff_id must be rejected up front with a clear error + /// rather than producing a confusing hash "mismatch". + #[tokio::test] + async fn test_verified_drain_rejects_non_sha256() { + let (repo_a, _td_a) = create_test_repo(); + let (repo_b, _td_b) = create_test_repo(); + + let tar_bytes = build_tar_layer(&[10, 128 * 1024]); + let diff_id = crate::sha256_content_digest(&tar_bytes); + let (verity_a, _) = crate::import_layer(&repo_a, &diff_id, None, tar_bytes.as_slice()) + .await + .expect("import_layer into repo_a"); + + // A valid sha512 digest (64 bytes = 128 hex characters) — well-formed but unsupported. + let sha512_diff_id: crate::OciDigest = format!("sha512:{}", "0".repeat(128)) + .parse() + .expect("valid sha512 digest"); + + let (pipe_read, dir_fds, producer) = produce_to_pipe(repo_a, verity_a); + + let repo_b_clone = repo_b.clone(); + let result = tokio::task::spawn_blocking(move || { + drain_splitdirfdstream_verified( + repo_b_clone, + pipe_read, + dir_fds, + &sha512_diff_id, + false, + composefs::repository::ImportContext::default(), + ) + }) + .await + .expect("spawn_blocking"); + + // The drain rejects before reading the pipe, so the producer may error + // with a broken pipe; join it regardless so it isn't left untracked. 
+ drain_producer(producer).await; + + match result { + Err(VerifiedDrainError::Other(e)) => { + let msg = format!("{e:#}"); + assert!( + msg.contains("sha256") && msg.contains("sha512"), + "error should explain the algorithm restriction, got: {msg}" + ); + } + other => panic!("expected Other(unsupported algorithm) error, got {other:?}"), + } + } + + // ------------------------------------------------------------------------- + // Tests for finalize_oci_image + // ------------------------------------------------------------------------- + + /// End-to-end test for `finalize_oci_image`: + /// + /// 1. Import 2 synthetic tar layers with valid OCI structure. + /// 2. Build matching config + manifest JSON. + /// 3. Call `finalize_oci_image`. + /// 4. Assert config/manifest splitstreams exist and EROFS was produced. + #[tokio::test] + async fn test_finalize_oci_image() { + let (repo, _tempdir) = create_test_repo(); + + // Import two layers: one all-inline (10 B), one external (128 KiB). + let tar1 = crate::test_util::build_oci_tar_layer(10); + let tar2 = crate::test_util::build_oci_tar_layer(128 * 1024); + + let diff_id1 = crate::sha256_content_digest(&tar1); + let diff_id2 = crate::sha256_content_digest(&tar2); + + let (verity1, _) = crate::import_layer(&repo, &diff_id1, None, tar1.as_slice()) + .await + .expect("import layer 1"); + let (verity2, _) = crate::import_layer(&repo, &diff_id2, None, tar2.as_slice()) + .await + .expect("import layer 2"); + + let diff_ids = vec![diff_id1.to_string(), diff_id2.to_string()]; + let config_json = crate::test_util::make_config_json(&diff_ids); + let config_digest = crate::sha256_content_digest(&config_json); + let manifest_json = + crate::test_util::make_manifest_json(&config_json, config_digest.as_ref(), &diff_ids); + + let layer_refs = vec![(diff_id1.clone(), verity1), (diff_id2.clone(), verity2)]; + + let ((manifest_digest, manifest_verity), (out_config_digest, config_verity)) = + finalize_oci_image( + &repo, + &manifest_json, + 
&config_json, + &layer_refs, + Some("test:v1"), + ) + .expect("finalize_oci_image"); + + // Digests must be non-empty. + assert!(!manifest_digest.to_string().is_empty()); + assert!(!out_config_digest.to_string().is_empty()); + + // Splitstreams must exist. + use crate::oci_image::manifest_identifier; + let manifest_id = manifest_identifier(&manifest_digest); + let config_id = crate::config_identifier(&out_config_digest); + + assert!( + repo.has_stream(&manifest_id) + .expect("has_stream manifest") + .is_some(), + "manifest splitstream must exist" + ); + assert!( + repo.has_stream(&config_id) + .expect("has_stream config") + .is_some(), + "config splitstream must exist" + ); + + // Verify the returned verities match what's stored. + let stored_manifest_verity = repo + .has_stream(&manifest_id) + .unwrap() + .expect("manifest verity must be stored"); + assert_eq!( + manifest_verity, stored_manifest_verity, + "returned manifest_verity must match stored" + ); + let stored_config_verity = repo + .has_stream(&config_id) + .unwrap() + .expect("config verity must be stored"); + assert_eq!( + config_verity, stored_config_verity, + "returned config_verity must match stored" + ); + + // EROFS must have been generated for this container image. + let erofs = crate::composefs_erofs_for_manifest( + &repo, + &manifest_digest, + Some(&manifest_verity), + repo.erofs_version(), + ) + .expect("composefs_erofs_for_manifest"); + assert!( + erofs.is_some(), + "EROFS image must exist after finalize_oci_image for a container image" + ); + + // Idempotency: calling again must succeed and return the same digests. 
+ let ((md2, _mv2), (cd2, _cv2)) = finalize_oci_image( + &repo, + &manifest_json, + &config_json, + &layer_refs, + Some("test:v1"), + ) + .expect("finalize_oci_image idempotent"); + assert_eq!(manifest_digest, md2, "idempotent call: manifest_digest"); + assert_eq!(out_config_digest, cd2, "idempotent call: config_digest"); + } +} diff --git a/crates/composefs-oci/src/layer_transport.rs b/crates/composefs-oci/src/layer_transport.rs new file mode 100644 index 00000000..51f744ea --- /dev/null +++ b/crates/composefs-oci/src/layer_transport.rs @@ -0,0 +1,218 @@ +//! Abstraction layer for the OCI `GetLayer` producer side. +//! +//! Defines the [`LayerSource`] trait that decouples the fd-assembly and +//! framing mechanics from the specific backing store (composefs repo today, +//! containers-storage in a later step). Both the in-process `copy` path and +//! the varlink `GetLayer` handler use this trait. +//! +//! The shared entry point is [`serve_get_layer`], which: +//! 1. Calls [`LayerSource::open`] to get real dirfds and a lifetime guard. +//! 2. Calls [`composefs_splitdirfdstream::build_layer_fd_layout`] to place them +//! in the sparse layout, add dummies, keepalive pipe and extra lifetime fds. +//! 3. Spawns the 3-phase self-reaping producer via +//! [`composefs_splitdirfdstream::spawn_self_reaping_producer`]. +//! 4. Splits the fd array into transport frames via +//! [`composefs_splitdirfdstream::split_fds_into_frames`]. +//! 5. Returns the ready frame batches + `dir_count` for the caller to wrap in +//! zlink `Reply` frames (interface-specific error types stay in the callers). +//! +//! # Dependency note +//! +//! This module lives in `composefs-oci` rather than `composefs-splitdirfdstream` +//! because the trait references `anyhow::Result` and repository types, while +//! `composefs-splitdirfdstream` must stay dependency-free of those crates. +//! The cstor service (`composefs-storage`) CANNOT call `serve_get_layer` +//! 
(it would create a dep cycle), so it calls `build_layer_fd_layout` + +//! `spawn_self_reaping_producer` directly. + +use std::io::Write; +use std::os::fd::{AsFd as _, OwnedFd}; +use std::sync::Arc; + +use anyhow::{Context as _, Result}; + +use composefs::fsverity::FsVerityHashValue; +use composefs::repository::Repository; +use composefs_splitdirfdstream::{ + FdLimitError, build_layer_fd_layout, spawn_self_reaping_producer, split_fds_into_frames, +}; + +use crate::layer_sync::produce_layer_splitdirfdstream; + +// ── LayerSource trait ───────────────────────────────────────────────────────── + +/// A producer that can open the real diff-directory file descriptors for one +/// layer and stream the layer content as a `splitdirfdstream`. +/// +/// Implementors supply two things: +/// 1. The set of real diff-directory fds (in chain order) **plus an opaque +/// lifetime guard** that must be held open until the consumer signals +/// completion (keepalive-pipe EOF). For the repo case the guard is `()`; +/// for containers-storage the guard is the shared `LayerStoreLock`. +/// 2. A function that, given the slot-index assignments from the sparse layout, +/// produces the `splitdirfdstream` bytes into a writer. +/// +/// The trait is object-safe so it can be stored in `Box<dyn LayerSource>`. +pub trait LayerSource: Send + 'static { + /// Open the real diff-directory file descriptors for this layer, in chain order, + /// AND acquire any lifetime-bound resources (e.g. a shared layer-store lock). + /// + /// Returns `(dirfds, guard)` where: + /// - `dirfds` — the real diff-dir fds, one per chain layer, in chain order. + /// - `guard` — an opaque object that **must be held open** until the + /// consumer finishes processing (keepalive-pipe EOF). Dropping the guard + /// signals that the layer's storage may be released. For the repo source + /// this is `Box::new(())` (no-op). 
+ /// + /// The lock is acquired BEFORE the fds are opened so that the lock → + /// open sequence is atomic (avoids a TOCTOU race where a concurrent + /// `podman rmi` could unlink a diff directory between lock and open). + /// + /// Called once per `GetLayer` request, before the sparse layout is built. + fn open(&self) -> Result<(Vec, Box)>; + + /// Write the layer content as a `splitdirfdstream` to `out`. + /// + /// `dirfd_index_map[i]` is the slot index within the sparse dirfds region + /// where the `i`-th real diff-dir fd was placed. The producer must use + /// `write_file_backed_data(dirfd_index_map[i], ...)` for every file that + /// belongs to chain layer `i`. + /// + /// This method is called from a `spawn_blocking` context so it MAY block. + fn produce(&self, dirfd_index_map: &[u32], out: Box) -> Result<()>; +} + +// ── RepoLayerSource ─────────────────────────────────────────────────────────── + +/// [`LayerSource`] implementation backed by a composefs [`Repository`]. +/// +/// The objects directory is the single real diff-dir (chain index 0). +pub struct RepoLayerSource { + /// The repository from which to serve the layer. + pub repo: Arc>, + /// fs-verity hash of the layer splitstream to serve. + pub layer_verity: ObjectID, +} + +impl std::fmt::Debug for RepoLayerSource +where + ObjectID: FsVerityHashValue + std::fmt::Debug, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("RepoLayerSource") + .field("layer_verity", &self.layer_verity) + .finish_non_exhaustive() + } +} + +impl LayerSource for RepoLayerSource +where + ObjectID: FsVerityHashValue, +{ + fn open(&self) -> Result<(Vec, Box)> { + let objects_dir = self + .repo + .objects_dir() + .context("opening repository objects dir")?; + let dup = rustix::io::dup(objects_dir.as_fd()) + .map_err(std::io::Error::from) + .context("dup objects_dir fd")?; + // No external lock needed for the repo source; the guard is a no-op. 
+ Ok((vec![dup], Box::new(()))) + } + + fn produce(&self, dirfd_index_map: &[u32], out: Box) -> Result<()> { + // The objects dir is at chain layer index 0 → dirfd_index_map[0]. + let objects_dirfd_index = dirfd_index_map + .first() + .copied() + .context("dirfd_index_map is empty: expected at least one real dir")?; + produce_layer_splitdirfdstream(&self.repo, &self.layer_verity, objects_dirfd_index, out) + } +} + +// ── serve_get_layer ─────────────────────────────────────────────────────────── + +/// Ready frame batches returned by [`serve_get_layer`]. +/// +/// The caller wraps these in interface-specific zlink `Reply` frames and +/// yields them from the streaming `GetLayer` method. +#[derive(Debug)] +pub struct GetLayerFrames { + /// Number of dirfd slots in the sparse layout (`fds[1..=dir_count]`). + pub dir_count: u32, + /// FD batches in frame order; one inner `Vec` per transport frame. + pub batches: Vec>, +} + +/// Error from [`serve_get_layer`]. +#[derive(Debug)] +pub enum ServeGetLayerError { + /// `more=false` but total fd count exceeds `MAX_FDS_PER_FRAME`. + FdLimitExceeded(FdLimitError), + /// Any other I/O or source error. + Other(anyhow::Error), +} + +impl From for ServeGetLayerError { + fn from(e: anyhow::Error) -> Self { + ServeGetLayerError::Other(e) + } +} + +/// Drive the full `GetLayer` flow for a [`LayerSource`] and return ready frame +/// batches. +/// +/// This is the single canonical implementation of the fd-layout + producer + +/// framing logic for the repo `GetLayer` path. The cstor `GetLayer` path in +/// `composefs-storage` cannot call this function (dep cycle), so it calls +/// `build_layer_fd_layout` and `spawn_self_reaping_producer` directly. +/// Both paths share the same primitives from `composefs-splitdirfdstream`. +/// +/// # Arguments +/// * `source` — The layer source. `source.open()` is called once; the returned +/// guard is moved into the self-reaping producer task. 
+/// * `seed` — Deterministic seed for sparse layout and frame count. +/// * `more` — Whether the client requested multi-frame streaming (`more=true`) +/// or a single-frame reply (`more=false`). +/// +/// # Returns +/// `Ok(GetLayerFrames)` — ready to wrap in zlink replies. +/// `Err(ServeGetLayerError::FdLimitExceeded)` — total fd count exceeds +/// `MAX_FDS_PER_FRAME`; retry with `more=true`. +/// `Err(ServeGetLayerError::Other)` — source or I/O error. +pub fn serve_get_layer( + source: impl LayerSource, + seed: u64, + more: bool, +) -> std::result::Result { + // Open real dirfds + acquire lifetime guard. + let (real_fds, guard) = source.open()?; + + // Create the data pipe. + let (pipe_read, pipe_write) = rustix::pipe::pipe_with(rustix::pipe::PipeFlags::CLOEXEC) + .map_err(|e| anyhow::anyhow!("pipe: {e}"))?; + + // Build the sparse fd layout (dirfds region + keepalive + extra lifetime fds). + let layout = build_layer_fd_layout(pipe_read, real_fds, seed) + .map_err(|e| anyhow::anyhow!("build_layer_fd_layout: {e}"))?; + + let dir_count = layout.dir_count; + let real_indices = layout.real_indices.clone(); + let keepalive_read = layout.keepalive_read; + + // Split into transport frames (enforces more=false single-frame cap). + let batches = split_fds_into_frames(layout.fds_all, seed, more) + .map_err(|(_fds, e)| ServeGetLayerError::FdLimitExceeded(e))?; + + // Spawn the 3-phase self-reaping producer. + // `source` is moved into the closure; `guard` is passed as a separate argument. + // `guard` drops last (Phase 3), releasing any held lock. 
+ spawn_self_reaping_producer(pipe_write, keepalive_read, guard, move |wf| { + if let Err(e) = source.produce(&real_indices, Box::new(wf)) { + tracing::warn!("GetLayer producer error: {e:#}"); + } + }); + + Ok(GetLayerFrames { dir_count, batches }) +} diff --git a/crates/composefs-oci/src/lib.rs b/crates/composefs-oci/src/lib.rs index 0aefd575..f2de39a7 100644 --- a/crates/composefs-oci/src/lib.rs +++ b/crates/composefs-oci/src/lib.rs @@ -22,12 +22,19 @@ pub mod cstor; pub(crate) mod delta; pub mod image; pub mod layer; +pub mod layer_sync; +pub mod layer_transport; pub mod oci_image; pub mod oci_layout; /// Re-exported from [`composefs::progress`]; use that path directly in new code. pub mod progress; pub mod skopeo; pub mod tar; +/// Shared wire types and client proxy for the `org.composefs.Oci` interface. +/// +/// Available when the `varlink` feature is enabled. +#[cfg(feature = "varlink")] +pub mod varlink_types; /// Test utilities for building OCI images from dumpfile strings. #[cfg(any(test, feature = "test"))] @@ -52,7 +59,7 @@ pub use containers_image_proxy::oci_spec::image::Digest as OciDigest; use containers_image_proxy::ImageProxyConfig; use containers_image_proxy::oci_spec::image::ImageConfiguration; -use containers_image_proxy::oci_spec::image::{Descriptor, MediaType}; +use containers_image_proxy::oci_spec::image::{Descriptor, ImageManifest, MediaType}; use sha2::{Digest, Sha256}; use composefs::{ @@ -64,6 +71,14 @@ use composefs::{ use crate::skopeo::{OCI_CONFIG_CONTENT_TYPE, TAR_LAYER_CONTENT_TYPE}; +/// The content-type tag used to identify OCI layer (tar) splitstreams in the +/// repository. +/// +/// Pass this to [`composefs::repository::Repository::open_stream`] when +/// reading back a stored layer splitstream by its fs-verity hash. The value +/// is the 8-byte ASCII string `"ocilayer"` encoded as a little-endian `u64`. 
+pub const LAYER_CONTENT_TYPE: u64 = TAR_LAYER_CONTENT_TYPE; + /// Named ref key for the V2 EROFS image derived from this OCI config. pub const IMAGE_REF_KEY: &str = "composefs.image"; @@ -343,6 +358,17 @@ pub(crate) fn layer_identifier(diff_id: &OciDigest) -> String { format!("oci-layer-{diff_id}") } +/// Return the content-identifier string used to register a layer splitstream. +/// +/// This is the key passed to [`composefs::repository::Repository::has_stream`] +/// and [`composefs::repository::Repository::write_stream`] for a given OCI +/// diff-id. Callers outside this crate (e.g. the varlink layer-sync service) +/// use this to look up whether a layer has been imported and to obtain its +/// fs-verity hash. +pub fn layer_content_id(diff_id: &OciDigest) -> String { + layer_identifier(diff_id) +} + pub(crate) fn config_identifier(config: &OciDigest) -> String { format!("oci-config-{config}") } @@ -445,7 +471,11 @@ pub(crate) fn sha256_output_to_digest(output: sha2::digest::Output) -> O /// Compute the SHA-256 content digest of `bytes`, returning an OCI digest /// (e.g. `sha256:abcd...`). -pub(crate) fn sha256_content_digest(bytes: &[u8]) -> OciDigest { +/// +/// This is primarily used to derive an OCI diff-id from a tar layer's raw +/// bytes (before any compression). It is also used by tests that construct +/// synthetic layers without going through the full OCI pull path. +pub fn sha256_content_digest(bytes: &[u8]) -> OciDigest { let mut context = Sha256::new(); context.update(bytes); sha256_output_to_digest(context.finalize()) @@ -455,6 +485,25 @@ fn hash_sha256(bytes: &[u8]) -> OciDigest { sha256_content_digest(bytes) } +/// Extract ordered layer identifiers from raw manifest and config JSON. +/// +/// For standard container images (`ImageConfig` media type), parses the +/// config and returns `rootfs.diff_ids`. For OCI artifacts with +/// non-standard config types, falls back to the manifest's layer digests. 
+/// +/// This is the high-level entry point for callers that have raw JSON +/// strings (e.g. from a varlink `Inspect` reply). +pub fn extract_layer_ids(manifest_json: &str, config_json: &str) -> Result> { + let manifest: ImageManifest = + serde_json::from_str(manifest_json).context("parsing manifest JSON")?; + extract_diff_ids( + manifest.config().media_type(), + config_json.as_bytes(), + manifest.layers(), + ) + .map(|ids| ids.iter().map(|d| d.to_string()).collect()) +} + /// Extract ordered diff_ids from a config descriptor. /// /// For standard container images (ImageConfig media type), parses the @@ -464,7 +513,7 @@ fn hash_sha256(bytes: &[u8]) -> OciDigest { /// Note: oci-spec models diff_ids as `Vec<String>` but they are actually /// OCI content digests. We parse them here so the rest of the codebase /// can work with the strongly-typed `Digest`. -pub(crate) fn extract_diff_ids( +pub fn extract_diff_ids( media_type: &MediaType, config_reader: impl Read, manifest_layers: &[Descriptor], @@ -754,7 +803,7 @@ pub fn write_config_raw( /// and are collected by the next GC. /// /// Returns the EROFS image's ObjectID (fs-verity digest). 
-fn ensure_oci_composefs_erofs( +pub(crate) fn ensure_oci_composefs_erofs( repo: &Arc>, manifest_digest: &OciDigest, manifest_verity: Option<&ObjectID>, diff --git a/crates/composefs-oci/src/test_util.rs b/crates/composefs-oci/src/test_util.rs index b7558a7a..73afaaef 100644 --- a/crates/composefs-oci/src/test_util.rs +++ b/crates/composefs-oci/src/test_util.rs @@ -194,6 +194,146 @@ pub fn dumpfile_to_tar(dumpfile: &str) -> Vec { builder.into_inner().unwrap() } +// ============================================================================ +// Low-level tar/config/manifest builders (used by layer_sync tests and +// integration tests that need to synthesize OCI images from scratch) +// ============================================================================ + +/// Build a minimal OCI-compatible tar layer with standard directory entries +/// and a single regular file whose `payload_size` bytes of content are +/// deterministic (byte `i` is `(i % 251) as u8`). +/// +/// The archive contains: +/// - `./` — root directory +/// - `./usr/` — usr directory +/// - `./usr/share/` — usr/share directory +/// - `./usr/share/data_<payload_size>` — regular file with `payload_size` bytes +/// +/// Using `payload_size > 64` (the inline threshold) will produce an external +/// object when the layer is imported into a repository. +pub fn build_oci_tar_layer(payload_size: usize) -> Vec { + let mut builder = ::tar::Builder::new(vec![]); + + // Root directory. + let mut root_hdr = ::tar::Header::new_ustar(); + root_hdr.set_entry_type(::tar::EntryType::Directory); + root_hdr.set_uid(0); + root_hdr.set_gid(0); + root_hdr.set_mode(0o755); + root_hdr.set_size(0); + builder + .append_data(&mut root_hdr, "./", std::io::empty()) + .unwrap(); + + // usr/ directory. 
+ let mut usr_hdr = ::tar::Header::new_ustar(); + usr_hdr.set_entry_type(::tar::EntryType::Directory); + usr_hdr.set_uid(0); + usr_hdr.set_gid(0); + usr_hdr.set_mode(0o755); + usr_hdr.set_size(0); + builder + .append_data(&mut usr_hdr, "./usr/", std::io::empty()) + .unwrap(); + + // usr/share/ directory. + let mut share_hdr = ::tar::Header::new_ustar(); + share_hdr.set_entry_type(::tar::EntryType::Directory); + share_hdr.set_uid(0); + share_hdr.set_gid(0); + share_hdr.set_mode(0o755); + share_hdr.set_size(0); + builder + .append_data(&mut share_hdr, "./usr/share/", std::io::empty()) + .unwrap(); + + // A data file with deterministic content. + let content: Vec = (0..payload_size).map(|i| (i % 251) as u8).collect(); + let mut file_hdr = ::tar::Header::new_ustar(); + file_hdr.set_entry_type(::tar::EntryType::Regular); + file_hdr.set_uid(0); + file_hdr.set_gid(0); + file_hdr.set_mode(0o644); + file_hdr.set_size(payload_size as u64); + builder + .append_data( + &mut file_hdr, + format!("./usr/share/data_{payload_size}"), + content.as_slice(), + ) + .unwrap(); + + builder.into_inner().unwrap() +} + +/// Build a minimal valid OCI `ImageConfiguration` JSON whose +/// `rootfs.diff_ids` list matches `diff_ids`. +/// +/// The produced JSON is recognized as a container image +/// (`MediaType::ImageConfig`) so that `finalize_oci_image` / `OciImage::open` +/// will generate a composefs EROFS from it. +pub fn make_config_json(diff_ids: &[String]) -> Vec { + let rootfs = RootFsBuilder::default() + .typ("layers") + .diff_ids(diff_ids.to_vec()) + .build() + .unwrap(); + + let cfg = ConfigBuilder::default().build().unwrap(); + + let config = ImageConfigurationBuilder::default() + .architecture("amd64") + .os("linux") + .rootfs(rootfs) + .config(cfg) + .build() + .unwrap(); + + config.to_string().unwrap().into_bytes() +} + +/// Build a minimal valid OCI `ImageManifest` JSON that references +/// `config_digest` as the config and has one layer descriptor per entry in +/// `diff_ids`. 
+/// +/// The layer descriptors use the diff_id as the compressed digest (valid for +/// splitstream tests where no real compressed transport is used). +pub fn make_manifest_json( + config_json: &[u8], + config_digest_str: &str, + diff_ids: &[String], +) -> Vec { + let config_digest: OciDigest = config_digest_str.parse().unwrap(); + let config_desc = DescriptorBuilder::default() + .media_type(MediaType::ImageConfig) + .digest(config_digest) + .size(config_json.len() as u64) + .build() + .unwrap(); + + let layer_descs: Vec<_> = diff_ids + .iter() + .map(|d| { + DescriptorBuilder::default() + .media_type(MediaType::ImageLayerGzip) + .digest(d.parse::().unwrap()) + .size(1u64) + .build() + .unwrap() + }) + .collect(); + + let manifest = ImageManifestBuilder::default() + .schema_version(2u32) + .media_type(MediaType::ImageManifest) + .config(config_desc) + .layers(layer_descs) + .build() + .unwrap(); + + manifest.to_string().unwrap().into_bytes() +} + /// Return value from image creation helpers. #[allow(dead_code)] pub struct TestImage { diff --git a/crates/composefs-oci/src/varlink_types.rs b/crates/composefs-oci/src/varlink_types.rs new file mode 100644 index 00000000..d52ab2f2 --- /dev/null +++ b/crates/composefs-oci/src/varlink_types.rs @@ -0,0 +1,253 @@ +//! Shared wire types and the `OciProxy` client trait for the +//! `org.composefs.Oci` varlink interface. +//! +//! These types are defined here (in `composefs-oci`) rather than +//! `composefs-ctl` so that both the repo-side service (`CfsctlService` in +//! composefs-ctl) and the containers-storage service (`CstorLayerService` in +//! composefs-storage) can share the same proxy trait without creating a +//! dependency cycle. +//! +//! `composefs-ctl` re-exports everything from this module; callers should +//! prefer `composefs_oci::varlink_types` or the re-exports in +//! `composefs_ctl::varlink::{layer_sync,oci::OciError,proxy::OciProxy}`. +//! +//! # Feature gate +//! +//! 
This module is compiled only when the `varlink` feature is enabled on +//! `composefs-oci` (which pulls in `zlink`). + +#![allow(missing_docs)] + +use serde::{Deserialize, Serialize}; + +// ── Locator for a layer inside containers-storage ──────────────────────────── + +/// Locator for a layer that lives inside a containers-storage store. +/// +/// Both fields are required when routing a `GetLayer` call to the +/// `CstorLayerService`; the repo-side service ignores this field entirely. +#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] +pub struct StorageLocator { + /// Absolute path to the containers-storage root directory + /// (e.g. `/var/lib/containers/storage`). + pub storage_path: String, + /// The layer ID within that storage root, as returned by + /// `storage_layer_ids()`. + pub layer_id: String, +} + +/// Parameters for the `GetLayer` method of the `org.composefs.Oci` interface. +/// +/// Exactly one of `diff_id` or `storage` must be set: +/// - **Repo service** (`CfsctlService`): reads `diff_id`, errors if `None`. +/// - **Cstor service** (`CstorLayerService`): reads `storage`, errors if `None`. +/// +/// Both fields are `Option` for forward-compatibility: new locator kinds can +/// be added in future without breaking old clients. +#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] +pub struct GetLayerParams { + /// OCI diff-id (`sha256:…`) identifying the layer in a composefs repo. + pub diff_id: Option, + /// Location of a specific layer inside a containers-storage store. + pub storage: Option, +} + +// ── Reply types ─────────────────────────────────────────────────────────────── + +/// Reply from `GetInfo`: capability tokens supported by this service. +#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] +pub struct GetInfoReply { + /// Capability tokens advertised by this service instance. + /// + /// Currently only `"splitdirfdstream-v0"` is defined. 
The cstor service + /// additionally reports `"source-containers-storage"` and `"read-only"`. + pub features: Vec, +} + +/// Reply from `HasLayer`: whether the layer is present in the repository. +#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] +pub struct HasLayerReply { + /// Whether the layer splitstream for the given diff-id is present. + pub present: bool, + /// Hex-encoded fs-verity hash of the layer splitstream, if present. + pub layer_verity: Option, +} + +/// Reply from `GetLayer`: the number of diff-directory slots in the logical FD +/// array. +/// +/// `GetLayer` is a **streaming** method (`more`): it yields multiple frames, +/// each carrying a batch of FDs. The client MUST concatenate the FD batches +/// from all frames (in arrival order) to reconstruct the full logical FD array: +/// +/// - `fds[0]` — data pipe read end (carries the `splitdirfdstream` bytes). +/// - `fds[1..=dir_count]` — the dirfds region (`dir_count` slots total). The +/// real objects-directory fd sits at a sparse, hash-determined index within +/// this region; the remaining (gap) slots hold inert dummy fds that +/// `reconstruct` never dereferences. +/// - `fds[dir_count+1..]` — opaque lifetime FDs. The client MUST hold every +/// one of these open until it has finished reading and processing all dir +/// fds, then close them all to signal completion to the server. +#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] +pub struct GetLayerReply { + /// Number of diff-directory file descriptors in the full logical FD array + /// (i.e. `fds[1..=dir_count]` after concatenating all frames' batches). + pub dir_count: u32, +} + +/// Reply from `PutLayer`: the verity hash of the imported layer, whether +/// it was already present, and per-object transfer statistics. +#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] +pub struct PutLayerReply { + /// Hex-encoded fs-verity hash of the committed layer splitstream. 
+ pub layer_verity: String, + /// `true` if the layer was already present before this call. + pub already_present: bool, + + /// Number of objects that were reflinked (FICLONE) into the destination. + #[serde(default)] + pub objects_reflinked: u64, + /// Number of objects hardlinked into the destination. + #[serde(default)] + pub objects_hardlinked: u64, + /// Number of objects byte-copied into the destination. + #[serde(default)] + pub objects_copied: u64, + /// Number of objects already present in the destination (skipped). + #[serde(default)] + pub objects_already_present: u64, +} + +/// A single (diff_id, layer_verity) pair passed to `FinalizeImage`. +#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] +pub struct LayerRef { + /// OCI diff-id of the layer (e.g. `"sha256:abcd..."`). + pub diff_id: String, + /// Hex-encoded fs-verity hash of the layer splitstream in the destination + /// repository, as returned by `PutLayer`. + pub layer_verity: String, +} + +/// Reply from `FinalizeImage`. +#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] +pub struct FinalizeImageReply { + /// OCI digest of the manifest. + pub manifest_digest: String, + /// Hex-encoded fs-verity hash of the manifest splitstream. + pub manifest_verity: String, + /// OCI digest of the config. + pub config_digest: String, + /// Hex-encoded fs-verity hash of the config splitstream. + pub config_verity: String, +} + +// ── OciError ────────────────────────────────────────────────────────────────── + +/// Errors returned by the `org.composefs.Oci` interface. +#[derive(Debug, zlink::ReplyError, zlink::introspect::ReplyError)] +#[zlink(interface = "org.composefs.Oci")] +pub enum OciError { + /// The repository could not be found or opened at the configured path. + RepoNotFound { + /// Description of the failure. + message: String, + }, + /// The given handle does not refer to an open repository. + InvalidHandle { + /// The handle that was not found. 
+ handle: u64, + }, + /// The named OCI image/reference does not exist. + NoSuchImage { + /// The image reference that was not found. + image: String, + }, + /// An unexpected internal error occurred while servicing the request. + InternalError { + /// Description of the failure. + message: String, + }, + /// The requested layer (by diff-id) is not present in the repository. + NoSuchLayer { + /// The diff-id that was not found. + diff_id: String, + }, + /// A supplied digest/diff-id string was malformed. + InvalidDigest { + /// Human-readable description of the parse failure. + message: String, + }, + /// Received layer content did not hash to the declared diff-id. + DiffIdMismatch { + /// The diff_id that was declared by the client. + expected: String, + /// The sha256 digest of the data that was actually received. + actual: String, + }, + /// The request was malformed (e.g. wrong fd count). + InvalidRequest { + /// Human-readable description of what was wrong. + message: String, + }, + /// The total fd count exceeds the per-frame cap for a `more=false` call. + FdLimitExceeded { + /// Total number of fds that would be sent. + fd_count: u64, + /// The per-frame cap that was exceeded. + max_per_frame: u64, + }, +} + +// ── OciProxy trait ──────────────────────────────────────────────────────────── + +/// Typed client proxy for the `org.composefs.Oci` varlink interface. +/// +/// Both the composefs repo service and the containers-storage service expose +/// this interface; this proxy trait can be used against either. +#[zlink::proxy(interface = "org.composefs.Oci")] +pub trait OciProxy { + /// Query capability tokens supported by the service. + async fn get_info(&mut self) -> zlink::Result>; + + /// Check whether a layer is present in the repository. 
+ async fn has_layer( + &mut self, + handle: u64, + diff_id: &str, + ) -> zlink::Result>; + + /// Stream the layer as a `splitdirfdstream` with full hardened fd-transport + /// contract (sparse dirfds, keepalive, lifetime fds, multi-frame). + /// + /// `params.diff_id` is used by the repo service; `params.storage` is used + /// by the cstor service. + #[zlink(more, return_fds)] + async fn get_layer( + &mut self, + handle: u64, + params: GetLayerParams, + ) -> zlink::Result< + impl zlink::futures_util::Stream< + Item = zlink::Result<(Result, Vec)>, + >, + >; + + /// Receive a layer as a `splitdirfdstream` from the client and import it. + async fn put_layer( + &mut self, + handle: u64, + diff_id: &str, + zerocopy: bool, + #[zlink(fds)] fds: Vec, + ) -> zlink::Result>; + + /// Finalize an OCI image after all layers have been imported. + async fn finalize_image( + &mut self, + handle: u64, + manifest_json: &str, + config_json: &str, + layers: Vec, + name: Option<&str>, + ) -> zlink::Result>; +} diff --git a/crates/composefs-splitdirfdstream/Cargo.toml b/crates/composefs-splitdirfdstream/Cargo.toml new file mode 100644 index 00000000..9f46172c --- /dev/null +++ b/crates/composefs-splitdirfdstream/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "composefs-splitdirfdstream" +description = "Binary format for serializing data with external dirfd-relative file references" +keywords = ["splitdirfdstream", "fd", "serialization"] + +edition.workspace = true +license.workspace = true +# Crate-local README (the inherited workspace value resolves to the workspace +# root README, not this crate's). +readme = "README.md" +repository.workspace = true +rust-version.workspace = true +version.workspace = true + +[features] +## Enable `spawn_self_reaping_producer` (requires tokio). 
+tokio = ["dep:tokio"] + +[dependencies] +rand = { version = "0.10.0", default-features = false, features = ["alloc"] } +rand_pcg = { version = "0.10.0", default-features = false } +rustix = { version = "1.0.0", default-features = false, features = ["fs", "pipe", "std"] } +sha2 = { version = "0.10", default-features = false, features = ["std"] } +thiserror = { version = "2.0.0", default-features = false } +tokio = { version = "1", default-features = false, features = ["rt"], optional = true } + +[dev-dependencies] +proptest = "1" +tempfile = { version = "3.8.0", default-features = false } + +[lints] +workspace = true diff --git a/crates/composefs-splitdirfdstream/README.md b/crates/composefs-splitdirfdstream/README.md new file mode 100644 index 00000000..6fc48887 --- /dev/null +++ b/crates/composefs-splitdirfdstream/README.md @@ -0,0 +1,15 @@ +# composefs-splitdirfdstream + +A data format and IPC protocol for sending a binary stream across local +processes via file descriptor passing (DBus, varlink, etc.). Designed for +sending tar archives of container image layers that are unpacked into a storage +system such as composefs or docker/podman `overlay` storage. + +See the [crate documentation](https://docs.rs/composefs-splitdirfdstream) (or +run `cargo doc --open`) for the wire format, chunk layouts, limits, safety +model, and API surface. + +## License + +Licensed under the same terms as the +[composefs-rs](https://github.com/containers/composefs-rs) workspace. diff --git a/crates/composefs-splitdirfdstream/src/lib.rs b/crates/composefs-splitdirfdstream/src/lib.rs new file mode 100644 index 00000000..51b501d1 --- /dev/null +++ b/crates/composefs-splitdirfdstream/src/lib.rs @@ -0,0 +1,1763 @@ +//! A data format and IPC protocol for sending a binary stream across local +//! processes via file descriptor passing (DBus, varlink, etc.). +//! +//! Designed for sending tar archives of container image layers that are unpacked +//! 
into a storage system such as composefs or docker/podman `overlay` storage. +//! More generally it is a mechanism for reassembling any byte stream from a mix +//! of inline bytes and whole-file references, useful whenever the bulk of a +//! stream already lives as files in a content store and copying that bulk inline +//! would be wasteful. +//! +//! External content is identified by `(dirfd_index, filename)` pairs so the +//! receiving side can `openat2` the files itself given a small out-of-band array +//! of directory file descriptors. This avoids passing one open fd per external +//! chunk (which would not scale to streams referencing thousands of files) and +//! suits reconstructing a stream from a content store laid out on disk. +//! +//! # Format +//! +//! A splitdirfdstream is a sequence of chunks with no header or footer. Combined +//! with an out-of-band, ordered array of directory file descriptors `dirfds[0..D]`, +//! it reconstructs a byte stream by concatenating each chunk's contribution: +//! +//! - **Metadata chunk** — raw stream metadata (tar header/padding) carried verbatim. +//! - **InlineData chunk** — file content transported inline (for non-world-readable +//! files the producer read through a privileged fd). +//! - **FileBackedData chunk** — `content_length` bytes of a file, resolved via +//! `openat2(dirfds[dirfd_index], filename, RESOLVE_BENEATH)` and read from offset 0. +//! +//! All integers are little-endian. Each chunk begins with a single type byte: +//! +//! | Type byte | Chunk | Remaining header | Body | +//! |-----------|----------------|-------------------------------------------------------------------|------------------------------| +//! | `0x00` | Metadata | `u32 LE` — body length | `length` bytes | +//! | `0x01` | InlineData | `u32 LE` — body length | `length` bytes | +//! | `0x02` | FileBackedData | `u64 LE` content_length, `u32 LE` dirfd_index, `u32 LE` name_len | `name_len` bytes of filename | +//! +//! 
Any other type byte is a hard error ([`Error::UnknownChunkType`]). +//! +//! There is no in-band end-of-stream sentinel: the stream ends at clean EOF at +//! the start of a type byte. A partial read anywhere inside a chunk is a +//! truncation error ([`Error::Truncated`]). Because the format carries no length +//! or checksum of the whole stream, callers that need end-to-end integrity should +//! verify the reconstructed bytes against an expected size and digest out of band. +//! +//! # Chunk layouts +//! +//! ### Metadata +//! +//! ```text +//! +--------+---------------+----------------------------+ +//! | 0x00 | length: u32LE | data: `length` raw bytes | +//! +--------+---------------+----------------------------+ +//! ``` +//! +//! `data` (`length` bytes) is written verbatim to output (tar headers, padding, +//! etc.). Empty writes are silently dropped by the writer; a zero-length Metadata +//! chunk is never encoded. `length` is bounded by [`MAX_INLINE_CHUNK_SIZE`] (256 MiB). +//! +//! ### InlineData +//! +//! ```text +//! +--------+---------------+----------------------------+ +//! | 0x01 | length: u32LE | data: `length` raw bytes | +//! +--------+---------------+----------------------------+ +//! ``` +//! +//! `length` is both the byte count of the data that follows and the logical file +//! size the consumer uses for its inline-vs-object storage decision. Unlike +//! FileBackedData, no directory fd is involved; the producer has already read the +//! content through its own privileged fd. +//! +//! A zero-length InlineData **is** written and round-trips correctly — a zero-byte +//! non-world-readable file must still be transported. `length` is bounded by +//! [`MAX_INLINE_CHUNK_SIZE`] (256 MiB), since the data is fully buffered in memory +//! during transport. +//! +//! ### FileBackedData +//! +//! ```text +//! +--------+---------------------+--------------------+-----------------+------------------------------+ +//! 
| 0x02 | content_len: u64LE | dirfd_index: u32LE | name_len: u32LE | filename: name_len raw bytes | +//! +--------+---------------------+--------------------+-----------------+------------------------------+ +//! ``` +//! +//! The consumer reads exactly `content_len` bytes starting at offset 0 of the +//! opened file. The explicit `content_len` makes the stream self-framing — entry +//! boundaries never depend on trusting the backing file's size, which closes a +//! TOCTOU window if the underlying store is mutated mid-read. +//! +//! A FileBackedData chunk always starts at offset 0: it references a whole file, +//! not a byte range. This is deliberate — it lets every external reference reflink +//! cleanly (`FICLONE`) when materialized on a CoW filesystem. There is no range +//! variant. +//! +//! `filename` is a path relative to `dirfds[dirfd_index]`. It may contain `/` to +//! traverse subdirectories within that root; it is not NUL-terminated and must not +//! contain NUL. +//! +//! # Limits +//! +//! | Constant | Value | Purpose | +//! |----------|-------|---------| +//! | [`MAX_INLINE_CHUNK_SIZE`] | 256 MiB | Bounds memory for a single Metadata or InlineData chunk body. | +//! | [`MAX_FILENAME_LEN`] | 4096 bytes | Bounds the FileBackedData filename length. | +//! +//! The reader rejects Metadata or InlineData chunks whose `length` exceeds `MAX_INLINE_CHUNK_SIZE` and filenames longer than +//! `MAX_FILENAME_LEN`; an out-of-range `dirfd_index` is rejected when the chunk is resolved against the supplied `dirfds`. +//! +//! # Safety +//! +//! The consuming side should always use `openat2(RESOLVE_BENEATH)` or equivalent. +//! This crate uses `rustix::fs::openat2` with +//! `RESOLVE_BENEATH | RESOLVE_NO_SYMLINKS | RESOLVE_NO_MAGICLINKS`, falling back +//! to `openat(O_NOFOLLOW)` on kernels older than 5.6. The [`validate_filename`] +//! function rejects `..` components, absolute paths, and embedded NUL bytes before +//! any syscall is made. +//! +//! The reader performs only *structural* validation (framing, limits); it does +//! **not** validate filename *content*. 
Callers that consume +//! [`Chunk::FileBackedData`] `.filename` directly must call [`validate_filename`] +//! (or use [`open_beneath`] / [`reconstruct`], which do). +//! +//! # Examples +//! +//! Write a stream, then inspect its chunk structure: +//! +//! ``` +//! use composefs_splitdirfdstream::{SplitdirfdstreamWriter, SplitdirfdstreamReader, Chunk}; +//! +//! // Write a stream: inline glue bytes plus a reference to dirfds[0]/data/blob. +//! let mut buffer = Vec::new(); +//! let mut writer = SplitdirfdstreamWriter::new(&mut buffer); +//! writer.write_metadata(b"tar header bytes").unwrap(); +//! writer.write_file_backed_data(0, 5, b"data/blob").unwrap(); // 5 bytes from dirfds[0]/data/blob +//! writer.write_metadata(b"tar padding").unwrap(); +//! writer.finish().unwrap(); +//! +//! // Inspect the chunk structure. +//! let mut reader = SplitdirfdstreamReader::new(buffer.as_slice()); +//! while let Some(chunk) = reader.next_chunk().unwrap() { +//! match chunk { +//! Chunk::Metadata(data) => { /* tar header/padding bytes */ } +//! Chunk::InlineData(data) => { /* inline file content (non-world-readable files) */ } +//! Chunk::FileBackedData { dirfd_index, length, filename } => { /* (dirfds[i], name, len) */ } +//! } +//! } +//! ``` +//! +//! To reconstruct the full byte stream, supply the directory fds to +//! [`reconstruct`], which resolves and splices each external chunk for you: +//! +//! ```no_run +//! use std::os::fd::BorrowedFd; +//! # fn demo(stream: &[u8], dir: BorrowedFd<'_>, out: &mut Vec) { +//! let dirfds = [dir]; +//! let total = composefs_splitdirfdstream::reconstruct(stream, &dirfds, out).unwrap(); +//! # let _ = total; +//! # } +//! ``` +//! +//! # API surface +//! +//! | Item | Role | +//! |------|------| +//! | [`SplitdirfdstreamWriter`] | Encode inline + external chunks into the wire format. | +//! | [`SplitdirfdstreamReader`] | Iterate a stream as borrowed [`Chunk`]s. | +//! 
| [`Chunk`] | `Metadata(&[u8])`, `InlineData(&[u8])`, or `FileBackedData { dirfd_index, length, filename }`. | +//! | [`reconstruct`] | Reconstruct the full byte stream given the directory fds. | +//! | [`open_beneath`] | Safely open one external file beneath a directory fd. | +//! | [`validate_filename`] | The path-safety predicate (reused by the writer/consumer). | +//! +//! # See also +//! +//! This crate is only the stream format. A higher-level control channel — +//! opening a source, negotiating capabilities, and handing over the stream fd +//! plus directory fds over a socket via `SCM_RIGHTS` — is layered on top +//! elsewhere (e.g. the `composefs-storage` layer-transfer service); it carries +//! structured metadata only, with all binary content flowing through this format +//! and the directory fds. + +// This is a library: emit diagnostics via the `log` crate (or return them), +// never by writing to the process's stdout/stderr. Genuinely-intentional +// exceptions carry a local `#[allow]` with justification. Test code is exempt. +#![cfg_attr(not(test), deny(clippy::print_stdout, clippy::print_stderr))] + +use std::ffi::CString; +use std::io::{Read, Write}; +use std::os::fd::{BorrowedFd, OwnedFd}; + +use rustix::fs::{Mode, OFlags, ResolveFlags}; +use rustix::io::Errno; + +pub mod transport; +#[cfg(feature = "tokio")] +pub use transport::spawn_self_reaping_producer; +pub use transport::{ + FdLimitError, LayerFdLayout, MAX_FDS_PER_FRAME, build_layer_fd_layout, open_devnull, + seed_from_id, split_fds_into_frames, +}; + +/// Maximum size for an inline chunk (256 MiB). +/// +/// This limit prevents denial-of-service attacks where a malicious stream +/// could specify an extremely large inline chunk size, causing unbounded +/// memory allocation. +pub const MAX_INLINE_CHUNK_SIZE: usize = 256 * 1024 * 1024; + +/// Maximum length of an external filename in bytes. 
+/// +/// Filenames longer than this are rejected by [`validate_filename`] and +/// by the reader before any buffer is resized. +pub const MAX_FILENAME_LEN: usize = 4096; + +/// Errors that can occur while reading or writing a splitdirfdstream. +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum Error { + /// An underlying I/O error. + #[error(transparent)] + Io(#[from] std::io::Error), + + /// The stream ended in the middle of a chunk. + #[error("truncated stream: expected {expected} more bytes in chunk")] + Truncated { + /// How many bytes were still expected when EOF was encountered. + /// + /// Semantics depend on *where* truncation occurred: + /// + /// - **Type byte**: `1` if EOF arrived after 0 bytes (but that is + /// returned as `Ok(None)`; this only fires for 0 < n < expected). + /// - **Inline body or FileBackedData header/name**: the *full* declared + /// size (i.e. the number passed to `read_exact`), because `read_exact` + /// does not report how many bytes it successfully consumed before EOF. + expected: u64, + }, + + /// An inline chunk's size field exceeds [`MAX_INLINE_CHUNK_SIZE`]. + #[error("inline chunk size {size} exceeds maximum {max}")] + InlineTooLarge { + /// The size read from the stream. + size: usize, + /// The maximum allowed size. + max: usize, + }, + + /// An unrecognised chunk type byte was encountered. + #[error("unknown chunk type byte 0x{0:02x}")] + UnknownChunkType(u8), + + /// A filename exceeds [`MAX_FILENAME_LEN`] bytes. + #[error("filename length {len} exceeds maximum {max}")] + FilenameTooLong { + /// The length of the filename. + len: usize, + /// The maximum allowed length. + max: usize, + }, + + /// A filename failed validation. + #[error("invalid filename: {reason}")] + InvalidFilename { + /// Human-readable description of why the filename is invalid. + reason: &'static str, + }, + + /// A dirfd index in an external chunk was out of range. 
+ #[error("dirfd index {index} out of range (have {count} dirfds)")] + DirfdIndexOutOfRange { + /// The index that was out of range. + index: u32, + /// The number of dirfds available. + count: usize, + }, + + /// An external file was shorter than declared in the stream. + #[error("external file shorter than declared length {declared} (got {actual})")] + ExternalTooShort { + /// The length declared in the stream. + declared: u64, + /// The number of bytes actually read before EOF. + actual: u64, + }, +} + +impl From for Error { + fn from(e: Errno) -> Self { + Error::Io(e.into()) + } +} + +/// Convenience alias for `Result`. +pub type Result = std::result::Result; + +/// A chunk decoded from a splitdirfdstream. +/// +/// Chunks carry either stream metadata (raw tar header/padding bytes), +/// inline-transported file content (non-world-readable files the producer +/// read through a privileged fd), or references to external files identified +/// by a directory fd index and a relative filename. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Chunk<'a> { + /// Stream metadata: raw tar header or padding bytes embedded directly in + /// the stream. Consumers write these verbatim into the output. + Metadata(&'a [u8]), + + /// Inline-transported file content. The producer read these bytes through + /// a privileged fd; the consumer decides whether to store them inline + /// (≤ splitstream threshold) or as an external object based on `data.len()`. + InlineData(&'a [u8]), + + /// Reference to an external file relative to one of the caller-supplied + /// directory file descriptors. + /// + /// The reader does **not** validate `filename` content — callers that + /// consume `filename` directly **must** call [`validate_filename`] or use + /// [`open_beneath`] / [`reconstruct`], which call it internally. + FileBackedData { + /// Index into the `dirfds` array supplied to [`reconstruct`]. + dirfd_index: u32, + /// Number of bytes to read from the file, starting at offset 0. 
+ length: u64, + /// Relative filename within `dirfds[dirfd_index]`. + filename: &'a [u8], + }, +} + +/// Writer for building a splitdirfdstream. +/// +/// Encodes inline data and dirfd-relative file references into the +/// splitdirfdstream binary format. +/// +/// # Example +/// +/// ``` +/// use composefs_splitdirfdstream::SplitdirfdstreamWriter; +/// +/// let mut buffer = Vec::new(); +/// let mut writer = SplitdirfdstreamWriter::new(&mut buffer); +/// writer.write_metadata(b"hello").unwrap(); +/// writer.write_file_backed_data(0, 42, b"objects/abc123").unwrap(); +/// let _buf = writer.finish().unwrap(); +/// ``` +#[derive(Debug)] +pub struct SplitdirfdstreamWriter { + writer: W, +} + +impl SplitdirfdstreamWriter { + /// Create a new writer wrapping `writer`. + pub fn new(writer: W) -> Self { + Self { writer } + } + + /// Write a [`Metadata`](Chunk::Metadata) chunk containing `data`. + /// + /// Empty slices are silently ignored (no bytes are written). + /// + /// # Errors + /// + /// Returns [`Error::InlineTooLarge`] if `data.len()` exceeds + /// [`MAX_INLINE_CHUNK_SIZE`], or propagates I/O errors from the + /// underlying writer. + pub fn write_metadata(&mut self, data: &[u8]) -> Result<()> { + if data.is_empty() { + return Ok(()); + } + if data.len() > MAX_INLINE_CHUNK_SIZE { + return Err(Error::InlineTooLarge { + size: data.len(), + max: MAX_INLINE_CHUNK_SIZE, + }); + } + self.writer.write_all(&[0x00u8])?; + self.writer.write_all(&(data.len() as u32).to_le_bytes())?; + self.writer.write_all(data)?; + Ok(()) + } + + /// Write an [`InlineData`](Chunk::InlineData) chunk carrying `data` bytes + /// of file content transported inline. + /// + /// Unlike [`write_metadata`](Self::write_metadata), a zero-length slice IS + /// written (a zero-byte non-world-readable file must still round-trip). + /// + /// # Errors + /// + /// Returns [`Error::InlineTooLarge`] if `data.len()` exceeds + /// [`MAX_INLINE_CHUNK_SIZE`], or propagates I/O errors from the underlying + /// writer. 
+ pub fn write_inline_data(&mut self, data: &[u8]) -> Result<()> { + if data.len() > MAX_INLINE_CHUNK_SIZE { + return Err(Error::InlineTooLarge { + size: data.len(), + max: MAX_INLINE_CHUNK_SIZE, + }); + } + self.writer.write_all(&[0x01u8])?; + self.writer.write_all(&(data.len() as u32).to_le_bytes())?; + self.writer.write_all(data)?; + Ok(()) + } + + /// Write a [`FileBackedData`](Chunk::FileBackedData) chunk referencing a + /// file by dirfd index and relative filename. + /// + /// The consumer will open `filename` beneath `dirfds[dirfd_index]` and + /// read exactly `length` bytes from offset 0. + /// + /// `filename` is validated by [`validate_filename`] before writing. + /// + /// # Errors + /// + /// Returns any [`Error`] produced by [`validate_filename`], or propagates + /// I/O errors from the underlying writer. + pub fn write_file_backed_data( + &mut self, + dirfd_index: u32, + length: u64, + filename: &[u8], + ) -> Result<()> { + validate_filename(filename)?; + self.writer.write_all(&[0x02u8])?; + self.writer.write_all(&length.to_le_bytes())?; + self.writer.write_all(&dirfd_index.to_le_bytes())?; + self.writer + .write_all(&(filename.len() as u32).to_le_bytes())?; + self.writer.write_all(filename)?; + Ok(()) + } + + /// Consume the writer and return the underlying `Write` impl. + pub fn finish(self) -> Result { + Ok(self.writer) + } +} + +/// Reader for parsing a splitdirfdstream. +/// +/// Yields [`Chunk`] values by parsing the binary format. The internal buffer +/// is reused across calls so that only one chunk's data is live at a time. 
+/// +/// # Example +/// +/// ``` +/// use composefs_splitdirfdstream::{SplitdirfdstreamReader, Chunk}; +/// +/// // Manually constructed stream: Metadata "hello" +/// // Format: [0x00][5u32 LE][b"hello"] +/// let mut data = Vec::new(); +/// data.push(0x00u8); +/// data.extend_from_slice(&5u32.to_le_bytes()); +/// data.extend_from_slice(b"hello"); +/// +/// let mut reader = SplitdirfdstreamReader::new(data.as_slice()); +/// assert_eq!(reader.next_chunk().unwrap(), Some(Chunk::Metadata(b"hello"))); +/// assert_eq!(reader.next_chunk().unwrap(), None); +/// ``` +#[derive(Debug)] +pub struct SplitdirfdstreamReader { + reader: R, + /// Internal buffer reused across [`next_chunk`](SplitdirfdstreamReader::next_chunk) calls. + buffer: Vec, +} + +impl SplitdirfdstreamReader { + /// Create a new reader wrapping `reader`. + pub fn new(reader: R) -> Self { + Self { + reader, + buffer: Vec::new(), + } + } + + /// Consume this reader, returning the underlying `Read` impl. + pub fn into_inner(self) -> R { + self.reader + } + + /// Return the next chunk from the stream, or `None` at a clean EOF. + /// + /// A clean EOF is one where zero bytes have been consumed from the current + /// type byte. Any partial read (0 < n < expected) is [`Error::Truncated`]. + /// + /// The returned [`Chunk::FileBackedData`] contains the raw `filename` bytes + /// without any validation — callers that use `filename` directly **must** + /// call [`validate_filename`] themselves, or use [`reconstruct`] which does + /// it internally via [`open_beneath`]. 
+ /// + /// # Errors + /// + /// - [`Error::Truncated`] — stream ended mid-chunk + /// - [`Error::InlineTooLarge`] — Metadata or InlineData body size exceeds [`MAX_INLINE_CHUNK_SIZE`] + /// - [`Error::FilenameTooLong`] — filename length exceeds [`MAX_FILENAME_LEN`] + /// - [`Error::UnknownChunkType`] — unrecognised type byte + /// - [`Error::Io`] — underlying I/O error + pub fn next_chunk(&mut self) -> Result>> { + // Step 1: read the 1-byte type, distinguishing clean EOF from truncation. + let mut type_byte = [0u8; 1]; + let mut got = 0usize; + loop { + match self.reader.read(&mut type_byte[got..]) { + Ok(0) => { + if got == 0 { + return Ok(None); // clean EOF at chunk boundary + } + return Err(Error::Truncated { expected: 1 }); + } + Ok(n) => { + got += n; + if got == 1 { + break; + } + } + Err(e) => return Err(Error::Io(e)), + } + } + + match type_byte[0] { + 0x00 | 0x01 => { + // Metadata (0x00) or InlineData (0x01): read 4-byte u32 LE body length. + let mut len_bytes = [0u8; 4]; + self.reader.read_exact(&mut len_bytes).map_err(|e| { + if e.kind() == std::io::ErrorKind::UnexpectedEof { + Error::Truncated { expected: 4 } + } else { + Error::Io(e) + } + })?; + let length = u32::from_le_bytes(len_bytes) as usize; + if length > MAX_INLINE_CHUNK_SIZE { + return Err(Error::InlineTooLarge { + size: length, + max: MAX_INLINE_CHUNK_SIZE, + }); + } + self.buffer.resize(length, 0); + self.reader.read_exact(&mut self.buffer).map_err(|e| { + if e.kind() == std::io::ErrorKind::UnexpectedEof { + Error::Truncated { + expected: length as u64, + } + } else { + Error::Io(e) + } + })?; + if type_byte[0] == 0x00 { + Ok(Some(Chunk::Metadata(&self.buffer))) + } else { + Ok(Some(Chunk::InlineData(&self.buffer))) + } + } + 0x02 => { + // FileBackedData: [u64 LE content_length][u32 LE dirfd_index][u32 LE name_len][name bytes] + let mut header = [0u8; 16]; + self.reader.read_exact(&mut header).map_err(|e| { + if e.kind() == std::io::ErrorKind::UnexpectedEof { + Error::Truncated { 
expected: 16 } + } else { + Error::Io(e) + } + })?; + let length = u64::from_le_bytes(header[0..8].try_into().unwrap()); + let dirfd_index = u32::from_le_bytes(header[8..12].try_into().unwrap()); + let name_len = u32::from_le_bytes(header[12..16].try_into().unwrap()) as usize; + if name_len > MAX_FILENAME_LEN { + return Err(Error::FilenameTooLong { + len: name_len, + max: MAX_FILENAME_LEN, + }); + } + self.buffer.resize(name_len, 0); + self.reader.read_exact(&mut self.buffer).map_err(|e| { + if e.kind() == std::io::ErrorKind::UnexpectedEof { + Error::Truncated { + expected: name_len as u64, + } + } else { + Error::Io(e) + } + })?; + Ok(Some(Chunk::FileBackedData { + dirfd_index, + length, + filename: &self.buffer, + })) + } + other => Err(Error::UnknownChunkType(other)), + } + } +} + +/// Validate that `filename` is an acceptable external filename. +/// +/// Checks, in order: +/// 1. Not empty. +/// 2. Not longer than [`MAX_FILENAME_LEN`]. +/// 3. No embedded NUL bytes. +/// 4. Not an absolute path (does not start with `/`). +/// 5. No `..` path components. +/// +/// Note that `.` components and empty components (from `//`) are permitted, +/// as the kernel will handle them safely. +/// +/// # Errors +/// +/// Returns [`Error::InvalidFilename`] or [`Error::FilenameTooLong`] on failure. +pub fn validate_filename(filename: &[u8]) -> Result<()> { + if filename.is_empty() { + return Err(Error::InvalidFilename { + reason: "empty filename", + }); + } + if filename.len() > MAX_FILENAME_LEN { + return Err(Error::FilenameTooLong { + len: filename.len(), + max: MAX_FILENAME_LEN, + }); + } + if filename.contains(&0u8) { + return Err(Error::InvalidFilename { + reason: "embedded NUL", + }); + } + if filename[0] == b'/' { + return Err(Error::InvalidFilename { + reason: "absolute path", + }); + } + for component in filename.split(|&b| b == b'/') { + if component == b".." 
{ + return Err(Error::InvalidFilename { + reason: "`..` component", + }); + } + } + Ok(()) +} + +/// Open a file at `filename` relative to `dirfd` using safe kernel primitives. +/// +/// Internally calls `openat2(2)` with `RESOLVE_BENEATH | RESOLVE_NO_MAGICLINKS | +/// RESOLVE_NO_XDEV` when available. On kernels that do not support `openat2` +/// (`ENOSYS`), falls back to plain `openat(2)`. +/// +/// # Security note +/// +/// The `openat2` path prevents directory escapes (`RESOLVE_BENEATH`), blocks +/// magic-link traversal (`RESOLVE_NO_MAGICLINKS`), prevents crossing filesystem +/// boundaries (`RESOLVE_NO_XDEV`), and **does** allow symlinks as long as they +/// resolve within the base directory. A symlink pointing outside the base +/// directory is still rejected by `RESOLVE_BENEATH`. +/// +/// The `openat` fallback does not enforce `RESOLVE_BENEATH` (that kernel +/// feature is unavailable on old kernels), so callers should prefer environments +/// with kernel ≥ 5.6 (where `openat2` is available) when strict confinement is +/// required. [`validate_filename`] is still called before any syscall to reject +/// `..` components and absolute paths. +/// +/// # Errors +/// +/// Returns any error from [`validate_filename`], or an [`Error::Io`] wrapping +/// the kernel error (`EXDEV` for escape attempts, etc.). +pub fn open_beneath(dirfd: BorrowedFd<'_>, filename: &[u8]) -> Result { + validate_filename(filename)?; + + // Build a CString: validate_filename rejects embedded NUL so this is infallible. + let cname = CString::new(filename).expect("validate_filename guarantees no NUL"); + + // Try openat2 first (Linux ≥ 5.6). + // RESOLVE_BENEATH — prevent escaping the base directory via any path tricks. + // RESOLVE_NO_MAGICLINKS — block /proc/self/fd-style magic links. + // RESOLVE_NO_XDEV — prevent crossing filesystem mount boundaries. + // Symlinks that stay within the base directory are permitted. 
+ let oflags = OFlags::RDONLY | OFlags::CLOEXEC; + let result = rustix::fs::openat2( + dirfd, + &cname, + oflags, + Mode::empty(), + ResolveFlags::BENEATH | ResolveFlags::NO_MAGICLINKS | ResolveFlags::NO_XDEV, + ); + + match result { + Ok(fd) => Ok(fd), + Err(e) if e == Errno::NOSYS => { + // Kernel too old for openat2; fall back to plain openat. + // validate_filename already rejected `..` and absolute paths. + rustix::fs::openat( + dirfd, + &cname, + OFlags::RDONLY | OFlags::CLOEXEC, + Mode::empty(), + ) + .map_err(Error::from) + } + Err(e) => Err(e.into()), + } +} + +/// Reconstruct the byte stream encoded in `stream` by combining inline chunks +/// and external file data, writing all output to `output`. +/// +/// For each [`Chunk::FileBackedData`], the corresponding file is opened with +/// [`open_beneath`] using `dirfds[dirfd_index]` as the base directory, and +/// exactly `length` bytes are read from offset 0 via positional reads +/// (`pread(2)`), so the same file can be referenced multiple times without +/// seeking. +/// +/// Returns the total number of bytes written to `output`. +/// +/// # Errors +/// +/// - [`Error::DirfdIndexOutOfRange`] — `dirfd_index` ≥ `dirfds.len()` +/// - [`Error::ExternalTooShort`] — external file has fewer bytes than declared +/// - Any error from [`open_beneath`] or from writing to `output` +pub fn reconstruct( + stream: R, + dirfds: &[BorrowedFd<'_>], + output: &mut W, +) -> Result { + let mut reader = SplitdirfdstreamReader::new(stream); + let mut total: u64 = 0; + const BUF_SIZE: usize = 128 * 1024; + + while let Some(chunk) = reader.next_chunk()? 
{ + match chunk { + Chunk::Metadata(data) => { + output.write_all(data)?; + total += data.len() as u64; + } + Chunk::InlineData(data) => { + output.write_all(data)?; + total += data.len() as u64; + } + Chunk::FileBackedData { + dirfd_index, + length, + filename, + } => { + let idx = dirfd_index as usize; + if idx >= dirfds.len() { + return Err(Error::DirfdIndexOutOfRange { + index: dirfd_index, + count: dirfds.len(), + }); + } + let fd = open_beneath(dirfds[idx], filename)?; + // Cap the scratch buffer at BUF_SIZE; never size it from the + // attacker-controlled `length` (which can be up to u64::MAX), + // which would overflow `as usize` / wrap on 32-bit and panic + // under overflow-checks. `to_read` below bounds each read. + let mut buf = + vec![0u8; BUF_SIZE.min(usize::try_from(length).unwrap_or(usize::MAX))]; + let mut remaining = length; + let mut offset: u64 = 0; + while remaining > 0 { + let to_read = (remaining as usize).min(buf.len()); + let n = + rustix::io::pread(&fd, &mut buf[..to_read], offset).map_err(Error::from)?; + if n == 0 { + return Err(Error::ExternalTooShort { + declared: length, + actual: length - remaining, + }); + } + output.write_all(&buf[..n])?; + remaining -= n as u64; + offset += n as u64; + } + total += length; + } + } + } + + Ok(total) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::os::fd::AsFd; + + // ------------------------------------------------------------------------- + // Helpers + // ------------------------------------------------------------------------- + + /// Write `chunks` into a buffer and read them back, returning the decoded + /// chunks as `(dirfd_index, length, filename, inline_data)` tuples. 
+ fn roundtrip_stream(writes: &[WriteCmd<'_>]) -> (Vec, Vec) { + let mut buf = Vec::new(); + { + let mut w = SplitdirfdstreamWriter::new(&mut buf); + for cmd in writes { + match cmd { + WriteCmd::Metadata(data) => w.write_metadata(data).unwrap(), + WriteCmd::InlineData(data) => w.write_inline_data(data).unwrap(), + WriteCmd::FileBackedData { + dirfd_index, + length, + filename, + } => w + .write_file_backed_data(*dirfd_index, *length, filename) + .unwrap(), + } + } + w.finish().unwrap(); + } + let mut reader = SplitdirfdstreamReader::new(buf.as_slice()); + let mut out = Vec::new(); + while let Some(chunk) = reader.next_chunk().unwrap() { + out.push(DecodedChunk::from(&chunk)); + } + (buf, out) + } + + #[derive(Debug)] + enum WriteCmd<'a> { + Metadata(&'a [u8]), + InlineData(&'a [u8]), + FileBackedData { + dirfd_index: u32, + length: u64, + filename: &'a [u8], + }, + } + + #[derive(Debug, PartialEq, Eq)] + enum DecodedChunk { + Metadata(Vec), + InlineData(Vec), + FileBackedData { + dirfd_index: u32, + length: u64, + filename: Vec, + }, + } + + impl<'a> From<&Chunk<'a>> for DecodedChunk { + fn from(c: &Chunk<'a>) -> Self { + match c { + Chunk::Metadata(d) => DecodedChunk::Metadata(d.to_vec()), + Chunk::InlineData(d) => DecodedChunk::InlineData(d.to_vec()), + Chunk::FileBackedData { + dirfd_index, + length, + filename, + } => DecodedChunk::FileBackedData { + dirfd_index: *dirfd_index, + length: *length, + filename: filename.to_vec(), + }, + } + } + } + + // ------------------------------------------------------------------------- + // Basic wire-format tests + // ------------------------------------------------------------------------- + + #[test] + fn empty_stream_returns_none() { + let mut reader = SplitdirfdstreamReader::new(b"".as_slice()); + assert_eq!(reader.next_chunk().unwrap(), None); + } + + #[test] + fn roundtrip_inline_only() { + let (_, chunks) = + roundtrip_stream(&[WriteCmd::Metadata(b"hello"), WriteCmd::Metadata(b"world")]); + assert_eq!( + chunks, + 
vec![ + DecodedChunk::Metadata(b"hello".to_vec()), + DecodedChunk::Metadata(b"world".to_vec()), + ] + ); + } + + #[test] + fn roundtrip_external_only() { + let (_, chunks) = roundtrip_stream(&[ + WriteCmd::FileBackedData { + dirfd_index: 0, + length: 100, + filename: b"a/b", + }, + WriteCmd::FileBackedData { + dirfd_index: 3, + length: 0, + filename: b"x/y/z", + }, + ]); + assert_eq!( + chunks, + vec![ + DecodedChunk::FileBackedData { + dirfd_index: 0, + length: 100, + filename: b"a/b".to_vec() + }, + DecodedChunk::FileBackedData { + dirfd_index: 3, + length: 0, + filename: b"x/y/z".to_vec() + }, + ] + ); + } + + #[test] + fn roundtrip_mixed_interleaved() { + let (_, chunks) = roundtrip_stream(&[ + WriteCmd::Metadata(b"header"), + WriteCmd::FileBackedData { + dirfd_index: 0, + length: 7, + filename: b"blob", + }, + WriteCmd::Metadata(b"middle"), + WriteCmd::FileBackedData { + dirfd_index: 1, + length: 3, + filename: b"sub/c", + }, + WriteCmd::Metadata(b"footer"), + ]); + assert_eq!(chunks.len(), 5); + assert_eq!(chunks[0], DecodedChunk::Metadata(b"header".to_vec())); + assert_eq!( + chunks[1], + DecodedChunk::FileBackedData { + dirfd_index: 0, + length: 7, + filename: b"blob".to_vec() + } + ); + assert_eq!(chunks[2], DecodedChunk::Metadata(b"middle".to_vec())); + assert_eq!( + chunks[3], + DecodedChunk::FileBackedData { + dirfd_index: 1, + length: 3, + filename: b"sub/c".to_vec() + } + ); + assert_eq!(chunks[4], DecodedChunk::Metadata(b"footer".to_vec())); + } + + #[test] + fn empty_inline_is_no_op() { + let (buf, chunks) = roundtrip_stream(&[ + WriteCmd::Metadata(b""), + WriteCmd::Metadata(b"real"), + WriteCmd::Metadata(b""), + ]); + // Only "real" produces a chunk; empties are silently dropped. + assert_eq!(chunks, vec![DecodedChunk::Metadata(b"real".to_vec())]); + // Buffer: 1-byte type + 4-byte u32 length + 4 bytes data + assert_eq!(buf.len(), 9); + } + + #[test] + fn external_wire_layout() { + // Verify the exact byte layout for a known external chunk. 
+ // dirfd_index=2, length=9, filename=b"ab" + // Format: [0x02][9u64 LE][2u32 LE][2u32 LE][b'a',b'b'] + let mut buf = Vec::new(); + SplitdirfdstreamWriter::new(&mut buf) + .write_file_backed_data(2, 9, b"ab") + .unwrap(); + + let expected: Vec = { + let mut v = Vec::new(); + v.push(0x02u8); // type byte + v.extend_from_slice(&9u64.to_le_bytes()); // content_length + v.extend_from_slice(&2u32.to_le_bytes()); // dirfd_index + v.extend_from_slice(&2u32.to_le_bytes()); // name_len + v.extend_from_slice(b"ab"); // filename + v + }; + assert_eq!(buf, expected); + } + + #[test] + fn boundary_inline_sizes() { + for &size in &[1usize, 7, 8, 9, 255, 256, 257, 4095, 4096, 4097] { + let data: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + let (buf, chunks) = roundtrip_stream(&[WriteCmd::Metadata(&data)]); + + // Wire layout: 1-byte type + 4-byte u32 length + `size` bytes + assert_eq!(buf.len(), 5 + size, "buf.len() for size={size}"); + assert_eq!(buf[0], 0x00u8, "type byte for size={size}"); + let len_field = u32::from_le_bytes(buf[1..5].try_into().unwrap()); + assert_eq!(len_field, size as u32, "length field for size={size}"); + + assert_eq!( + chunks, + vec![DecodedChunk::Metadata(data)], + "data for size={size}" + ); + } + } + + // ------------------------------------------------------------------------- + // Reader limit / boundary error tests (hand-crafted buffers) + // ------------------------------------------------------------------------- + + #[test] + fn unknown_chunk_type_returns_error() { + // A type byte that is not 0x00, 0x01, or 0x02 must yield UnknownChunkType. + let buf = vec![0x03u8]; + let mut reader = SplitdirfdstreamReader::new(buf.as_slice()); + let err = reader.next_chunk().unwrap_err(); + assert!( + matches!(err, Error::UnknownChunkType(0x03)), + "expected UnknownChunkType(0x03), got {err:?}" + ); + } + + #[test] + fn error_unknown_chunk_type() { + // Feeding byte 0x80 must yield UnknownChunkType(0x80). 
+ let buf = vec![0x80u8]; + let mut reader = SplitdirfdstreamReader::new(buf.as_slice()); + let err = reader.next_chunk().unwrap_err(); + assert!( + matches!(err, Error::UnknownChunkType(0x80)), + "expected UnknownChunkType(0x80), got {err:?}" + ); + } + + #[test] + fn error_filename_too_long_in_reader() { + // [0x02][0u64 LE][0u32 LE][name_len as u32 LE] with name_len > MAX_FILENAME_LEN + let name_len = MAX_FILENAME_LEN + 1; + let mut buf = Vec::new(); + buf.push(0x02u8); + buf.extend_from_slice(&0u64.to_le_bytes()); // content_length + buf.extend_from_slice(&0u32.to_le_bytes()); // dirfd_index + buf.extend_from_slice(&(name_len as u32).to_le_bytes()); // name_len + let mut reader = SplitdirfdstreamReader::new(buf.as_slice()); + let err = reader.next_chunk().unwrap_err(); + assert!( + matches!(err, Error::FilenameTooLong { len, max } if len == name_len && max == MAX_FILENAME_LEN), + "expected FilenameTooLong, got {err:?}" + ); + } + + #[test] + fn error_inline_too_large() { + // [0x00][(512 MiB) as u32 LE] — no body needed; length check fires first. + let size: u32 = 512 * 1024 * 1024; + let mut buf = Vec::new(); + buf.push(0x00u8); + buf.extend_from_slice(&size.to_le_bytes()); + let mut reader = SplitdirfdstreamReader::new(buf.as_slice()); + let err = reader.next_chunk().unwrap_err(); + assert!( + matches!(err, Error::InlineTooLarge { size: s, max } if s == 512*1024*1024 && max == MAX_INLINE_CHUNK_SIZE), + "expected InlineTooLarge, got {err:?}" + ); + } + + #[test] + fn error_truncated_inline_content() { + // [0x00][100u32 LE] then only 10 data bytes; Truncated{expected:100}. 
+ let mut buf = Vec::new(); + buf.push(0x00u8); + buf.extend_from_slice(&100u32.to_le_bytes()); + buf.extend_from_slice(&[0u8; 10]); + let mut reader = SplitdirfdstreamReader::new(buf.as_slice()); + let err = reader.next_chunk().unwrap_err(); + assert!( + matches!(err, Error::Truncated { expected: 100 }), + "expected Truncated{{100}}, got {err:?}" + ); + } + + #[test] + fn error_truncated_file_backed_data() { + // [0x02] then only 5 of the 16 required header bytes. + let mut buf = Vec::new(); + buf.push(0x02u8); + buf.extend_from_slice(&[0u8; 5]); + let mut reader = SplitdirfdstreamReader::new(buf.as_slice()); + let err = reader.next_chunk().unwrap_err(); + assert!( + matches!(err, Error::Truncated { .. }), + "expected Truncated, got {err:?}" + ); + } + + #[test] + fn error_truncated_prefix_after_type_byte() { + // Type byte 0x00 is read, then EOF before the 4-byte length — Truncated{expected:4}. + let buf = vec![0x00u8]; + let mut reader = SplitdirfdstreamReader::new(buf.as_slice()); + let err = reader.next_chunk().unwrap_err(); + assert!( + matches!(err, Error::Truncated { expected: 4 }), + "expected Truncated{{4}}, got {err:?}" + ); + } + + #[test] + fn error_dirfd_index_out_of_range_via_reconstruct() { + // External chunk references dirfd_index=5 but we supply 1 dirfd. 
+ let tmp = tempfile::tempdir().unwrap(); + let dir_fd = rustix::fs::open( + tmp.path(), + OFlags::RDONLY | OFlags::DIRECTORY | OFlags::CLOEXEC, + Mode::empty(), + ) + .unwrap(); + + let mut buf = Vec::new(); + SplitdirfdstreamWriter::new(&mut buf) + .write_file_backed_data(5, 0, b"dummy") + .unwrap(); + + let dirfds: &[BorrowedFd<'_>] = &[dir_fd.as_fd()]; + let mut out = Vec::new(); + let err = reconstruct(buf.as_slice(), dirfds, &mut out).unwrap_err(); + assert!( + matches!(err, Error::DirfdIndexOutOfRange { index: 5, count: 1 }), + "expected DirfdIndexOutOfRange, got {err:?}" + ); + } + + // ------------------------------------------------------------------------- + // validate_filename tests (data-driven) + // ------------------------------------------------------------------------- + + /// Expected outcome for a validate_filename test case. + #[derive(Debug)] + enum FilenameExpect { + /// validate_filename must return Ok(()). + Ok, + /// validate_filename must return Err(InvalidFilename { reason }) where + /// `reason` contains the given substring. + InvalidFilename(&'static str), + /// validate_filename must return Err(FilenameTooLong { .. }). 
+ TooLong, + } + + #[test] + fn validate_filename_cases() { + let long_ok: Vec = vec![b'a'; MAX_FILENAME_LEN]; + let long_bad: Vec = vec![b'a'; MAX_FILENAME_LEN + 1]; + + let cases: &[(&[u8], FilenameExpect)] = &[ + // ── rejection cases ────────────────────────────────────────────── + (b"", FilenameExpect::InvalidFilename("empty filename")), + (b"/abs", FilenameExpect::InvalidFilename("absolute path")), + (b"a/../b", FilenameExpect::InvalidFilename("`..` component")), + ( + b"../escape", + FilenameExpect::InvalidFilename("`..` component"), + ), + (b"a/b/..", FilenameExpect::InvalidFilename("`..` component")), + (b"..", FilenameExpect::InvalidFilename("`..` component")), // D2: bare `..` + (b"foo\0bar", FilenameExpect::InvalidFilename("embedded NUL")), + (&long_bad, FilenameExpect::TooLong), + // ── acceptance cases ───────────────────────────────────────────── + (b"a/b/c", FilenameExpect::Ok), // normal relative path + (b"a/./b", FilenameExpect::Ok), // `.` is allowed + (&long_ok, FilenameExpect::Ok), // exactly MAX_FILENAME_LEN + (b"..foo", FilenameExpect::Ok), // D5: `..`-prefix but not a component + (b"foo..", FilenameExpect::Ok), // D5: `..`-suffix but not a component + (b"a/..foo", FilenameExpect::Ok), // D5: `..`-prefixed component + (b"foo../b", FilenameExpect::Ok), // D5: `..`-suffixed component in path + ]; + + for (filename, expect) in cases { + let result = validate_filename(filename); + match expect { + FilenameExpect::Ok => { + assert!( + result.is_ok(), + "validate_filename({filename:?}) should be Ok, got {result:?}" + ); + } + FilenameExpect::InvalidFilename(substr) => { + assert!( + matches!(&result, Err(Error::InvalidFilename { reason }) if reason.contains(substr)), + "validate_filename({filename:?}) should be InvalidFilename containing {substr:?}, got {result:?}" + ); + } + FilenameExpect::TooLong => { + assert!( + matches!(&result, Err(Error::FilenameTooLong { .. 
})), + "validate_filename({filename:?}) should be FilenameTooLong, got {result:?}" + ); + } + } + } + } + + // ------------------------------------------------------------------------- + // Reconstruction tests + // ------------------------------------------------------------------------- + + fn open_dir(path: &std::path::Path) -> OwnedFd { + rustix::fs::open( + path, + OFlags::RDONLY | OFlags::DIRECTORY | OFlags::CLOEXEC, + Mode::empty(), + ) + .unwrap() + } + + #[test] + fn reconstruct_inline_only() { + let tmp = tempfile::tempdir().unwrap(); + let dir_fd = open_dir(tmp.path()); + + let mut buf = Vec::new(); + { + let mut w = SplitdirfdstreamWriter::new(&mut buf); + w.write_metadata(b"Hello, ").unwrap(); + w.write_metadata(b"world!").unwrap(); + w.finish().unwrap(); + } + + let dirfds: &[BorrowedFd<'_>] = &[dir_fd.as_fd()]; + let mut out = Vec::new(); + let n = reconstruct(buf.as_slice(), dirfds, &mut out).unwrap(); + assert_eq!(out, b"Hello, world!"); + assert_eq!(n, 13); + } + + #[test] + fn reconstruct_with_externals() { + let tmp = tempfile::tempdir().unwrap(); + std::fs::write(tmp.path().join("a"), b"FILEONE").unwrap(); + std::fs::create_dir(tmp.path().join("subdir")).unwrap(); + std::fs::write(tmp.path().join("subdir/b"), b"FILETWO").unwrap(); + + let dir_fd = open_dir(tmp.path()); + + let mut buf = Vec::new(); + { + let mut w = SplitdirfdstreamWriter::new(&mut buf); + w.write_metadata(b"[").unwrap(); + w.write_file_backed_data(0, 7, b"a").unwrap(); + w.write_metadata(b"|").unwrap(); + w.write_file_backed_data(0, 7, b"subdir/b").unwrap(); + w.write_metadata(b"]").unwrap(); + w.finish().unwrap(); + } + + let dirfds: &[BorrowedFd<'_>] = &[dir_fd.as_fd()]; + let mut out = Vec::new(); + let n = reconstruct(buf.as_slice(), dirfds, &mut out).unwrap(); + assert_eq!(out, b"[FILEONE|FILETWO]"); + assert_eq!(n, 17); + } + + #[test] + fn reconstruct_same_file_twice() { + let tmp = tempfile::tempdir().unwrap(); + std::fs::write(tmp.path().join("data"), 
b"REPEAT").unwrap(); + let dir_fd = open_dir(tmp.path()); + + let mut buf = Vec::new(); + { + let mut w = SplitdirfdstreamWriter::new(&mut buf); + w.write_file_backed_data(0, 6, b"data").unwrap(); + w.write_metadata(b"-").unwrap(); + w.write_file_backed_data(0, 6, b"data").unwrap(); + w.finish().unwrap(); + } + + let dirfds: &[BorrowedFd<'_>] = &[dir_fd.as_fd()]; + let mut out = Vec::new(); + let n = reconstruct(buf.as_slice(), dirfds, &mut out).unwrap(); + // pread always starts from offset 0, so both refs read the full file. + assert_eq!(out, b"REPEAT-REPEAT"); + assert_eq!(n, 13); + } + + #[test] + fn reconstruct_length_shorter_than_file() { + let tmp = tempfile::tempdir().unwrap(); + std::fs::write(tmp.path().join("big"), b"ABCDEFGH").unwrap(); // 8 bytes + let dir_fd = open_dir(tmp.path()); + + let mut buf = Vec::new(); + { + let mut w = SplitdirfdstreamWriter::new(&mut buf); + w.write_file_backed_data(0, 4, b"big").unwrap(); // only first 4 bytes + w.finish().unwrap(); + } + + let dirfds: &[BorrowedFd<'_>] = &[dir_fd.as_fd()]; + let mut out = Vec::new(); + let n = reconstruct(buf.as_slice(), dirfds, &mut out).unwrap(); + assert_eq!(out, b"ABCD"); + assert_eq!(n, 4); + } + + #[test] + fn reconstruct_length_longer_than_file_is_error() { + let tmp = tempfile::tempdir().unwrap(); + std::fs::write(tmp.path().join("small"), b"HI").unwrap(); // 2 bytes + let dir_fd = open_dir(tmp.path()); + + let mut buf = Vec::new(); + { + let mut w = SplitdirfdstreamWriter::new(&mut buf); + w.write_file_backed_data(0, 100, b"small").unwrap(); // declare 100, file has 2 + w.finish().unwrap(); + } + + let dirfds: &[BorrowedFd<'_>] = &[dir_fd.as_fd()]; + let mut out = Vec::new(); + let err = reconstruct(buf.as_slice(), dirfds, &mut out).unwrap_err(); + assert!( + matches!( + err, + Error::ExternalTooShort { + declared: 100, + actual: 2 + } + ), + "expected ExternalTooShort, got {err:?}" + ); + } + + /// Build a raw splitdirfdstream buffer containing a single FileBackedData chunk + 
/// with a given `length` field and filename, without going through + /// the writer (so we can set length=u64::MAX which the writer itself would + /// also accept). + fn make_external_chunk_buf(dirfd_index: u32, length: u64, filename: &[u8]) -> Vec { + let mut buf = Vec::new(); + buf.push(0x02u8); // type byte + buf.extend_from_slice(&length.to_le_bytes()); // content_length + buf.extend_from_slice(&dirfd_index.to_le_bytes()); // dirfd_index + buf.extend_from_slice(&(filename.len() as u32).to_le_bytes()); // name_len + buf.extend_from_slice(filename); // filename + buf + } + + #[test] + fn reconstruct_length_u64_max_does_not_overflow() { + // Guards the buffer-sizing path: `BUF_SIZE.min(usize::try_from(length).unwrap_or(usize::MAX))` + // must not panic or produce a wrong result when length = u64::MAX. + // The file has only a few real bytes; the first pread reads them, the + // second sees EOF and must return ExternalTooShort — not a panic. + let tmp = tempfile::tempdir().unwrap(); + let fname = b"tiny"; + std::fs::write(tmp.path().join("tiny"), b"abc").unwrap(); // 3 bytes + let dir_fd = open_dir(tmp.path()); + + let buf = make_external_chunk_buf(0, u64::MAX, fname); + + let dirfds: &[BorrowedFd<'_>] = &[dir_fd.as_fd()]; + let mut out = Vec::new(); + let err = reconstruct(buf.as_slice(), dirfds, &mut out).unwrap_err(); + assert!( + matches!( + err, + Error::ExternalTooShort { + declared: u64::MAX, + .. 
+ } + ), + "expected ExternalTooShort with declared=u64::MAX, got {err:?}" + ); + } + + #[test] + fn reconstruct_multiple_dirfds() { + let tmp0 = tempfile::tempdir().unwrap(); + let tmp1 = tempfile::tempdir().unwrap(); + std::fs::write(tmp0.path().join("x"), b"FROM0").unwrap(); + std::fs::write(tmp1.path().join("y"), b"FROM1").unwrap(); + let dir0 = open_dir(tmp0.path()); + let dir1 = open_dir(tmp1.path()); + + let mut buf = Vec::new(); + { + let mut w = SplitdirfdstreamWriter::new(&mut buf); + w.write_file_backed_data(0, 5, b"x").unwrap(); + w.write_metadata(b"+").unwrap(); + w.write_file_backed_data(1, 5, b"y").unwrap(); + w.finish().unwrap(); + } + + let dirfds: &[BorrowedFd<'_>] = &[dir0.as_fd(), dir1.as_fd()]; + let mut out = Vec::new(); + let n = reconstruct(buf.as_slice(), dirfds, &mut out).unwrap(); + assert_eq!(out, b"FROM0+FROM1"); + assert_eq!(n, 11); + } + + // ------------------------------------------------------------------------- + // open_beneath safety tests + // ------------------------------------------------------------------------- + + #[test] + fn open_beneath_rejects_escape_via_dotdot() { + // validate_filename catches ".." before any syscall, so this is + // kernel-independent. + let tmp = tempfile::tempdir().unwrap(); + let dir_fd = open_dir(tmp.path()); + let err = open_beneath(dir_fd.as_fd(), b"../anything").unwrap_err(); + assert!( + matches!( + err, + Error::InvalidFilename { + reason: "`..` component" + } + ), + "expected InvalidFilename, got {err:?}" + ); + } + + #[test] + fn open_beneath_follows_symlink_within_base() { + // A symlink whose target resolves *within* the base directory must be + // followed successfully — this is the new behaviour after removing + // RESOLVE_NO_SYMLINKS. The l/ symlinks created by + // containers/storage are exactly this kind of within-base symlink. 
+ let tmp = tempfile::tempdir().unwrap(); + std::fs::write(tmp.path().join("target"), b"data").unwrap(); + std::os::unix::fs::symlink("target", tmp.path().join("link")).unwrap(); + + let dir_fd = open_dir(tmp.path()); + let result = open_beneath(dir_fd.as_fd(), b"link"); + assert!( + result.is_ok(), + "symlink resolving within the base directory must be followed; got {result:?}" + ); + } + + #[test] + fn open_beneath_rejects_escape_via_symlink() { + // A symlink whose target escapes the base directory must be rejected + // (RESOLVE_BENEATH) on kernels with openat2. On old kernels the + // fallback cannot enforce this; skip in that case. + let tmp = tempfile::tempdir().unwrap(); + std::fs::create_dir(tmp.path().join("real")).unwrap(); + std::fs::write(tmp.path().join("real/passwd"), b"data").unwrap(); + // Symlink to /etc — outside the base dir. + std::os::unix::fs::symlink("/etc", tmp.path().join("link")).unwrap(); + + let dir_fd = open_dir(tmp.path()); + + // Probe openat2 availability. + let probe = rustix::fs::openat2( + dir_fd.as_fd(), + rustix::cstr!("real/passwd"), + OFlags::RDONLY | OFlags::CLOEXEC, + Mode::empty(), + ResolveFlags::BENEATH | ResolveFlags::NO_MAGICLINKS | ResolveFlags::NO_XDEV, + ); + if let Err(e) = probe + && e == Errno::NOSYS + { + // openat2 not available: skip this test. + return; + } + + // openat2 is available: symlink escaping outside the base must be rejected. 
+ let result = open_beneath(dir_fd.as_fd(), b"link/passwd"); + assert!( + result.is_err(), + "openat2 path must reject symlink escaping the base directory; got Ok" + ); + } + + // ------------------------------------------------------------------------- + // InlineData chunk tests + // ------------------------------------------------------------------------- + + #[test] + fn file_content_wire_layout() { + // Exact bytes: [0x01][5u32 LE][b"hello"] + let data = b"hello"; + let mut buf = Vec::new(); + SplitdirfdstreamWriter::new(&mut buf) + .write_inline_data(data) + .unwrap(); + + let expected: Vec = { + let mut v = Vec::new(); + v.push(0x01u8); // type byte + v.extend_from_slice(&5u32.to_le_bytes()); // length + v.extend_from_slice(b"hello"); // data + v + }; + assert_eq!(buf, expected, "InlineData wire layout mismatch"); + } + + #[test] + fn roundtrip_file_content_zero_bytes() { + // Zero-length FileContent must still be written and round-trip. + let (buf, chunks) = roundtrip_stream(&[WriteCmd::InlineData(b"")]); + // 1-byte type + 4-byte u32 length + 0 data = 5 bytes + assert_eq!(buf.len(), 5, "zero-length InlineData should be 5 bytes"); + assert_eq!(chunks, vec![DecodedChunk::InlineData(vec![])]); + } + + #[test] + fn roundtrip_file_content() { + for size in [0usize, 1, 64, 65, 4096] { + let data: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + let (buf, chunks) = roundtrip_stream(&[WriteCmd::InlineData(&data)]); + + // Wire layout: 1-byte type + 4-byte u32 length + `size` bytes + assert_eq!(buf.len(), 5 + size, "buf.len() for InlineData size={size}"); + assert_eq!( + chunks, + vec![DecodedChunk::InlineData(data)], + "decoded data for size={size}" + ); + } + } + + #[test] + fn reconstruct_file_content() { + // FileContent in a stream reconstructs to the verbatim bytes. 
+ let tmp = tempfile::tempdir().unwrap(); + let dir_fd = open_dir(tmp.path()); + + let mut buf = Vec::new(); + { + let mut w = SplitdirfdstreamWriter::new(&mut buf); + w.write_metadata(b"[").unwrap(); + w.write_inline_data(b"CONTENT").unwrap(); + w.write_metadata(b"]").unwrap(); + w.finish().unwrap(); + } + + let dirfds: &[BorrowedFd<'_>] = &[dir_fd.as_fd()]; + let mut out = Vec::new(); + let n = reconstruct(buf.as_slice(), dirfds, &mut out).unwrap(); + assert_eq!(out, b"[CONTENT]"); + assert_eq!(n, 9); + } + + // ------------------------------------------------------------------------- + // Proptest suite + // ------------------------------------------------------------------------- + + mod proptest_tests { + use super::*; + use proptest::prelude::*; + + /// Strategy: 1–4 path components of [a-z0-9]{1,8} joined by '/', + /// always stays below MAX_FILENAME_LEN and never contains `..`. + fn filename_strategy() -> impl Strategy> { + let component = prop::string::string_regex("[a-z0-9]{1,8}").unwrap(); + prop::collection::vec(component, 1..=4).prop_map(|parts| parts.join("/").into_bytes()) + } + + /// A chunk that carries its own content for proptest generation. + #[derive(Debug, Clone)] + enum TestChunk { + Metadata(Vec), + InlineData(Vec), + FileBackedData { + /// Unnormalized index in `0..4`; normalized to `% num_dirs` at test time. 
+ dirfd_index: usize, + filename: Vec, + content: Vec, + }, + } + + fn metadata_strategy() -> impl Strategy { + prop::collection::vec(any::(), 1..=4096).prop_map(TestChunk::Metadata) + } + + fn file_backed_data_strategy() -> impl Strategy { + ( + 0usize..4, + filename_strategy(), + prop::collection::vec(any::(), 0..=8192), + ) + .prop_map(|(idx, name, content)| TestChunk::FileBackedData { + dirfd_index: idx, + filename: name, + content, + }) + } + + fn inline_data_strategy() -> impl Strategy { + prop::collection::vec(any::(), 0..=4096).prop_map(TestChunk::InlineData) + } + + fn chunk_strategy() -> impl Strategy { + prop_oneof![ + metadata_strategy(), + file_backed_data_strategy(), + inline_data_strategy() + ] + } + + fn chunks_strategy() -> impl Strategy> { + prop::collection::vec(chunk_strategy(), 0..=32) + } + + proptest! { + #![proptest_config(ProptestConfig::with_cases(256))] + + #[test] + fn proptest_roundtrip(chunks in chunks_strategy()) { + use std::collections::HashMap; + + let num_dirs = 4usize; + let tmpdirs: Vec = (0..num_dirs) + .map(|_| tempfile::tempdir().unwrap()) + .collect(); + + // Normalize chunks: assign each External chunk a name of the + // form "N/" where N is a small bucket index (0..=3). + // Using only 4 buckets instead of a per-chunk unique counter + // means some chunks will intentionally share (dir_idx, name). + // + // When two chunks share a (dir_idx, name) key, the *first* + // chunk's content wins: its bytes are written to disk and both + // references reconstruct the same content, so the expected + // output stays well-defined. This exercises the pread-from- + // offset-0 restart property (same file opened twice). + #[derive(Debug)] + enum ResolvedChunk { + Metadata(Vec), + InlineData(Vec), + FileBackedData { dir_idx: usize, unique_name: Vec, content: Vec }, + } + + // Map (dir_idx, unique_name) -> first-seen content. 
+ let mut content_map: HashMap<(usize, Vec), Vec> = HashMap::new(); + + let mut resolved: Vec = Vec::with_capacity(chunks.len()); + let mut ext_counter = 0usize; + for chunk in &chunks { + match chunk { + TestChunk::Metadata(data) => { + resolved.push(ResolvedChunk::Metadata(data.clone())); + } + TestChunk::InlineData(data) => { + resolved.push(ResolvedChunk::InlineData(data.clone())); + } + TestChunk::FileBackedData { dirfd_index, filename, content } => { + let dir_idx = dirfd_index % num_dirs; + // Flatten any '/' in the generated name so a file + // path can never be a *prefix* of another file path + // (which on disk would require the prefix to be both + // a regular file and a directory, an impossible + // fixture that yields ENOTDIR). The library itself + // supports nested paths; this only keeps the + // on-disk test fixtures internally consistent. + let orig = std::str::from_utf8(filename) + .unwrap() + .replace('/', "_"); + // Bucket index cycles through 0..=3 so collisions + // happen with probability ~(1 - 3/4^k) for k chunks, + // exercising the "same file referenced twice" path. + let bucket = ext_counter % 4; + ext_counter += 1; + let unique_name = + format!("{bucket}/{orig}").into_bytes(); + // Canonical content: first writer wins. + let key = (dir_idx, unique_name.clone()); + let canonical = content_map + .entry(key) + .or_insert_with(|| content.clone()) + .clone(); + resolved.push(ResolvedChunk::FileBackedData { + dir_idx, + unique_name, + content: canonical, + }); + } + } + } + + // Materialize external files on disk (write each unique path once; + // duplicates are skipped because the file already exists). 
+ for chunk in &resolved { + if let ResolvedChunk::FileBackedData { dir_idx, unique_name, content } = chunk { + let path = tmpdirs[*dir_idx].path().join( + std::str::from_utf8(unique_name).unwrap() + ); + if !path.exists() { + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent).unwrap(); + } + std::fs::write(&path, content).unwrap(); + } + } + } + + // Build the stream and expected output. + let mut stream_buf = Vec::new(); + let mut expected_output: Vec = Vec::new(); + { + let mut w = SplitdirfdstreamWriter::new(&mut stream_buf); + for chunk in &resolved { + match chunk { + ResolvedChunk::Metadata(data) => { + w.write_metadata(data).unwrap(); + expected_output.extend_from_slice(data); + } + ResolvedChunk::InlineData(data) => { + w.write_inline_data(data).unwrap(); + expected_output.extend_from_slice(data); + } + ResolvedChunk::FileBackedData { dir_idx, unique_name, content } => { + let len = content.len() as u64; + w.write_file_backed_data(*dir_idx as u32, len, unique_name).unwrap(); + expected_output.extend_from_slice(content); + } + } + } + w.finish().unwrap(); + } + + // Open dir fds. + let dir_fds: Vec = tmpdirs + .iter() + .map(|d| open_dir(d.path())) + .collect(); + let borrowed: Vec> = dir_fds.iter().map(|fd| fd.as_fd()).collect(); + + // Verify chunk sequence matches what we wrote. 
+ { + let mut reader = SplitdirfdstreamReader::new(stream_buf.as_slice()); + let mut res_iter = resolved.iter(); + while let Some(chunk) = reader.next_chunk().unwrap() { + let expected = res_iter.next().unwrap(); + match (&chunk, expected) { + (Chunk::Metadata(data), ResolvedChunk::Metadata(exp)) => { + prop_assert_eq!(*data, exp.as_slice()); + } + (Chunk::InlineData(data), ResolvedChunk::InlineData(exp)) => { + prop_assert_eq!(*data, exp.as_slice()); + } + ( + Chunk::FileBackedData { dirfd_index, length, filename }, + ResolvedChunk::FileBackedData { dir_idx, unique_name, content }, + ) => { + prop_assert_eq!(*dirfd_index, *dir_idx as u32); + prop_assert_eq!(*length, content.len() as u64); + prop_assert_eq!(*filename, unique_name.as_slice()); + } + _ => { + return Err(TestCaseError::fail("chunk type mismatch")); + } + } + } + } + + // Verify reconstruction produces the expected concatenation. + let mut out = Vec::new(); + reconstruct(stream_buf.as_slice(), &borrowed, &mut out).unwrap(); + prop_assert_eq!(out, expected_output); + } + } + } +} diff --git a/crates/composefs-splitdirfdstream/src/transport.rs b/crates/composefs-splitdirfdstream/src/transport.rs new file mode 100644 index 00000000..cd3cbfd6 --- /dev/null +++ b/crates/composefs-splitdirfdstream/src/transport.rs @@ -0,0 +1,663 @@ +//! Source-agnostic FD-transport mechanics for the splitdirfdstream wire protocol. +//! +//! This module contains the pure, Storage/Layer-independent helpers that govern +//! *how* a set of file descriptors is partitioned into transport frames and how +//! the sparse dirfd-slot layout is computed. Both the `composefs-storage` +//! layer-transfer producer and future producers (e.g. a composefs-repo producer) +//! can share these primitives without pulling in any higher-level dependencies. +//! +//! # Public API +//! +//! | Item | Role | +//! |------|------| +//! | [`MAX_FDS_PER_FRAME`] | Enforced per-frame fd cap (240, safely below 253). | +//! 
| [`seed_from_id`] | Derive a deterministic `u64` seed from an opaque string id. | +//! | [`open_devnull`] | Open `/dev/null` for use as a dummy fd. | +//! | [`FdLimitError`] | Error from [`split_fds_into_frames`] when `more=false` overflows. | +//! | [`LayerFdLayout`] | Complete assembled wire layout from [`build_layer_fd_layout`]. | +//! | [`build_layer_fd_layout`] | Assemble the full sparse dirfd+keepalive+lifetime fd array. | +//! | [`split_fds_into_frames`] | Partition fds into frames, enforcing `more=false` cap. | +//! | [`spawn_self_reaping_producer`] | Spawn the 3-phase keepalive-lock producer task (requires `tokio` feature). | +//! +//! The producer-side seed jitter (sparse-slot placement, frame count, extra +//! lifetime fds) is computed by crate-internal helpers (`sparse_dir_slots`, +//! `compute_n_frames`, `n_extra_lifetime_fds`); the consumer never recomputes +//! it, since every `FileBackedData` chunk carries an explicit `dirfd_index`. + +use std::os::fd::OwnedFd; + +use rand::seq::SliceRandom as _; +use rand_pcg::Pcg64; +use rand_pcg::rand_core::SeedableRng as _; + +// ── Constants ──────────────────────────────────────────────────────────────── + +/// Our enforced per-frame fd cap. +/// +/// Kept safely below the hard kernel limit of 253 fds per `sendmsg(SCM_RIGHTS)` +/// call. Every transport frame carries at most this many fds. +pub const MAX_FDS_PER_FRAME: usize = 240; + +// ── Frame splitting ─────────────────────────────────────────────────────────── + +/// Partition `fds` into `n_frames` contiguous batches as evenly as possible. +/// +/// The first `fds.len() % n_frames` batches receive one extra fd (so all +/// batches are non-empty when `n_frames <= fds.len()`). Ordering is preserved: +/// concatenating the returned batches in order reconstructs the original vec. +/// +/// # Panics +/// Panics if `n_frames == 0` or `n_frames > fds.len()`. 
+pub(crate) fn split_into_frames(fds: Vec<OwnedFd>, n_frames: usize) -> Vec<Vec<OwnedFd>> {
+    let fd_count = fds.len();
+    assert!(
+        n_frames > 0 && n_frames <= fd_count,
+        "n_frames must be in 1..={fd_count}, got {n_frames}"
+    );
+    let base = fd_count / n_frames;
+    let remainder = fd_count % n_frames;
+    let mut remaining = fds.into_iter();
+    (0..n_frames)
+        .map(|i| {
+            // The first `remainder` frames absorb the leftover fds, one each.
+            let batch_size = base + usize::from(i < remainder);
+            remaining.by_ref().take(batch_size).collect()
+        })
+        .collect()
+}
+
+// ── Seed derivation ───────────────────────────────────────────────────────────
+
+/// Derive a deterministic `u64` seed from an opaque string identifier.
+///
+/// Uses the first 8 bytes of SHA-256(id) interpreted as little-endian u64.
+/// The result is stable across runs for the same id string.
+pub fn seed_from_id(id: &str) -> u64 {
+    use sha2::Digest as _;
+    let hash = sha2::Sha256::digest(id.as_bytes());
+    u64::from_le_bytes(hash[..8].try_into().expect("sha256 is at least 8 bytes"))
+}
+
+// ── Frame-count helpers ───────────────────────────────────────────────────────
+
+/// Compute the *hash-derived* minimum frame count for `fd_count` FDs keyed by `seed`.
+///
+/// - If `fd_count < 3`: returns `max(fd_count, 1)` (never more frames than FDs,
+///   but always at least one so `split_into_frames` has a valid divisor). This
+///   branch is effectively dead: the arrays assembled by
+///   `build_layer_fd_layout` always hold ≥3 FDs (pipe + ≥1 dirfd slot +
+///   keepalive).
+/// - Otherwise: `max(3, seed % (fd_count + 1))` clamped to `fd_count`.
+///
+/// This guarantees ≥3 frames whenever there are ≥3 FDs, which exercises the
+/// multi-frame path in tests while keeping the arithmetic deterministic.
+///
+/// **This is the hash-min component only.** The call site additionally enforces
+/// a per-frame cap ([`MAX_FDS_PER_FRAME`]) by taking `hash_min.max(cap_min)` where
+/// `cap_min = ceil(fd_count / MAX_FDS_PER_FRAME)`. For small layers
+/// (≤`MAX_FDS_PER_FRAME` fds) `cap_min = 1` so the hash-min always wins,
+/// preserving test-hardening behaviour.
+pub(crate) fn n_frames_for(seed: u64, fd_count: usize) -> usize {
+    if fd_count < 3 {
+        return fd_count.max(1);
+    }
+    // `seed % (fd_count + 1)` is at most `fd_count`, so the cast back to
+    // `usize` can never truncate.
+    let raw = (seed % (fd_count as u64 + 1)) as usize;
+    // `fd_count >= 3` here, so the clamp bounds are always ordered.
+    raw.clamp(3, fd_count)
+}
+
+/// Compute the actual number of transport frames for a `more=true` call.
+///
+/// Returns the larger of:
+/// - the hash-derived minimum ([`n_frames_for`]`(seed, fd_count)`, ≥3 for real
+///   layers), which exercises multi-frame paths in tests; and
+/// - `ceil(fd_count / MAX_FDS_PER_FRAME)`, the minimum needed so every frame
+///   carries at most [`MAX_FDS_PER_FRAME`] fds (kernel SCM_RIGHTS cap).
+///
+/// The result is additionally clamped to `[1, fd_count]` so `split_into_frames`
+/// never receives an out-of-range value for a non-empty fd array.
+///
+/// Proved invariant: `fd_count.div_ceil(n) <= MAX_FDS_PER_FRAME` for the
+/// returned `n`, because `n >= ceil(fd_count / MAX_FDS_PER_FRAME)`.
+pub(crate) fn compute_n_frames(seed: u64, fd_count: usize) -> usize {
+    let hash_min = n_frames_for(seed, fd_count);
+    let cap_min = fd_count.div_ceil(MAX_FDS_PER_FRAME);
+    // Upper clamp first, then the floor of 1: for `fd_count == 0` the result
+    // is 1 rather than 0, so callers never divide by zero.
+    hash_min.max(cap_min).min(fd_count).max(1)
+}
+
+// ── Dummy-fd helpers ──────────────────────────────────────────────────────────
+
+/// Open `/dev/null` as an opaque dummy fd.
+///
+/// The fd is opened read-only and close-on-exec, matching the `memfd` and
+/// keepalive-pipe fds created elsewhere in this module, so it does not leak
+/// into child processes of the sender.
+///
+/// Returns a plain [`std::io::Result`] so callers can map it to their own error type.
+pub fn open_devnull() -> std::io::Result<OwnedFd> {
+    rustix::fs::open(
+        c"/dev/null",
+        rustix::fs::OFlags::RDONLY | rustix::fs::OFlags::CLOEXEC,
+        rustix::fs::Mode::empty(),
+    )
+    .map_err(std::io::Error::from)
+}
+
+/// Open an anonymous `memfd` as an opaque dummy fd.
+///
+/// Returns a plain [`std::io::Result`] so callers can map it to their own error type.
+pub(crate) fn open_memfd() -> std::io::Result<OwnedFd> {
+    rustix::fs::memfd_create(
+        c"composefs-layer-transfer-dummy",
+        rustix::fs::MemfdFlags::CLOEXEC,
+    )
+    .map_err(std::io::Error::from)
+}
+
+// ── Sparse-slot layout ────────────────────────────────────────────────────────
+
+/// Describes the sparse dirfd-slot layout produced by [`sparse_dir_slots`].
+///
+/// The `dirfds` array passed to the consumer has length `total_slots` and contains
+/// real diff-dir fds at `real_slot_indices[i]` (for chain layer `i`) and dummy fds
+/// at all other positions.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub(crate) struct SparseLayout {
+    /// Total number of slots in the dirfds array (real + dummy).
+    pub(crate) total_slots: usize,
+    /// Ascending slot indices assigned to real layers.
+    ///
+    /// `real_slot_indices[i]` is the slot index for chain layer `i`.
+    pub(crate) real_slot_indices: Vec<u32>,
+    /// Slot indices that are dummy (complement of `real_slot_indices` in `0..total_slots`).
+    pub(crate) dummy_slot_indices: Vec<u32>,
+}
+
+/// Compute the sparse slot assignment for `n_real` real layers using `seed`.
+///
+/// This is a pure function (no I/O) that encodes the deterministic slot-placement
+/// algorithm:
+///
+/// 1. `n_dummy = (seed >> 8) % (n_real + 1) + 1` (at least 1 dummy).
+/// 2. `total_slots = n_real + n_dummy`.
+/// 3. Deterministically shuffle `0..total_slots` with a `seed`-seeded PCG RNG,
+///    take the first `n_real` as real-layer slot indices, sort them ascending.
+/// 4. Dummy slots are `0..total_slots` minus the real indices.
+///
+/// The shuffle uses [`Pcg64`], whose algorithm is stable across `rand_pcg`
+/// releases, so the placement is reproducible from the seed (useful for tests).
+/// Reproducibility is *not* a correctness requirement: the consumer is driven
+/// entirely by the explicit `dirfd_index` in each `FileBackedData` chunk and
+/// never recomputes this layout.
+///
+/// The returned [`SparseLayout`] records `total_slots`, `real_slot_indices`
+/// (ascending, length `n_real`), and `dummy_slot_indices` (ascending).
+///
+/// # Panics
+/// Panics if `total_slots` does not fit in a `u32`; slot indices travel on the
+/// wire as `u32`, so a larger layout could not be expressed anyway.
+pub(crate) fn sparse_dir_slots(n_real: usize, seed: u64) -> SparseLayout {
+    // ── 1/2: total slot count (at least 1 dummy) ─────────────────────────────
+    // The remainder is at most `n_real`, so the cast back to `usize` is lossless.
+    let n_dummy = ((seed >> 8) % (n_real as u64 + 1) + 1) as usize;
+    let total_slots = n_real + n_dummy;
+    let slot_count = u32::try_from(total_slots).expect("dirfd slot count must fit in u32");
+
+    // ── 3: select real-layer slot positions ──────────────────────────────────
+    // Deterministically shuffle all slot indices from the seed, keep the first
+    // n_real as the real-layer positions, then sort them ascending.
+    let mut rng = Pcg64::seed_from_u64(seed);
+    let mut real_indices: Vec<u32> = (0..slot_count).collect();
+    real_indices.shuffle(&mut rng);
+    real_indices.truncate(n_real);
+    real_indices.sort_unstable();
+
+    // ── 4: compute dummy indices (complement) ─────────────────────────────────
+    // `real_indices` is sorted, so membership is a binary search; walking
+    // `0..slot_count` in order keeps the dummy list ascending as well.
+    let dummy_indices: Vec<u32> = (0..slot_count)
+        .filter(|slot| real_indices.binary_search(slot).is_err())
+        .collect();
+
+    SparseLayout {
+        total_slots,
+        real_slot_indices: real_indices,
+        dummy_slot_indices: dummy_indices,
+    }
+}
+
+// ── Lifetime dummy count ──────────────────────────────────────────────────────
+
+/// Compute the number of extra opaque lifetime dummy fds for a given seed.
+///
+/// Returns 0, 1, or 2 — derived as `(seed >> 16) % 3`. These are additional
+/// dummy fds (beyond the keepalive pipe write-end) that the client must hold open
+/// until it has finished reading the splitdirfdstream, then close. Alternating
+/// `/dev/null` and `memfd` fd kinds exercise transparent client pass-through.
+pub(crate) fn n_extra_lifetime_fds(seed: u64) -> usize {
+    // The remainder is in `0..3`, so the cast cannot truncate.
+    ((seed >> 16) % 3) as usize
+}
+
+// ── High-level fd-layout helpers ─────────────────────────────────────────────
+
+/// Error returned by [`split_fds_into_frames`] when `more=false` and the fd
+/// count exceeds [`MAX_FDS_PER_FRAME`].
+///
+/// The caller must map this to their interface error and instruct the client to
+/// retry with `more=true`.
+#[derive(Debug, thiserror::Error)]
+#[error("fd count {fd_count} exceeds per-frame limit {max_per_frame}; retry with more=true")]
+pub struct FdLimitError {
+    /// Total number of fds that would be sent.
+    pub fd_count: usize,
+    /// The per-frame cap that was exceeded.
+    pub max_per_frame: usize,
+}
+
+/// Assembled FD layout ready to wire to the client in one or more transport frames.
+///
+/// This is the output of [`build_layer_fd_layout`]: a complete description of
+/// every fd that will be sent across the socket, split into two groups:
+///
+/// * The **wire region** — everything that travels to the client:
+///   - `fds_all[0]` — pipe read end (carries the `splitdirfdstream` bytes).
+///   - `fds_all[1..=dir_count]` — dirfds region (sparse: real dirs + dummies).
+///   - `fds_all[dir_count+1..]` — lifetime region (keepalive write + extras).
+/// * The **keepalive read end** — retained on the server; when the client drops
+///   the keepalive write end the server sees EOF and may release resources.
+///
+/// The `real_indices` slice maps chain layer `i` → slot index within the
+/// *dirfds region* where that layer's real diff-dir fd sits. Pass this to the
+/// producer (via [`build_layer_fd_layout`]'s return value) so it can write the
+/// correct `dirfd_index` into each `FileBackedData` chunk.
+#[derive(Debug)]
+pub struct LayerFdLayout {
+    /// All fds to send to the client, ordered as described above.
+    pub fds_all: Vec<OwnedFd>,
+    /// Number of slots in the dirfds region (`dir_count` in reply and in the
+    /// wire layout: `fds_all[1..=dir_count]`).
+    pub dir_count: u32,
+    /// Slot index within the dirfds region for each real layer (ascending).
+    ///
+    /// `real_indices[i]` is the `dirfd_index` the producer must use for chain
+    /// layer `i`.
+    pub real_indices: Vec<u32>,
+    /// Server-side read end of the keepalive pipe.
+    ///
+    /// Hold this `OwnedFd` until the client has finished consuming the layer
+    /// (or until the request is complete if keepalive is not needed for resource
+    /// management). Reading from it returns EOF once every copy of the write
+    /// end (sent to the client in `fds_all`) has been closed; that EOF is the
+    /// client's "done" signal. Dropping this read end early does *not* notify
+    /// the client — it only discards the server's ability to observe that EOF.
+    pub keepalive_read: OwnedFd,
+}
+
+/// Build the complete wire fd layout for a producer serving `n_real` real diff
+/// directories.
+///
+/// This pure-ish (only I/O: `pipe`, `/dev/null`, `memfd`) function:
+///
+/// 1. Calls [`sparse_dir_slots`] to compute the sparse placement.
+/// 2. Opens dummy fds (`/dev/null` alternating with `memfd`) for gap slots.
+/// 3. Builds the dirfds region by interleaving real and dummy fds in slot order.
+/// 4. Creates a keepalive pipe; the write end goes into `fds_all`, the read end
+///    is returned in [`LayerFdLayout::keepalive_read`].
+/// 5. Adds `n_extra_lifetime_fds(seed)` extra dummy lifetime fds.
+/// 6. Prepends the `pipe_read` fd so `fds_all[0]` is always the data pipe.
+///
+/// The caller is responsible for:
+/// * Passing the correct `real_fds` (in chain order) — the first real_fd gets
+///   slot `real_indices[0]`, the second gets `real_indices[1]`, etc.
+/// * Spawning the producer with `write_fd` (not returned here; the caller opens
+///   the pipe and passes `write_fd` to the producer separately).
+///
+/// # Arguments
+/// * `pipe_read` — Read end of the data pipe (will become `fds_all[0]`).
+/// * `real_fds` — Pre-opened real diff-directory file descriptors, one per
+///   chain layer, in chain order.
+/// * `seed` — Deterministic seed (see [`seed_from_id`]).
+///
+/// # Errors
+/// Returns `io::Error` if any dummy-fd or keepalive-pipe creation fails.
+pub fn build_layer_fd_layout( + pipe_read: OwnedFd, + real_fds: Vec, + seed: u64, +) -> std::io::Result { + let n_real = real_fds.len(); + let layout = sparse_dir_slots(n_real, seed); + let total_slots = layout.total_slots; + + // ── Build dirfds region: slot i = real fd or dummy fd ──────────────────── + let mut dirfd_region: Vec> = (0..total_slots).map(|_| None).collect(); + + // Place real fds at their assigned slots. + for (chain_idx, real_fd) in real_fds.into_iter().enumerate() { + let slot = layout.real_slot_indices[chain_idx] as usize; + dirfd_region[slot] = Some(real_fd); + } + + // Fill dummy slots, alternating /dev/null and memfd. + for (dummy_ordinal, &dummy_slot) in layout.dummy_slot_indices.iter().enumerate() { + let dummy_fd = if dummy_ordinal % 2 == 0 { + open_devnull()? + } else { + open_memfd()? + }; + dirfd_region[dummy_slot as usize] = Some(dummy_fd); + } + + // Unwrap: every slot was filled above. + let dirfd_region: Vec = dirfd_region.into_iter().map(|o| o.unwrap()).collect(); + + // ── Keepalive pipe ──────────────────────────────────────────────────────── + let (keepalive_read, keepalive_write) = + rustix::pipe::pipe_with(rustix::pipe::PipeFlags::CLOEXEC).map_err(std::io::Error::from)?; + + // ── Extra lifetime dummy fds ────────────────────────────────────────────── + let n_extra = n_extra_lifetime_fds(seed); + let mut extra_lifetime: Vec = Vec::with_capacity(n_extra); + for i in 0..n_extra { + let fd = if i % 2 == 0 { + open_devnull()? + } else { + open_memfd()? 
+ }; + extra_lifetime.push(fd); + } + + // ── Assemble fds_all ────────────────────────────────────────────────────── + // index 0 : pipe read + // index 1..=dir_count : dirfds region + // index dir_count+1 : keepalive write end + // remaining : extra lifetime dummies + let dir_count = total_slots as u32; + let mut fds_all: Vec = Vec::with_capacity(1 + total_slots + 1 + n_extra); + fds_all.push(pipe_read); + fds_all.extend(dirfd_region); + fds_all.push(keepalive_write); + fds_all.extend(extra_lifetime); + + Ok(LayerFdLayout { + fds_all, + dir_count, + real_indices: layout.real_slot_indices, + keepalive_read, + }) +} + +/// Partition a complete fd array into transport frames. +/// +/// When `more` is `true` the fds are split across `compute_n_frames(seed, n)` +/// batches (≥3 for test hardening, cap-limited to ≤[`MAX_FDS_PER_FRAME`] per +/// frame). +/// +/// When `more` is `false` all fds must fit in a single frame. If the total +/// count exceeds [`MAX_FDS_PER_FRAME`] the fds are returned intact inside an +/// [`Err(FdLimitError)`] so they can be dropped cleanly by the caller before +/// returning an error reply. +/// +/// # Returns +/// `Ok(Vec>)` — one inner vec per transport frame (1…N frames). +/// `Err(FdLimitError { fds, .. })` — the original fds are returned so the +/// caller can drop them; the error contains the count and cap for a diagnostic. 
+pub fn split_fds_into_frames( + fds: Vec, + seed: u64, + more: bool, +) -> Result>, (Vec, FdLimitError)> { + let fd_count = fds.len(); + if !more && fd_count > MAX_FDS_PER_FRAME { + return Err(( + fds, + FdLimitError { + fd_count, + max_per_frame: MAX_FDS_PER_FRAME, + }, + )); + } + let n = if more { + compute_n_frames(seed, fd_count) + } else { + 1 + }; + Ok(split_into_frames(fds, n)) +} + +// ───────────────────────────────────────────────────────────────────────────── +// Self-reaping producer task (optional: requires `tokio` feature) +// ───────────────────────────────────────────────────────────────────────────── + +/// Spawn a self-reaping blocking producer task that implements the 3-phase +/// keepalive-lock protocol used by both the repo and cstor `GetLayer` +/// implementations. +/// +/// # Phases +/// +/// 1. **Produce**: call `produce(write_fd_as_File)`. When `produce` returns the +/// write end of the data pipe drops, signalling data-EOF to the consumer. +/// 2. **Keepalive wait**: read from `keepalive_read` until EOF. The consumer +/// drops its keepalive write end after it finishes draining the data pipe, +/// so this phase acts as a rendezvous before releasing any held resources. +/// 3. **Release**: drop `guard`. For the repo path `guard` is `()` (no-op); +/// for the cstor path `guard` is a `LayerStoreLock` whose drop releases the +/// shared flock. +/// +/// # Ordering guarantee +/// +/// Phase 2 begins **after** `write_fd` is dropped (end of Phase 1), so the +/// data pipe is always at EOF before we start waiting on the keepalive. The +/// reverse would cause a deadlock: if the consumer must drain to EOF to drop +/// the keepalive but the producer never finishes, the pipe never EOF. +/// +/// # Arguments +/// +/// * `write_fd` — Write end of the data pipe; moved into the closure and +/// dropped at the end of Phase 1. +/// * `keepalive_read` — Read end of the keepalive pipe; retained until Phase 2 +/// EOF, then dropped. 
+/// * `guard` — Opaque lifetime guard released in Phase 3 (e.g. a lock). +/// * `produce` — Called with a `std::fs::File` wrapping `write_fd`. Should +/// write the complete `splitdirfdstream` into the file and return. Errors +/// are logged as warnings; a short/corrupt stream will be detected later by +/// the consumer's integrity check. +#[cfg(feature = "tokio")] +pub fn spawn_self_reaping_producer( + write_fd: OwnedFd, + keepalive_read: OwnedFd, + guard: impl Send + 'static, + produce: impl FnOnce(std::fs::File) + Send + 'static, +) { + tokio::task::spawn_blocking(move || { + // Phase 1: produce into write_fd; drop write_fd (data-pipe EOF). + { + let wf = std::fs::File::from(write_fd); + produce(wf); + // write_fd drops here → data-pipe EOF visible to consumer. + } + + // Phase 2: wait for consumer to signal completion via keepalive EOF. + let mut buf = [0u8; 64]; + loop { + match rustix::io::read(&keepalive_read, &mut buf) { + Ok(0) => break, // EOF — consumer dropped its write end + Ok(_) => {} // unexpected bytes — drain and continue + Err(rustix::io::Errno::INTR) => {} // EINTR — retry + Err(_) => break, // other error — bail + } + } + + // Phase 3: release guard (e.g. drop the shared flock). + drop(guard); + }); +} + +// ───────────────────────────────────────────────────────────────────────────── +// Tests +// ───────────────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + /// Seed for "child-layer-001" (SHA-256("child-layer-001")[0..8] as LE-u64). + /// + /// Verified: `seed_from_id("child-layer-001") == LAYER_SEED`. + const LAYER_SEED: u64 = 9_551_015_030_439_334_514; + + /// `compute_n_frames` must ensure every frame holds ≤ MAX_FDS_PER_FRAME fds + /// for a large synthetic fd count where the hash-min alone would be too small. + #[test] + fn test_compute_n_frames_caps_large_fd_count() { + // Use a seed whose hash-min is small (e.g. seed=0 → raw=0 → max(3,0)=3). 
+ let seed: u64 = 0; + // Choose fd_count large enough that 3 frames would exceed 240 each. + // ceil(1000 / 3) = 334 > 240, so cap_min must win. + let fd_count: usize = 1000; + + let n = compute_n_frames(seed, fd_count); + + // Invariant: every frame carries at most MAX_FDS_PER_FRAME fds. + let max_frame_size = fd_count.div_ceil(n); + assert!( + max_frame_size <= MAX_FDS_PER_FRAME, + "frame size {max_frame_size} exceeds cap {MAX_FDS_PER_FRAME} (n={n}, fd_count={fd_count})", + ); + + // Also verify n is large enough: ceil(1000/240) = 5. + let cap_min = fd_count.div_ceil(MAX_FDS_PER_FRAME); + assert!(n >= cap_min, "n={n} < cap_min={cap_min}",); + } + + /// For small fd counts (≤ MAX_FDS_PER_FRAME) the hash-min should still + /// dominate so test-hardening (≥3 frames for real layers) is preserved. + #[test] + fn test_compute_n_frames_small_fd_count_preserves_hash_min() { + // The existing test layer has 9 fds → cap_min=1, hash_min=4. + let n = compute_n_frames(LAYER_SEED, 9); + // EXPECTED_N_FRAMES = 4 for "child-layer-001" + assert_eq!(n, 4, "hash_min should win for small fd counts"); + + // Any fd count ≤ MAX_FDS_PER_FRAME has cap_min=1; hash_min (≥3) always wins. + for fd_count in 3..=MAX_FDS_PER_FRAME { + let n = compute_n_frames(LAYER_SEED, fd_count); + let max_frame_size = fd_count.div_ceil(n); + assert!( + max_frame_size <= MAX_FDS_PER_FRAME, + "frame size {max_frame_size} > cap for fd_count={fd_count}", + ); + } + } + + /// `compute_n_frames` invariant holds across a sweep of large fd counts. 
+ #[test] + fn test_compute_n_frames_cap_invariant_sweep() { + let seeds: &[u64] = &[0, 1, 42, LAYER_SEED, u64::MAX]; + let fd_counts: &[usize] = &[ + MAX_FDS_PER_FRAME, + MAX_FDS_PER_FRAME + 1, + MAX_FDS_PER_FRAME * 2, + MAX_FDS_PER_FRAME * 5, + 1000, + 5000, + ]; + for &seed in seeds { + for &fd_count in fd_counts { + let n = compute_n_frames(seed, fd_count); + let max_frame_size = fd_count.div_ceil(n); + assert!( + max_frame_size <= MAX_FDS_PER_FRAME, + "invariant violated: seed={seed} fd_count={fd_count} n={n} \ + max_frame_size={max_frame_size} cap={MAX_FDS_PER_FRAME}", + ); + } + } + } + + /// A non-streaming (`more=false`) call that would exceed MAX_FDS_PER_FRAME + /// must be detected by the over-cap check before the producer is spawned. + /// + /// We test the pure decision logic (`!more && fd_count > MAX_FDS_PER_FRAME`) + /// without opening real fds, keeping the test deterministic and cheap. + #[test] + fn test_more_false_over_cap_decision() { + // Verify the threshold is exactly MAX_FDS_PER_FRAME. + assert!( + !(!false && MAX_FDS_PER_FRAME > MAX_FDS_PER_FRAME), + "fd_count == cap should NOT trigger error", + ); + assert!( + !false && (MAX_FDS_PER_FRAME + 1) > MAX_FDS_PER_FRAME, + "fd_count == cap+1 MUST trigger error", + ); + + // Simulate: if !more and fd_count > MAX_FDS_PER_FRAME → error. + let should_error = + |more: bool, fd_count: usize| -> bool { !more && fd_count > MAX_FDS_PER_FRAME }; + + assert!( + !should_error(true, MAX_FDS_PER_FRAME + 1), + "more=true should never error on fd count" + ); + assert!( + !should_error(false, MAX_FDS_PER_FRAME), + "more=false at exactly cap should not error" + ); + assert!( + should_error(false, MAX_FDS_PER_FRAME + 1), + "more=false over cap must error" + ); + assert!( + should_error(false, 1000), + "more=false with 1000 fds must error" + ); + } + + /// `seed_from_id` must match the precomputed value for "child-layer-001". 
+ #[test] + fn test_seed_from_id_child_layer() { + assert_eq!( + seed_from_id("child-layer-001"), + LAYER_SEED, + "seed_from_id must match precomputed SHA-256-derived value" + ); + } + + /// `sparse_dir_slots` must produce a valid, seed-reproducible layout. + /// + /// The dummy count derives from the seed independently of placement, so for + /// "child-layer-001" (seed = 9551015030439334514): `n_dummy = (seed >> 8) % + /// (2+1) + 1 = 3`, hence `total_slots = 5`. Real-slot *placement* comes from + /// a `Pcg64` shuffle: we assert the structural invariants plus determinism + /// (same seed → same layout) rather than pinning the exact slot indices. + #[test] + fn test_sparse_dir_slots_known_layout() { + let layout = sparse_dir_slots(2, LAYER_SEED); + + // Seed-derived counts (independent of the shuffle). + assert_eq!(layout.total_slots, 5, "total_slots must be 5"); + assert_eq!(layout.real_slot_indices.len(), 2, "two real layers"); + assert_eq!(layout.dummy_slot_indices.len(), 3, "three dummy slots"); + + // Structural invariants: both lists ascending, disjoint, and together + // covering exactly 0..total_slots. + assert!( + layout.real_slot_indices.windows(2).all(|w| w[0] < w[1]), + "real_slot_indices ascending" + ); + assert!( + layout.dummy_slot_indices.windows(2).all(|w| w[0] < w[1]), + "dummy_slot_indices ascending" + ); + let mut all: Vec = layout + .real_slot_indices + .iter() + .chain(&layout.dummy_slot_indices) + .copied() + .collect(); + all.sort_unstable(); + assert_eq!( + all, + (0..layout.total_slots as u32).collect::>(), + "real and dummy slots must partition 0..total_slots" + ); + + // Determinism: the Pcg64 shuffle is reproducible from the seed. + assert_eq!( + sparse_dir_slots(2, LAYER_SEED), + layout, + "same seed must yield the same layout" + ); + } + + /// `n_extra_lifetime_fds` must return 2 for "child-layer-001"'s seed. 
+ #[test] + fn test_n_extra_lifetime_fds_child_layer() { + assert_eq!( + n_extra_lifetime_fds(LAYER_SEED), + 2, + "n_extra_lifetime_fds must be 2 for child-layer-001 seed" + ); + } +} diff --git a/crates/composefs-splitfdstream/Cargo.toml b/crates/composefs-splitfdstream/Cargo.toml deleted file mode 100644 index 1aba5988..00000000 --- a/crates/composefs-splitfdstream/Cargo.toml +++ /dev/null @@ -1,22 +0,0 @@ -[package] -name = "composefs-splitfdstream" -description = "Binary format for serializing data with external file descriptor references" -keywords = ["splitfdstream", "fd", "serialization"] -publish = false - -edition.workspace = true -license.workspace = true -readme.workspace = true -repository.workspace = true -rust-version.workspace = true -version.workspace = true - -[dependencies] -rustix = { version = "1.0.0", default-features = false, features = ["fs", "std"] } - -[dev-dependencies] -proptest = "1" -tempfile = { version = "3.8.0", default-features = false } - -[lints] -workspace = true diff --git a/crates/composefs-splitfdstream/src/lib.rs b/crates/composefs-splitfdstream/src/lib.rs deleted file mode 100644 index 1c575868..00000000 --- a/crates/composefs-splitfdstream/src/lib.rs +++ /dev/null @@ -1,1328 +0,0 @@ -//! Split file descriptor stream format for serializing binary data with external chunks. -//! -//! This module implements a binary format for representing serialized binary files -//! (tar archives, zip files, filesystem images, etc.) where data chunks can be stored -//! externally as file descriptors rather than inline in the stream. -//! -//! # Format Overview -//! -//! A splitfdstream is a sequential stream of chunks. Each chunk begins with a signed -//! 64-bit little-endian prefix that determines the chunk type: -//! -//! | Prefix Value | Meaning | -//! |--------------|---------| -//! | `< 0` | **Inline**: The next `abs(prefix)` bytes are literal data | -//! | `>= 0` | **External**: Content comes from `fd[prefix + 1]` | -//! -//! 
# Use Cases -//! -//! The splitfdstream format is designed for scenarios where: -//! -//! - Large binary files need to be transferred with some data stored externally -//! - File descriptors can be passed alongside the stream (e.g., via Unix sockets) -//! - Deduplication is desired by referencing the same external fd multiple times -//! - Zero-copy operations are possible by referencing files directly -//! -//! # Example -//! -//! ``` -//! use composefs_splitfdstream::{SplitfdstreamWriter, SplitfdstreamReader, Chunk}; -//! -//! // Write a stream with mixed inline and external chunks -//! let mut buffer = Vec::new(); -//! let mut writer = SplitfdstreamWriter::new(&mut buffer); -//! writer.write_inline(b"inline data").unwrap(); -//! writer.write_external(0).unwrap(); // Reference fd[1] -//! writer.write_inline(b"more inline").unwrap(); -//! writer.finish().unwrap(); -//! -//! // Read the stream back -//! let mut reader = SplitfdstreamReader::new(buffer.as_slice()); -//! while let Some(chunk) = reader.next_chunk().unwrap() { -//! match chunk { -//! Chunk::Inline(data) => println!("Inline: {} bytes", data.len()), -//! Chunk::External(fd_index) => println!("External: fd[{}]", fd_index + 1), -//! } -//! } -//! ``` -//! -//! # Wire Format Details -//! -//! The stream consists of a sequence of chunks with no framing header or footer. -//! Each chunk is: -//! -//! 1. An 8-byte signed little-endian integer (the prefix) -//! 2. For inline chunks only: `abs(prefix)` bytes of literal data -//! -//! External chunks have no additional data after the prefix; the content is -//! retrieved from the file descriptor array passed alongside the stream. - -// This is a library: emit diagnostics via the `log` crate (or return them), -// never by writing to the process's stdout/stderr. Genuinely-intentional -// exceptions carry a local `#[allow]` with justification. Test code is exempt. 
-#![cfg_attr(not(test), deny(clippy::print_stdout, clippy::print_stderr))] - -use std::io::{self, Read, Write}; -#[cfg(test)] -use std::os::fd::AsFd; - -/// Maximum size for an inline chunk (256 MB). -/// -/// This limit prevents denial-of-service attacks where a malicious stream -/// could specify an extremely large inline chunk size, causing unbounded -/// memory allocation. -pub const MAX_INLINE_CHUNK_SIZE: usize = 256 * 1024 * 1024; - -/// A chunk read from a splitfdstream. -/// -/// Chunks are either inline data embedded in the stream, or references to -/// external file descriptors that should be read separately. -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum Chunk<'a> { - /// Inline data embedded directly in the stream. - Inline(&'a [u8]), - - /// Reference to an external file descriptor. - /// - /// The value is the fd index (0-based), meaning the actual fd is at - /// position `fd_index + 1` in the fd array (fd\[0\] is typically the - /// stream itself). - External(u32), -} - -/// Writer for building a splitfdstream. -/// -/// The writer encodes inline data and external fd references into the -/// splitfdstream binary format. 
-/// -/// # Example -/// -/// ``` -/// use composefs_splitfdstream::{SplitfdstreamWriter, SplitfdstreamReader, Chunk}; -/// -/// // Write a stream with mixed inline and external chunks -/// let mut buffer = Vec::new(); -/// let mut writer = SplitfdstreamWriter::new(&mut buffer); -/// writer.write_inline(b"inline data").unwrap(); -/// writer.write_external(0).unwrap(); // Reference fd[1] -/// writer.write_inline(b"more inline").unwrap(); -/// writer.finish().unwrap(); -/// -/// // Read the stream back -/// let mut reader = SplitfdstreamReader::new(buffer.as_slice()); -/// while let Some(chunk) = reader.next_chunk().unwrap() { -/// match chunk { -/// Chunk::Inline(data) => println!("Inline: {} bytes", data.len()), -/// Chunk::External(fd_index) => println!("External: fd[{}]", fd_index + 1), -/// } -/// } -/// ``` -#[derive(Debug)] -pub struct SplitfdstreamWriter { - writer: W, -} - -impl SplitfdstreamWriter { - /// Create a new splitfdstream writer wrapping the given writer. - pub fn new(writer: W) -> Self { - Self { writer } - } - - /// Write inline data to the stream. - /// - /// The data is prefixed with a negative i64 indicating the length, - /// followed by the literal bytes. - /// - /// # Errors - /// - /// Returns an error if writing to the underlying writer fails. - pub fn write_inline(&mut self, data: &[u8]) -> io::Result<()> { - if data.is_empty() { - return Ok(()); - } - - // Prefix is negative length - let len = data.len() as i64; - let prefix = -len; - self.writer.write_all(&prefix.to_le_bytes())?; - self.writer.write_all(data)?; - Ok(()) - } - - /// Write an external fd reference to the stream. - /// - /// The fd_index is the 0-based index into the fd array. The actual - /// file descriptor is at position `fd_index + 1` (since fd\[0\] is - /// typically the stream itself). - /// - /// # Errors - /// - /// Returns an error if writing to the underlying writer fails. 
- pub fn write_external(&mut self, fd_index: u32) -> io::Result<()> { - // Prefix is fd_index (non-negative), actual fd is at fd_index + 1 - let prefix = fd_index as i64; - self.writer.write_all(&prefix.to_le_bytes())?; - Ok(()) - } - - /// Finish writing and return the underlying writer. - /// - /// This consumes the writer and returns the underlying `Write` impl. - pub fn finish(self) -> io::Result { - Ok(self.writer) - } -} - -/// Reader for parsing a splitfdstream. -/// -/// The reader parses the binary format and yields chunks that are either -/// inline data or references to external file descriptors. -/// -/// # Example -/// -/// ``` -/// use composefs_splitfdstream::{SplitfdstreamReader, Chunk}; -/// -/// let data = vec![ -/// // Inline chunk: prefix = -5, then 5 bytes -/// 0xfb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // -5 as i64 LE -/// b'h', b'e', b'l', b'l', b'o', -/// ]; -/// -/// let mut reader = SplitfdstreamReader::new(data.as_slice()); -/// let chunk = reader.next_chunk().unwrap().unwrap(); -/// assert_eq!(chunk, Chunk::Inline(b"hello")); -/// ``` -#[derive(Debug)] -pub struct SplitfdstreamReader { - reader: R, - /// Buffer for reading inline data - buffer: Vec, -} - -impl SplitfdstreamReader { - /// Create a new splitfdstream reader wrapping the given reader. - pub fn new(reader: R) -> Self { - Self { - reader, - buffer: Vec::new(), - } - } - - /// Consume this reader, returning the underlying reader. - pub fn into_inner(self) -> R { - self.reader - } - - /// Read the next chunk from the stream. - /// - /// Returns `Ok(None)` when the stream is exhausted. 
- /// - /// # Errors - /// - /// Returns an error if: - /// - Reading from the underlying reader fails - /// - The stream contains invalid data (e.g., inline size exceeds maximum) - pub fn next_chunk(&mut self) -> io::Result>> { - // Read the 8-byte prefix - let mut prefix_bytes = [0u8; 8]; - match self.reader.read_exact(&mut prefix_bytes) { - Ok(()) => {} - Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), - Err(e) => return Err(e), - } - - let prefix = i64::from_le_bytes(prefix_bytes); - - if prefix < 0 { - // Inline chunk: read abs(prefix) bytes - let len = (-prefix) as usize; - if len > MAX_INLINE_CHUNK_SIZE { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - format!( - "inline chunk size {} exceeds maximum allowed size {}", - len, MAX_INLINE_CHUNK_SIZE - ), - )); - } - self.buffer.clear(); - self.buffer.resize(len, 0); - self.reader.read_exact(&mut self.buffer)?; - Ok(Some(Chunk::Inline(&self.buffer))) - } else { - // External chunk: prefix is the fd index - Ok(Some(Chunk::External(prefix as u32))) - } - } -} - -#[cfg(test)] -/// A helper that reads a file from offset 0 using positional reads. -/// -/// This allows reading the same file multiple times without seeking, -/// since each read specifies its position explicitly. -#[derive(Debug)] -struct ReadAtReader<'a, F> { - file: &'a F, - offset: u64, -} - -#[cfg(test)] -impl<'a, F: AsFd> ReadAtReader<'a, F> { - fn new(file: &'a F) -> Self { - Self { file, offset: 0 } - } -} - -#[cfg(test)] -impl Read for ReadAtReader<'_, F> { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - let n = rustix::io::pread(self.file, buf, self.offset)?; - self.offset += n as u64; - Ok(n) - } -} - -#[cfg(test)] -/// A `Read` adapter that reconstructs a byte stream from a splitfdstream. -/// -/// This struct implements `Read` by combining inline chunks and external file -/// descriptor content into a contiguous byte stream. 
-/// -/// External files are read using positional read (pread/read_at), so the -/// same file can be referenced multiple times in the splitfdstream without -/// needing to reopen or seek it. -#[derive(Debug)] -struct SplitfdstreamAsRead<'files, R: Read> { - reader: SplitfdstreamReader, - files: &'files [std::fs::File], - /// Buffer for inline data (partially consumed) - inline_buffer: Vec, - /// Position within inline_buffer - inline_pos: usize, - /// Current external file being read (if any) - current_external: Option>, -} - -#[cfg(test)] -impl<'files, R: Read> SplitfdstreamAsRead<'files, R> { - /// Create a new reader from a splitfdstream and files. - /// - /// The `files` slice provides the external files referenced by the - /// splitfdstream. Each external chunk at index N reads from `files[N]`. - fn new(splitfdstream: R, files: &'files [std::fs::File]) -> Self { - Self { - reader: SplitfdstreamReader::new(splitfdstream), - files, - inline_buffer: Vec::new(), - inline_pos: 0, - current_external: None, - } - } -} - -#[cfg(test)] -impl Read for SplitfdstreamAsRead<'_, R> { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - // First, drain any buffered inline data - if self.inline_pos < self.inline_buffer.len() { - let remaining = &self.inline_buffer[self.inline_pos..]; - let n = buf.len().min(remaining.len()); - buf[..n].copy_from_slice(&remaining[..n]); - self.inline_pos += n; - return Ok(n); - } - - // Next, drain current external file if any - if let Some(ref mut ext) = self.current_external { - let n = ext.read(buf)?; - if n > 0 { - return Ok(n); - } - // External exhausted, move to next chunk - self.current_external = None; - } - - // Get next chunk from splitfdstream - match self.reader.next_chunk()? 
{ - None => Ok(0), // EOF - Some(Chunk::Inline(data)) => { - let n = buf.len().min(data.len()); - buf[..n].copy_from_slice(&data[..n]); - if n < data.len() { - // Buffer remaining data for next read - self.inline_buffer.clear(); - self.inline_buffer.extend_from_slice(&data[n..]); - self.inline_pos = 0; - } - Ok(n) - } - Some(Chunk::External(idx)) => { - let idx = idx as usize; - if idx >= self.files.len() { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - format!( - "external chunk references fd index {} but only {} files provided", - idx, - self.files.len() - ), - )); - } - self.current_external = Some(ReadAtReader::new(&self.files[idx])); - // Recurse to read from the new external - self.read(buf) - } - } - } -} - -#[cfg(test)] -/// Reconstruct a stream from splitfdstream + file descriptors. -/// -/// This function reads a splitfdstream and writes the reconstructed data to `output`. -/// Inline chunks are written directly, while external chunks are read from the -/// corresponding file descriptors in `files`. -fn reconstruct(splitfdstream: R, files: &[std::fs::File], output: &mut W) -> io::Result -where - R: Read, - W: Write, -{ - let mut reader = SplitfdstreamReader::new(splitfdstream); - let mut bytes_written = 0u64; - - while let Some(chunk) = reader.next_chunk()? { - match chunk { - Chunk::Inline(data) => { - output.write_all(data)?; - bytes_written += data.len() as u64; - } - Chunk::External(idx) => { - let file = files.get(idx as usize).ok_or_else(|| { - io::Error::new( - io::ErrorKind::InvalidData, - format!( - "external chunk references fd index {} but only {} files provided", - idx, - files.len() - ), - ) - })?; - let mut ext_reader = ReadAtReader::new(file); - let copied = io::copy(&mut ext_reader, output)?; - bytes_written += copied; - } - } - } - - Ok(bytes_written) -} - -#[cfg(test)] -mod tests { - use super::*; - - /// Helper to write and read back chunks, verifying round-trip. 
- fn roundtrip_chunks( - inline_chunks: &[&[u8]], - external_indices: &[u32], - interleave: bool, - ) -> Vec<(bool, Vec, u32)> { - let mut buffer = Vec::new(); - { - let mut writer = SplitfdstreamWriter::new(&mut buffer); - - if interleave { - let max_len = inline_chunks.len().max(external_indices.len()); - for i in 0..max_len { - if i < inline_chunks.len() { - writer.write_inline(inline_chunks[i]).unwrap(); - } - if i < external_indices.len() { - writer.write_external(external_indices[i]).unwrap(); - } - } - } else { - for chunk in inline_chunks { - writer.write_inline(chunk).unwrap(); - } - for &idx in external_indices { - writer.write_external(idx).unwrap(); - } - } - - writer.finish().unwrap(); - } - - // Read back - let mut reader = SplitfdstreamReader::new(buffer.as_slice()); - let mut results = Vec::new(); - - while let Some(chunk) = reader.next_chunk().unwrap() { - match chunk { - Chunk::Inline(data) => { - results.push((true, data.to_vec(), 0)); - } - Chunk::External(idx) => { - results.push((false, Vec::new(), idx)); - } - } - } - - results - } - - #[test] - fn test_empty_stream() { - let buffer: Vec = Vec::new(); - let mut reader = SplitfdstreamReader::new(buffer.as_slice()); - assert!(reader.next_chunk().unwrap().is_none()); - } - - #[test] - fn test_only_inline_chunks() { - let chunks: &[&[u8]] = &[b"hello", b"world", b"test"]; - let results = roundtrip_chunks(chunks, &[], false); - - assert_eq!(results.len(), 3); - assert!(results[0].0); // is_inline - assert_eq!(results[0].1, b"hello"); - assert!(results[1].0); - assert_eq!(results[1].1, b"world"); - assert!(results[2].0); - assert_eq!(results[2].1, b"test"); - } - - #[test] - fn test_only_external_chunks() { - let results = roundtrip_chunks(&[], &[0, 5, 42, 100], false); - - assert_eq!(results.len(), 4); - assert!(!results[0].0); // is_external - assert_eq!(results[0].2, 0); - assert!(!results[1].0); - assert_eq!(results[1].2, 5); - assert!(!results[2].0); - assert_eq!(results[2].2, 42); - 
assert!(!results[3].0); - assert_eq!(results[3].2, 100); - } - - #[test] - fn test_mixed_inline_external() { - let inline: &[&[u8]] = &[b"header", b"middle", b"footer"]; - let external: &[u32] = &[0, 1, 2]; - let results = roundtrip_chunks(inline, external, true); - - // Interleaved: inline0, ext0, inline1, ext1, inline2, ext2 - assert_eq!(results.len(), 6); - - assert!(results[0].0); - assert_eq!(results[0].1, b"header"); - - assert!(!results[1].0); - assert_eq!(results[1].2, 0); - - assert!(results[2].0); - assert_eq!(results[2].1, b"middle"); - - assert!(!results[3].0); - assert_eq!(results[3].2, 1); - - assert!(results[4].0); - assert_eq!(results[4].1, b"footer"); - - assert!(!results[5].0); - assert_eq!(results[5].2, 2); - } - - #[test] - fn test_large_inline_chunk() { - // Test with a large chunk to verify i64 handles sizes correctly - let large_data: Vec = (0..100_000).map(|i| (i % 256) as u8).collect(); - - let mut buffer = Vec::new(); - { - let mut writer = SplitfdstreamWriter::new(&mut buffer); - writer.write_inline(&large_data).unwrap(); - writer.finish().unwrap(); - } - - let mut reader = SplitfdstreamReader::new(buffer.as_slice()); - let chunk = reader.next_chunk().unwrap().unwrap(); - - match chunk { - Chunk::Inline(data) => { - assert_eq!(data.len(), 100_000); - assert_eq!(data, large_data.as_slice()); - } - Chunk::External(_) => panic!("Expected inline chunk"), - } - - assert!(reader.next_chunk().unwrap().is_none()); - } - - #[test] - fn test_empty_inline_chunk_is_skipped() { - // Empty inline writes should be no-ops - let mut buffer = Vec::new(); - { - let mut writer = SplitfdstreamWriter::new(&mut buffer); - writer.write_inline(b"").unwrap(); - writer.write_inline(b"actual").unwrap(); - writer.write_inline(b"").unwrap(); - writer.finish().unwrap(); - } - - let mut reader = SplitfdstreamReader::new(buffer.as_slice()); - let chunk = reader.next_chunk().unwrap().unwrap(); - assert_eq!(chunk, Chunk::Inline(b"actual")); - 
assert!(reader.next_chunk().unwrap().is_none()); - } - - #[test] - fn test_boundary_sizes() { - // Test various boundary sizes - let sizes = [ - 1, 7, 8, 9, 255, 256, 257, 1023, 1024, 1025, 4095, 4096, 4097, - ]; - - for &size in &sizes { - let data: Vec = (0..size).map(|i| (i % 256) as u8).collect(); - - let mut buffer = Vec::new(); - { - let mut writer = SplitfdstreamWriter::new(&mut buffer); - writer.write_inline(&data).unwrap(); - writer.finish().unwrap(); - } - - // Verify buffer structure: 8-byte prefix + data - assert_eq!(buffer.len(), 8 + size); - - // Verify prefix is correct negative value - let prefix = i64::from_le_bytes(buffer[..8].try_into().unwrap()); - assert_eq!(prefix, -(size as i64)); - - // Read back and verify - let mut reader = SplitfdstreamReader::new(buffer.as_slice()); - let chunk = reader.next_chunk().unwrap().unwrap(); - match chunk { - Chunk::Inline(read_data) => { - assert_eq!(read_data.len(), size); - assert_eq!(read_data, data.as_slice()); - } - Chunk::External(_) => panic!("Expected inline"), - } - } - } - - #[test] - fn test_external_fd_index_zero() { - // fd_index 0 means fd[1], test this boundary - let mut buffer = Vec::new(); - { - let mut writer = SplitfdstreamWriter::new(&mut buffer); - writer.write_external(0).unwrap(); - writer.finish().unwrap(); - } - - // Should be exactly 8 bytes (the prefix) - assert_eq!(buffer.len(), 8); - - // Prefix should be 0 - let prefix = i64::from_le_bytes(buffer[..8].try_into().unwrap()); - assert_eq!(prefix, 0); - - let mut reader = SplitfdstreamReader::new(buffer.as_slice()); - let chunk = reader.next_chunk().unwrap().unwrap(); - assert_eq!(chunk, Chunk::External(0)); - } - - #[test] - fn test_large_fd_index() { - // Test with maximum u32 fd index - let mut buffer = Vec::new(); - { - let mut writer = SplitfdstreamWriter::new(&mut buffer); - writer.write_external(u32::MAX).unwrap(); - writer.finish().unwrap(); - } - - let mut reader = SplitfdstreamReader::new(buffer.as_slice()); - let chunk = 
reader.next_chunk().unwrap().unwrap(); - assert_eq!(chunk, Chunk::External(u32::MAX)); - } - - #[test] - fn test_single_byte_inline() { - let results = roundtrip_chunks(&[b"x"], &[], false); - assert_eq!(results.len(), 1); - assert!(results[0].0); - assert_eq!(results[0].1, b"x"); - } - - #[test] - fn test_writer_finish_returns_writer() { - let mut buffer = Vec::new(); - let writer = SplitfdstreamWriter::new(&mut buffer); - let returned = writer.finish().unwrap(); - - // Verify we got the writer back (can write to it) - returned.len(); // Just verify it's accessible - } - - #[test] - fn test_chunk_equality() { - assert_eq!(Chunk::Inline(b"test"), Chunk::Inline(b"test")); - assert_ne!(Chunk::Inline(b"test"), Chunk::Inline(b"other")); - assert_eq!(Chunk::External(5), Chunk::External(5)); - assert_ne!(Chunk::External(5), Chunk::External(6)); - assert_ne!(Chunk::Inline(b"test"), Chunk::External(0)); - } - - #[test] - fn test_many_small_chunks() { - // Stress test with many small chunks - let chunks: Vec> = (0..1000).map(|i| vec![i as u8; (i % 10) + 1]).collect(); - let chunk_refs: Vec<&[u8]> = chunks.iter().map(|c| c.as_slice()).collect(); - - let mut buffer = Vec::new(); - { - let mut writer = SplitfdstreamWriter::new(&mut buffer); - for chunk in &chunk_refs { - writer.write_inline(chunk).unwrap(); - } - writer.finish().unwrap(); - } - - let mut reader = SplitfdstreamReader::new(buffer.as_slice()); - let mut count = 0; - while let Some(chunk) = reader.next_chunk().unwrap() { - match chunk { - Chunk::Inline(data) => { - assert_eq!(data, chunk_refs[count]); - count += 1; - } - Chunk::External(_) => panic!("Unexpected external"), - } - } - assert_eq!(count, 1000); - } - - #[test] - fn test_alternating_inline_external() { - let mut buffer = Vec::new(); - { - let mut writer = SplitfdstreamWriter::new(&mut buffer); - for i in 0..50 { - writer.write_inline(&[i as u8]).unwrap(); - writer.write_external(i as u32).unwrap(); - } - writer.finish().unwrap(); - } - - let mut reader 
= SplitfdstreamReader::new(buffer.as_slice()); - let mut inline_count = 0; - let mut external_count = 0; - - while let Some(chunk) = reader.next_chunk().unwrap() { - match chunk { - Chunk::Inline(data) => { - assert_eq!(data.len(), 1); - assert_eq!(data[0], inline_count as u8); - inline_count += 1; - } - Chunk::External(idx) => { - assert_eq!(idx, external_count as u32); - external_count += 1; - } - } - } - - assert_eq!(inline_count, 50); - assert_eq!(external_count, 50); - } - - #[test] - fn test_truncated_prefix_returns_none() { - // Partial prefix (less than 8 bytes) at end of stream - let buffer = vec![0x01, 0x02, 0x03]; // Only 3 bytes - - let mut reader = SplitfdstreamReader::new(buffer.as_slice()); - // Should return None (EOF) since we can't read a complete prefix - assert!(reader.next_chunk().unwrap().is_none()); - } - - #[test] - fn test_truncated_data_is_error() { - // Valid prefix saying 100 bytes, but only 10 bytes of data - let mut buffer = Vec::new(); - let prefix: i64 = -100; // Inline, 100 bytes - buffer.extend_from_slice(&prefix.to_le_bytes()); - buffer.extend_from_slice(&[0u8; 10]); // Only 10 bytes - - let mut reader = SplitfdstreamReader::new(buffer.as_slice()); - let result = reader.next_chunk(); - assert!(result.is_err()); - } - - #[test] - fn test_inline_chunk_size_limit() { - // Attempt to read a chunk that exceeds MAX_INLINE_CHUNK_SIZE - let mut buffer = Vec::new(); - // Request 512 MB (exceeds 256 MB limit) - let prefix: i64 = -(512 * 1024 * 1024); - buffer.extend_from_slice(&prefix.to_le_bytes()); - - let mut reader = SplitfdstreamReader::new(buffer.as_slice()); - let result = reader.next_chunk(); - assert!(result.is_err()); - let err = result.unwrap_err(); - assert_eq!(err.kind(), io::ErrorKind::InvalidData); - assert!(err.to_string().contains("exceeds maximum")); - } - - mod reconstruct_tests { - use super::*; - use std::io::Cursor; - use tempfile::NamedTempFile; - - #[test] - fn test_reconstruct_inline_only() { - // Create a 
splitfdstream with only inline data - let mut stream_buf = Vec::new(); - { - let mut writer = SplitfdstreamWriter::new(&mut stream_buf); - writer.write_inline(b"Hello, ").unwrap(); - writer.write_inline(b"world!").unwrap(); - writer.finish().unwrap(); - } - - let mut output = Vec::new(); - let files: &[std::fs::File] = &[]; - let bytes = reconstruct(stream_buf.as_slice(), files, &mut output).unwrap(); - - assert_eq!(output, b"Hello, world!"); - assert_eq!(bytes, 13); - } - - #[test] - fn test_reconstruct_empty_stream() { - let stream_buf: Vec = Vec::new(); - let mut output = Vec::new(); - let files: &[std::fs::File] = &[]; - let bytes = reconstruct(stream_buf.as_slice(), files, &mut output).unwrap(); - - assert!(output.is_empty()); - assert_eq!(bytes, 0); - } - - #[test] - fn test_reconstruct_with_external_fds() { - // Create temp files with known content - let mut file0 = NamedTempFile::new().unwrap(); - let mut file1 = NamedTempFile::new().unwrap(); - - use std::io::Write; - file0.write_all(b"EXTERNAL0").unwrap(); - file1.write_all(b"EXTERNAL1").unwrap(); - - // Create splitfdstream that references these files - let mut stream_buf = Vec::new(); - { - let mut writer = SplitfdstreamWriter::new(&mut stream_buf); - writer.write_inline(b"[start]").unwrap(); - writer.write_external(0).unwrap(); // Reference first fd - writer.write_inline(b"[mid]").unwrap(); - writer.write_external(1).unwrap(); // Reference second fd - writer.write_inline(b"[end]").unwrap(); - writer.finish().unwrap(); - } - - // Open files for reading - let f0 = std::fs::File::open(file0.path()).unwrap(); - let f1 = std::fs::File::open(file1.path()).unwrap(); - let files = [f0, f1]; - - let mut output = Vec::new(); - let bytes = reconstruct(stream_buf.as_slice(), &files, &mut output).unwrap(); - - assert_eq!(output, b"[start]EXTERNAL0[mid]EXTERNAL1[end]"); - assert_eq!(bytes, output.len() as u64); - } - - #[test] - fn test_reconstruct_external_fd_out_of_bounds() { - // Create splitfdstream referencing 
fd index 5, but only provide 1 file - let mut stream_buf = Vec::new(); - { - let mut writer = SplitfdstreamWriter::new(&mut stream_buf); - writer.write_external(5).unwrap(); // Out of bounds - writer.finish().unwrap(); - } - - let file = NamedTempFile::new().unwrap(); - let f = std::fs::File::open(file.path()).unwrap(); - let files = [f]; - - let mut output = Vec::new(); - let result = reconstruct(stream_buf.as_slice(), &files, &mut output); - - assert!(result.is_err()); - let err = result.unwrap_err(); - assert_eq!(err.kind(), std::io::ErrorKind::InvalidData); - assert!(err.to_string().contains("fd index 5")); - } - - #[test] - fn test_reconstruct_large_external_file() { - // Create a larger external file to test efficient copying - let mut file = NamedTempFile::new().unwrap(); - let large_data: Vec = (0..100_000).map(|i| (i % 256) as u8).collect(); - - use std::io::Write; - file.write_all(&large_data).unwrap(); - - let mut stream_buf = Vec::new(); - { - let mut writer = SplitfdstreamWriter::new(&mut stream_buf); - writer.write_inline(b"header").unwrap(); - writer.write_external(0).unwrap(); - writer.write_inline(b"footer").unwrap(); - writer.finish().unwrap(); - } - - let f = std::fs::File::open(file.path()).unwrap(); - let files = [f]; - - let mut output = Vec::new(); - let bytes = reconstruct(stream_buf.as_slice(), &files, &mut output).unwrap(); - - // Verify header + large data + footer - assert_eq!(&output[..6], b"header"); - assert_eq!(&output[6..100_006], large_data.as_slice()); - assert_eq!(&output[100_006..], b"footer"); - assert_eq!(bytes, 6 + 100_000 + 6); - } - - #[test] - fn test_reconstruct_same_fd_multiple_times() { - // Test that the same fd can be referenced multiple times - let mut file = NamedTempFile::new().unwrap(); - - use std::io::Write; - file.write_all(b"REPEATED").unwrap(); - - let mut stream_buf = Vec::new(); - { - let mut writer = SplitfdstreamWriter::new(&mut stream_buf); - writer.write_external(0).unwrap(); - 
writer.write_inline(b"-").unwrap(); - writer.write_external(0).unwrap(); - writer.write_inline(b"-").unwrap(); - writer.write_external(0).unwrap(); - writer.finish().unwrap(); - } - - let f = std::fs::File::open(file.path()).unwrap(); - let files = [f]; - - let mut output = Vec::new(); - let bytes = reconstruct(stream_buf.as_slice(), &files, &mut output).unwrap(); - - // Each reference uses pread from offset 0, so each reads from start - assert_eq!(output, b"REPEATED-REPEATED-REPEATED"); - assert_eq!(bytes, 26); - } - - #[test] - fn test_into_inner() { - let data = vec![1, 2, 3, 4]; - let cursor = Cursor::new(data.clone()); - let reader = SplitfdstreamReader::new(cursor); - let inner = reader.into_inner(); - assert_eq!(inner.into_inner(), data); - } - } - - mod proptest_tests { - use super::*; - use proptest::prelude::*; - use std::io::Write; - use tempfile::NamedTempFile; - - /// Represents a chunk in the stream for testing purposes. - #[derive(Debug, Clone)] - enum TestChunk { - Inline(Vec), - External { fd_index: usize, content: Vec }, - } - - /// Strategy for generating inline chunk data. - /// Bounded to reasonable sizes to keep tests fast. - fn inline_data_strategy() -> impl Strategy> { - prop::collection::vec(any::(), 0..4096) - } - - /// Strategy for generating external chunk content. - fn external_content_strategy() -> impl Strategy> { - prop::collection::vec(any::(), 0..8192) - } - - /// Strategy for generating a single test chunk. - /// The fd_index is relative and will be resolved during test execution. - fn chunk_strategy() -> impl Strategy { - prop_oneof![ - inline_data_strategy().prop_map(TestChunk::Inline), - (0..16usize, external_content_strategy()).prop_map(|(idx, content)| { - TestChunk::External { - fd_index: idx, - content, - } - }) - ] - } - - /// Strategy for generating a sequence of chunks. 
- fn chunks_strategy() -> impl Strategy> { - prop::collection::vec(chunk_strategy(), 0..64) - } - - /// Execute a roundtrip test: write chunks, read them back, verify reconstruction. - fn roundtrip_test(chunks: Vec) -> Result<(), TestCaseError> { - // Collect unique external contents and assign fd indices - let mut external_contents: Vec> = Vec::new(); - - // Normalize fd_indices to actual file indices - let normalized_chunks: Vec = chunks - .into_iter() - .filter_map(|chunk| match chunk { - TestChunk::Inline(data) => { - // Skip empty inline chunks (writer skips them) - if data.is_empty() { - None - } else { - Some(TestChunk::Inline(data)) - } - } - TestChunk::External { fd_index, content } => { - // Map fd_index to actual position in external_contents - let actual_index = fd_index % 8.max(1); // Limit to 8 files max - - // Ensure we have enough files - while external_contents.len() <= actual_index { - external_contents.push(Vec::new()); - } - - // Store content (may overwrite previous) - external_contents[actual_index] = content.clone(); - - Some(TestChunk::External { - fd_index: actual_index, - content, - }) - } - }) - .collect(); - - // Create temp files for external data - let mut temp_files: Vec = Vec::new(); - for content in &external_contents { - let mut f = NamedTempFile::new().map_err(|e| TestCaseError::fail(e.to_string()))?; - f.write_all(content) - .map_err(|e| TestCaseError::fail(e.to_string()))?; - temp_files.push(f); - } - - // Write the splitfdstream - let mut stream_buf = Vec::new(); - { - let mut writer = SplitfdstreamWriter::new(&mut stream_buf); - for chunk in &normalized_chunks { - match chunk { - TestChunk::Inline(data) => { - writer - .write_inline(data) - .map_err(|e| TestCaseError::fail(e.to_string()))?; - } - TestChunk::External { fd_index, .. 
} => { - writer - .write_external(*fd_index as u32) - .map_err(|e| TestCaseError::fail(e.to_string()))?; - } - } - } - writer - .finish() - .map_err(|e| TestCaseError::fail(e.to_string()))?; - } - - // Read back and verify chunk sequence - let mut reader = SplitfdstreamReader::new(stream_buf.as_slice()); - let mut read_chunks = Vec::new(); - while let Some(chunk) = reader - .next_chunk() - .map_err(|e| TestCaseError::fail(e.to_string()))? - { - match chunk { - Chunk::Inline(data) => read_chunks.push(TestChunk::Inline(data.to_vec())), - Chunk::External(idx) => read_chunks.push(TestChunk::External { - fd_index: idx as usize, - content: external_contents - .get(idx as usize) - .cloned() - .unwrap_or_default(), - }), - } - } - - // Verify we got the same number of chunks - prop_assert_eq!( - normalized_chunks.len(), - read_chunks.len(), - "Chunk count mismatch" - ); - - // Verify each chunk matches - for (i, (expected, actual)) in - normalized_chunks.iter().zip(read_chunks.iter()).enumerate() - { - match (expected, actual) { - (TestChunk::Inline(expected_data), TestChunk::Inline(actual_data)) => { - prop_assert_eq!( - expected_data, - actual_data, - "Inline chunk {} data mismatch", - i - ); - } - ( - TestChunk::External { fd_index: ei, .. }, - TestChunk::External { fd_index: ai, .. 
}, - ) => { - prop_assert_eq!(ei, ai, "External chunk {} fd_index mismatch", i); - } - _ => { - return Err(TestCaseError::fail(format!( - "Chunk {} type mismatch: expected {:?}, got {:?}", - i, expected, actual - ))); - } - } - } - - // Verify reconstruction produces correct output - let files: Vec = temp_files - .iter() - .map(|f| std::fs::File::open(f.path())) - .collect::, _>>() - .map_err(|e| TestCaseError::fail(e.to_string()))?; - - let mut output = Vec::new(); - reconstruct(stream_buf.as_slice(), &files, &mut output) - .map_err(|e| TestCaseError::fail(e.to_string()))?; - - // Build expected output - let mut expected_output = Vec::new(); - for chunk in &normalized_chunks { - match chunk { - TestChunk::Inline(data) => expected_output.extend_from_slice(data), - TestChunk::External { fd_index, .. } => { - expected_output.extend_from_slice(&external_contents[*fd_index]); - } - } - } - - prop_assert_eq!(output, expected_output, "Reconstructed output mismatch"); - - Ok(()) - } - - proptest! 
{ - #![proptest_config(ProptestConfig::with_cases(256))] - - #[test] - fn test_arbitrary_chunk_sequences(chunks in chunks_strategy()) { - roundtrip_test(chunks)?; - } - - #[test] - fn test_inline_only_sequences( - chunks in prop::collection::vec(inline_data_strategy(), 0..32) - ) { - let test_chunks: Vec = chunks.into_iter() - .map(TestChunk::Inline) - .collect(); - roundtrip_test(test_chunks)?; - } - - #[test] - fn test_external_only_sequences( - chunks in prop::collection::vec( - (0..8usize, external_content_strategy()), - 0..32 - ) - ) { - let test_chunks: Vec = chunks.into_iter() - .map(|(idx, content)| TestChunk::External { fd_index: idx, content }) - .collect(); - roundtrip_test(test_chunks)?; - } - - #[test] - fn test_alternating_pattern( - inline_data in prop::collection::vec(inline_data_strategy(), 1..16), - external_data in prop::collection::vec(external_content_strategy(), 1..16) - ) { - let mut test_chunks = Vec::new(); - let max_len = inline_data.len().max(external_data.len()); - for i in 0..max_len { - if i < inline_data.len() { - test_chunks.push(TestChunk::Inline(inline_data[i].clone())); - } - if i < external_data.len() { - test_chunks.push(TestChunk::External { - fd_index: i % 8, - content: external_data[i].clone(), - }); - } - } - roundtrip_test(test_chunks)?; - } - - #[test] - fn test_same_fd_multiple_references( - content in external_content_strategy(), - ref_count in 1..10usize - ) { - let mut test_chunks = Vec::new(); - for _ in 0..ref_count { - test_chunks.push(TestChunk::External { - fd_index: 0, - content: content.clone(), - }); - } - roundtrip_test(test_chunks)?; - } - - #[test] - fn test_varying_chunk_sizes( - small in prop::collection::vec(any::(), 0..16), - medium in prop::collection::vec(any::(), 256..1024), - large in prop::collection::vec(any::(), 4096..8192) - ) { - let test_chunks = vec![ - TestChunk::Inline(small), - TestChunk::Inline(medium), - TestChunk::Inline(large), - ]; - roundtrip_test(test_chunks)?; - } - } - - /// Test 
SplitfdstreamAsRead with property-based approach - mod tar_reader_tests { - use super::*; - - proptest! { - #![proptest_config(ProptestConfig::with_cases(128))] - - #[test] - fn test_tar_reader_matches_reconstruct(chunks in chunks_strategy()) { - tar_reader_test(chunks)?; - } - } - - fn tar_reader_test(chunks: Vec) -> Result<(), TestCaseError> { - // Collect unique external contents and assign fd indices - let mut external_contents: Vec> = Vec::new(); - - // Normalize fd_indices to actual file indices - let normalized_chunks: Vec = chunks - .into_iter() - .filter_map(|chunk| match chunk { - TestChunk::Inline(data) => { - if data.is_empty() { - None - } else { - Some(TestChunk::Inline(data)) - } - } - TestChunk::External { fd_index, content } => { - let actual_index = fd_index % 8.max(1); - while external_contents.len() <= actual_index { - external_contents.push(Vec::new()); - } - external_contents[actual_index] = content.clone(); - Some(TestChunk::External { - fd_index: actual_index, - content, - }) - } - }) - .collect(); - - // Create temp files for external data - let mut temp_files: Vec = Vec::new(); - for content in &external_contents { - let mut f = - NamedTempFile::new().map_err(|e| TestCaseError::fail(e.to_string()))?; - f.write_all(content) - .map_err(|e| TestCaseError::fail(e.to_string()))?; - temp_files.push(f); - } - - // Write the splitfdstream - let mut stream_buf = Vec::new(); - { - let mut writer = SplitfdstreamWriter::new(&mut stream_buf); - for chunk in &normalized_chunks { - match chunk { - TestChunk::Inline(data) => { - writer - .write_inline(data) - .map_err(|e| TestCaseError::fail(e.to_string()))?; - } - TestChunk::External { fd_index, .. 
} => { - writer - .write_external(*fd_index as u32) - .map_err(|e| TestCaseError::fail(e.to_string()))?; - } - } - } - writer - .finish() - .map_err(|e| TestCaseError::fail(e.to_string()))?; - } - - // Open files for reading - let files: Vec = temp_files - .iter() - .map(|f| std::fs::File::open(f.path())) - .collect::, _>>() - .map_err(|e| TestCaseError::fail(e.to_string()))?; - - // Read via SplitfdstreamAsRead - let mut tar_reader = SplitfdstreamAsRead::new(stream_buf.as_slice(), &files); - let mut tar_output = Vec::new(); - std::io::copy(&mut tar_reader, &mut tar_output) - .map_err(|e| TestCaseError::fail(e.to_string()))?; - - // Read via reconstruct - let mut reconstruct_output = Vec::new(); - reconstruct(stream_buf.as_slice(), &files, &mut reconstruct_output) - .map_err(|e| TestCaseError::fail(e.to_string()))?; - - prop_assert_eq!( - tar_output, - reconstruct_output, - "SplitfdstreamAsRead and reconstruct outputs differ" - ); - - Ok(()) - } - } - } -} diff --git a/crates/composefs-storage/Cargo.toml b/crates/composefs-storage/Cargo.toml index 904551a5..6d0b037c 100644 --- a/crates/composefs-storage/Cargo.toml +++ b/crates/composefs-storage/Cargo.toml @@ -17,25 +17,29 @@ cap-std = { version = "4.0", default-features = false } cap-std-ext = { version = "4.0", default-features = false } crc = { version = "3.0", default-features = false } flate2 = { version = "1.0", default-features = false, features = ["rust_backend"] } -jsonrpc-fdpass = { workspace = true, optional = true } +composefs-splitdirfdstream = { path = "../composefs-splitdirfdstream", version = "0.7.0", optional = true, features = ["tokio"] } oci-spec = { version = "0.9", default-features = false, features = ["image"] } -rustix = { version = "1.0", default-features = false, features = ["fs", "std", "process", "thread"] } +rustix = { version = "1.0", default-features = false, features = ["fs", "std", "process", "thread", "pipe"] } serde = { version = "1.0", default-features = false, features = 
["derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } -sha2 = { version = "0.10", default-features = false, features = ["std"] } tar-core = "0.1.0" thiserror = { version = "2.0", default-features = false } -tokio = { version = "1.40", default-features = false, features = ["rt", "net", "sync"], optional = true } +tokio = { version = "1.40", default-features = false, features = ["rt", "rt-multi-thread", "net", "sync", "macros"], optional = true } toml = { version = "0.8", default-features = false, features = ["parse"] } tracing = { version = "0.1", default-features = false, optional = true } +zlink = { workspace = true, optional = true } zstd = { version = "0.13", default-features = false } [features] default = [] -userns-helper = ["dep:jsonrpc-fdpass", "dep:tokio", "dep:tracing"] +# Layer transfer over zlink + splitdirfdstream. Enables both the in-process +# transport and the rootless `podman unshare` helper (init_if_helper / +# StorageProxy). +layer-transfer = ["dep:zlink", "dep:composefs-splitdirfdstream", "dep:tokio", "dep:tracing"] [dev-dependencies] tempfile = { version = "3.8", default-features = false } +tokio = { version = "1.40", default-features = false, features = ["rt", "rt-multi-thread", "macros", "net", "sync"] } [lints] workspace = true diff --git a/crates/composefs-storage/src/cstor_service.rs b/crates/composefs-storage/src/cstor_service.rs new file mode 100644 index 00000000..da5a34e5 --- /dev/null +++ b/crates/composefs-storage/src/cstor_service.rs @@ -0,0 +1,1096 @@ +//! Stateless `org.composefs.Oci` service backed by containers-storage. +//! +//! Exposes `GetInfo` and `GetLayer` from the `org.composefs.Oci` varlink +//! interface, using the `containers-storage` layer store as the source. +#![allow(missing_docs)] +//! +//! # Design +//! +//! The service is **stateless**: there is no handle map and no `Open`/`Close` +//! lifecycle. `GetLayer` receives all necessary parameters inline via +//! 
[`GetLayerParams`]: +//! +//! ```text +//! GetLayer(handle, params: GetLayerParams) → streaming frames +//! ``` +//! +//! Where `params.storage` carries both the `storage_path` and `layer_id`. +//! +//! # Lock safety +//! +//! The service acquires a **shared flock** on `overlay-layers/layers.lock` +//! before opening any diff-directory file descriptors. This prevents a +//! concurrent `podman rmi` (which holds an exclusive lock) from deleting a +//! layer's diff directory while we are streaming it. +//! +//! The lock is held by a **self-reaping producer task** (a `spawn_blocking` +//! closure) across three phases: +//! +//! 1. **Phase 1**: Produce the `splitdirfdstream` bytes into the data pipe +//! write end; drop the write end (data-pipe EOF to consumer). +//! 2. **Phase 2**: Read the keepalive-pipe read end to EOF, blocking until +//! the consumer drops its keepalive write end (signalling it is finished). +//! 3. **Phase 3**: Drop the lock guard (releases the flock) and all other +//! resources. +//! +//! **Critical ordering**: Phase 2 comes AFTER the write end drops (Phase 1 +//! end), otherwise the data pipe never reaches EOF and the consumer deadlocks. +//! +//! On process death (SIGKILL, etc.) the kernel closes all fds, automatically +//! releasing the flock — so the userns-helper subprocess can be killed +//! safely without leaving containers-storage in a locked state. 
+ +use std::collections::HashMap; +use std::io::Write; +use std::os::fd::OwnedFd; + +use composefs_splitdirfdstream::{ + MAX_FDS_PER_FRAME, SplitdirfdstreamWriter, build_layer_fd_layout, seed_from_id, + spawn_self_reaping_producer, split_fds_into_frames, +}; +use serde::{Deserialize, Serialize}; + +use crate::layer::Layer; +use crate::lock::LayerStoreLock; +use crate::storage::Storage; +use crate::tar_split::{TarSplitFdStream, TarSplitItem as TarItem}; + +// ── Wire types (compatible with composefs-oci::varlink_types) ──────────────── +// +// These definitions are independent of composefs-oci to avoid a dependency +// cycle. They are wire-format-compatible (same field names, same JSON +// encoding) with the types in composefs-oci::varlink_types. + +/// Locator for a layer inside a containers-storage store. +/// +/// This is the wire-format counterpart of `composefs_oci::varlink_types::StorageLocator`. +#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] +pub struct StorageLocator { + /// Absolute path to the containers-storage root directory. + pub storage_path: String, + /// The layer ID within that storage root. + pub layer_id: String, +} + +/// Parameters for the `GetLayer` method. +/// +/// This is the wire-format counterpart of `composefs_oci::varlink_types::GetLayerParams`. +/// The `diff_id` field is ignored by this service; only `storage` is used. +#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] +pub struct GetLayerParams { + /// Ignored by the cstor service (used by the repo service). + pub diff_id: Option, + /// Location of the layer in containers-storage (required). + pub storage: Option, +} + +/// Reply from `GetInfo`. +#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] +pub struct GetInfoReply { + /// Capability tokens supported by this service. + pub features: Vec, +} + +/// Reply from `GetLayer`. 
+#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)] +pub struct GetLayerReply { + /// Number of diff-directory fd slots in the logical FD array. + pub dir_count: u32, +} + +/// Errors from the `org.composefs.Oci` interface (cstor service subset). +#[derive(Debug, zlink::ReplyError, zlink::introspect::ReplyError)] +#[zlink(interface = "org.composefs.Oci")] +pub enum CstorOciError { + /// The repository could not be found or opened. + RepoNotFound { message: String }, + /// The given handle is ignored (stateless service). + InvalidHandle { handle: u64 }, + /// The named OCI image/reference does not exist. + NoSuchImage { image: String }, + /// Internal error. + InternalError { message: String }, + /// Layer not found. + NoSuchLayer { diff_id: String }, + /// Malformed digest/diff-id. + InvalidDigest { message: String }, + /// Diff-id mismatch (unused in this service, kept for interface compat). + DiffIdMismatch { expected: String, actual: String }, + /// Malformed request. + InvalidRequest { message: String }, + /// Too many fds for a single non-streaming reply. + FdLimitExceeded { fd_count: u64, max_per_frame: u64 }, +} + +// ── CstorLayerService ───────────────────────────────────────────────────────── + +/// Stateless service implementing the `org.composefs.Oci` interface, backed +/// by a containers-storage layer store. +/// +/// Only `GetInfo` and `GetLayer` are implemented; all other methods return +/// a `MethodNotFound` error from zlink. +#[derive(Debug, Default)] +pub struct CstorLayerService; + +// ── Producer helpers ────────────────────────────────────────────────────────── + +/// Produce a `splitdirfdstream`-encoded byte sequence for `layer` in +/// `storage` into `out`. +/// +/// `link_id_to_dirfd` maps each layer's link ID to the *sparse* dirfd slot +/// index where its pre-opened diff-directory fd was placed. 
+pub(crate) fn produce_splitdirfdstream( + storage: &Storage, + layer: &Layer, + link_id_to_dirfd: &HashMap, + real_indices: &std::collections::HashSet, + out: W, +) -> crate::Result<()> { + let mut stream = TarSplitFdStream::new(storage, layer)?; + let mut writer = SplitdirfdstreamWriter::new(out); + const ZERO_PADDING: [u8; 512] = [0u8; 512]; + let mut prev_pad: usize = 0; + + while let Some(item) = stream.next()? { + match item { + TarItem::Segment(bytes) => { + let stripped = if prev_pad <= bytes.len() { + &bytes[prev_pad..] + } else { + &[][..] + }; + if !stripped.is_empty() { + writer.write_metadata(stripped).map_err(|e| { + crate::error::StorageError::TarSplitError(format!( + "splitdirfdstream write_inline: {e}" + )) + })?; + } + prev_pad = 0; + } + TarItem::FileContent { + fd, + size, + name, + link_id, + } => { + let relpath = name.strip_prefix("./").unwrap_or(&name); + let dirfd_index = *link_id_to_dirfd.get(link_id.as_str()).ok_or_else(|| { + crate::error::StorageError::TarSplitError(format!( + "link_id {link_id} not found in dirfd map" + )) + })?; + assert!( + real_indices.contains(&dirfd_index), + "BUG: producer emitting dirfd_index={dirfd_index} for {relpath:?} \ + (link_id={link_id:?}) but that slot is NOT a real diff-dir slot; \ + real_indices={real_indices:?} link_id_to_dirfd={link_id_to_dirfd:?}" + ); + let stat = rustix::fs::fstat(&fd).map_err(|e| { + crate::error::StorageError::TarSplitError(format!( + "fstat failed for {relpath}: {e}" + )) + })?; + let world_readable = stat.st_mode & 0o004 != 0; + if world_readable { + writer + .write_file_backed_data(dirfd_index, size, relpath.as_bytes()) + .map_err(|e| { + crate::error::StorageError::TarSplitError(format!( + "splitdirfdstream write_external {relpath}: {e}" + )) + })?; + } else { + let mut buf = vec![0u8; size as usize]; + let mut off = 0u64; + while (off as usize) < buf.len() { + let n = + rustix::io::pread(&fd, &mut buf[off as usize..], off).map_err(|e| { + 
crate::error::StorageError::TarSplitError(format!( + "pread failed for {relpath}: {e}" + )) + })?; + if n == 0 { + return Err(crate::error::StorageError::TarSplitError(format!( + "unexpected EOF reading {relpath}" + ))); + } + off += n as u64; + } + writer.write_inline_data(&buf).map_err(|e| { + crate::error::StorageError::TarSplitError(format!( + "write_file_content {relpath}: {e}" + )) + })?; + } + let pad = (size.next_multiple_of(512) - size) as usize; + if pad > 0 { + writer.write_metadata(&ZERO_PADDING[..pad]).map_err(|e| { + crate::error::StorageError::TarSplitError(format!( + "splitdirfdstream write_inline padding: {e}" + )) + })?; + } + prev_pad = pad; + } + } + } + + writer.finish().map_err(|e| { + crate::error::StorageError::TarSplitError(format!("splitdirfdstream finish: {e}")) + })?; + Ok(()) +} + +/// Open one diff-directory fd per layer in the chain, in chain order. +/// +/// Returns `(real_fds, link_ids)` where `real_fds[i]` is the diff-dir fd for +/// `link_ids[i]`. The sparse-slot placement is **not** done here; callers +/// should pass `real_fds` to [`composefs_splitdirfdstream::build_layer_fd_layout`] +/// which fills dummy slots using the canonical ordinal-parity rule. The +/// returned `link_ids` can then be zipped with `layout.real_indices` to build +/// the `link_id → dirfd_slot_index` map required by [`produce_splitdirfdstream`]. +pub(crate) fn open_layer_diff_fds( + storage: &Storage, + layer: &Layer, +) -> crate::Result<(Vec, Vec)> { + use std::os::fd::AsFd as _; + let chain = crate::layer::Layer::open(storage, layer.id())? 
+ .layer_chain(storage) + .map_err(|e| crate::error::StorageError::Io(std::io::Error::other(format!("{e:#}"))))?; + + let mut real_fds = Vec::with_capacity(chain.len()); + let mut link_ids = Vec::with_capacity(chain.len()); + + for l in &chain { + let fd = rustix::io::dup(l.diff_dir().as_fd()) + .map_err(|e| crate::error::StorageError::Io(std::io::Error::from(e)))?; + real_fds.push(fd); + link_ids.push(l.link_id().to_string()); + } + + Ok((real_fds, link_ids)) +} + +// ── zlink service impl ──────────────────────────────────────────────────────── + +mod service_impl { + #![allow(missing_docs)] + + use super::{ + CstorLayerService, CstorOciError, GetInfoReply, GetLayerParams, GetLayerReply, Layer, + LayerStoreLock, MAX_FDS_PER_FRAME, Storage, build_layer_fd_layout, open_layer_diff_fds, + produce_splitdirfdstream, seed_from_id, spawn_self_reaping_producer, split_fds_into_frames, + }; + + #[zlink::service( + interface = "org.composefs.Oci", + vendor = "org.composefs", + product = "composefs-storage", + version = env!("CARGO_PKG_VERSION"), + url = "https://github.com/composefs/composefs-rs" + )] + impl CstorLayerService { + /// Advertise the capabilities of this service. + async fn get_info(&self) -> std::result::Result { + Ok(GetInfoReply { + features: vec![ + "splitdirfdstream-v0".into(), + "source-containers-storage".into(), + "read-only".into(), + ], + }) + } + + /// Stream a layer from containers-storage as a `splitdirfdstream`. + /// + /// `params.storage` must be set; `params.diff_id` is ignored. + /// + /// The self-reaping producer task: + /// Phase 1 — produce stream into data pipe; drop write end (data EOF). + /// Phase 2 — wait for keepalive-pipe EOF (consumer finished). + /// Phase 3 — drop lock guard + all resources. 
+ #[zlink(more, return_fds)] + async fn get_layer( + &self, + more: bool, + handle: u64, + params: GetLayerParams, + #[zlink(fds)] _fds: Vec, + ) -> impl zlink::futures_util::Stream< + Item = ( + std::result::Result, CstorOciError>, + Vec, + ), + > + Unpin { + use zlink::futures_util::stream::{self, StreamExt as _}; + + let _ = handle; // stateless — handle is ignored + + type StreamItem = ( + std::result::Result, CstorOciError>, + Vec, + ); + + macro_rules! err_stream { + ($e:expr) => { + return stream::iter(std::iter::once::((Err($e), vec![]))) + .left_stream() + }; + } + + // Extract storage locator (required). + let locator = match params.storage { + Some(l) => l, + None => err_stream!(CstorOciError::InvalidRequest { + message: "GetLayer: params.storage is required for the cstor service".into(), + }), + }; + let storage_path = locator.storage_path; + let layer_id = locator.layer_id; + + // Open storage and locate the layer. + let storage = match Storage::open(&storage_path) { + Ok(s) => s, + Err(e) => err_stream!(CstorOciError::RepoNotFound { + message: format!("{e:#}"), + }), + }; + let layer = match Layer::open(&storage, &layer_id) { + Ok(l) => l, + Err(_) => err_stream!(CstorOciError::NoSuchLayer { + diff_id: layer_id.clone(), + }), + }; + + // Seed for sparse layout (derived from layer_id). + let seed = seed_from_id(&layer_id); + + // Acquire shared lock FIRST (before opening diff-dir fds). + // Lock → open ordering is atomic w.r.t. concurrent `podman rmi`. + let lock: LayerStoreLock = match storage.lock_layers_shared() { + Ok(l) => l, + Err(e) => err_stream!(CstorOciError::InternalError { + message: format!("lock_layers_shared: {e:#}"), + }), + }; + + // Open one real diff-dir fd per chain layer (in chain order). + // Sparse placement and dummy-fd insertion are delegated to + // build_layer_fd_layout so that both paths share one implementation. 
+ let (real_fds, link_ids) = match open_layer_diff_fds(&storage, &layer) { + Ok(pair) => pair, + Err(e) => err_stream!(CstorOciError::InternalError { + message: format!("open diff dirs: {e:#}"), + }), + }; + + // Create the data pipe. + let (read_fd, write_fd) = + match rustix::pipe::pipe_with(rustix::pipe::PipeFlags::CLOEXEC) { + Ok(p) => p, + Err(e) => err_stream!(CstorOciError::InternalError { + message: format!("pipe: {e}"), + }), + }; + + // Build the full sparse fd layout (dirfds region + keepalive + extras). + // This is the same call the repo path makes, ensuring one canonical + // dummy-fd parity rule (ordinal-based, not slot-value-based). + let layout = match build_layer_fd_layout(read_fd, real_fds, seed) { + Ok(l) => l, + Err(e) => err_stream!(CstorOciError::InternalError { + message: format!("build_layer_fd_layout: {e}"), + }), + }; + let dir_count = layout.dir_count; + // Build the link_id → sparse-slot-index map from the layout's real_indices. + // layout.real_indices[i] is the slot where chain layer i's real fd was placed. + let link_id_to_dirfd: std::collections::HashMap = link_ids + .into_iter() + .zip(layout.real_indices.iter().copied()) + .collect(); + let real_indices: std::collections::HashSet = + layout.real_indices.iter().copied().collect(); + + // Split into transport frames (with more=false guard). + let (batches, n_frames) = match split_fds_into_frames(layout.fds_all, seed, more) { + Ok(b) => { + let n = b.len(); + (b, n) + } + Err((_fds, e)) => { + err_stream!(CstorOciError::FdLimitExceeded { + fd_count: e.fd_count as u64, + max_per_frame: MAX_FDS_PER_FRAME as u64, + }) + } + }; + + // Spawn the 3-phase self-reaping producer task via the shared helper. + // Phase 1: produce; drop write_fd (data-pipe EOF). + // Phase 2: keepalive-pipe EOF wait (consumer finished). + // Phase 3: drop lock (releases shared flock). 
+ let keepalive_read = layout.keepalive_read; + spawn_self_reaping_producer(write_fd, keepalive_read, lock, move |wf| { + if let Err(e) = + produce_splitdirfdstream(&storage, &layer, &link_id_to_dirfd, &real_indices, wf) + { + tracing::warn!("cstor producer error: {e:#}"); + } + }); + + stream::iter(batches.into_iter().enumerate().map(move |(i, batch)| { + let is_last = i == n_frames - 1; + ( + Ok(zlink::Reply::new(Some(GetLayerReply { dir_count })) + .set_continues(Some(!is_last))), + batch, + ) + })) + .right_stream() + } + } +} + +// ── In-process transport ────────────────────────────────────────────────────── + +/// Handle to an in-process [`CstorLayerService`] server. +/// +/// Drop or call [`InProcessServer::shutdown`] when done. +#[derive(Debug)] +pub struct InProcessServer { + shutdown: Option>, + thread: Option>, +} + +impl InProcessServer { + /// Signal the server to stop and reap its thread. + pub async fn shutdown(mut self) { + if let Some(tx) = self.shutdown.take() { + let _ = tx.send(()); + } + if let Some(thread) = self.thread.take() { + let _ = tokio::task::spawn_blocking(move || thread.join()).await; + } + } +} + +impl Drop for InProcessServer { + fn drop(&mut self) { + if let Some(tx) = self.shutdown.take() { + let _ = tx.send(()); + } + } +} + +/// Spawn a [`CstorLayerService`] in-process over a Unix socket pair. +/// +/// Returns a connected `zlink::unix::Connection` and an [`InProcessServer`] +/// handle for shutdown. 
+pub fn spawn_in_process( + service: CstorLayerService, +) -> std::io::Result<(zlink::unix::Connection, InProcessServer)> { + let (client_std, server_std) = std::os::unix::net::UnixStream::pair()?; + client_std.set_nonblocking(true)?; + server_std.set_nonblocking(true)?; + + let client_stream = tokio::net::UnixStream::from_std(client_std)?; + let client_zlink = zlink::unix::Stream::from(client_stream); + let client_conn = zlink::Connection::from(client_zlink); + + let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel::<()>(); + + let handle = std::thread::Builder::new() + .name("cstor-oci-server".into()) + .spawn(move || { + let rt = match tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + { + Ok(rt) => rt, + Err(e) => { + tracing::error!("CstorLayerService server runtime build failed: {e:#?}"); + return; + } + }; + let local = tokio::task::LocalSet::new(); + local.block_on(&rt, async move { + let server_stream = match tokio::net::UnixStream::from_std(server_std) { + Ok(s) => s, + Err(e) => { + tracing::error!( + "CstorLayerService server stream conversion failed: {e:#?}" + ); + return; + } + }; + let server_zlink = zlink::unix::Stream::from(server_stream); + let listener = zlink::ReadyListener::new(server_zlink); + let server = zlink::Server::new(listener, service); + + tokio::select! { + res = server.run() => { + if let Err(e) = res { + tracing::warn!("CstorLayerService in-process server error: {e:#?}"); + } + } + _ = shutdown_rx => { + tracing::trace!("CstorLayerService in-process server received stop signal"); + } + } + }); + })?; + + Ok(( + client_conn, + InProcessServer { + shutdown: Some(shutdown_tx), + thread: Some(handle), + }, + )) +} + +/// Serve the `CstorLayerService` on `socket` until the peer disconnects. +/// +/// Intended for the userns helper subprocess: the helper is shut down by the +/// parent (`cfsctl`) `kill()`ing it once the import is done (see +/// [`crate::userns_helper::StorageProxy`]). 
Abnormal parent death is covered by +/// the helper's `PR_SET_PDEATHSIG` net (see `init_if_helper`), so there is no +/// need to detect peer close here. +pub fn serve_on_socket_blocking(socket: std::os::unix::net::UnixStream) -> std::io::Result<()> { + socket.set_nonblocking(true)?; + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?; + + let local = tokio::task::LocalSet::new(); + local.block_on(&rt, async move { + // `from_std` must be called inside the runtime context. + let stream = tokio::net::UnixStream::from_std(socket)?; + let zs = zlink::unix::Stream::from(stream); + let listener = zlink::ReadyListener::new(zs); + let server = zlink::Server::new(listener, CstorLayerService); + server + .run() + .await + .map_err(|e| std::io::Error::other(format!("{e:#?}")))?; + Ok(()) + }) +} + +// ── Tests ───────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::io::Read as _; + use std::os::fd::{AsFd as _, BorrowedFd}; + + use composefs_splitdirfdstream::{SplitdirfdstreamReader, build_layer_fd_layout, seed_from_id}; + use flate2::Compression; + use flate2::write::GzEncoder; + use tempfile::TempDir; + + use super::*; + use crate::layer::Layer; + use crate::storage::Storage; + + // ── Shared test fixture ────────────────────────────────────────────────── + + fn create_two_layer_mock(root: &std::path::Path) -> Storage { + for d in ["overlay", "overlay-layers", "overlay-images"] { + std::fs::create_dir_all(root.join(d)).unwrap(); + } + + let child_id = "child-layer-001"; + let child_link = "CHILDLINKID000000000000000"; + let child_diff = root.join("overlay").join(child_id).join("diff"); + std::fs::create_dir_all(child_diff.join("etc")).unwrap(); + std::fs::write(child_diff.join("etc/child.txt"), b"child content!").unwrap(); + std::fs::write(root.join("overlay").join(child_id).join("link"), child_link).unwrap(); + + let parent_id = "parent-layer-001"; + let 
parent_link = "PARENTLINKID000000000000000"; + let parent_diff = root.join("overlay").join(parent_id).join("diff"); + std::fs::create_dir_all(parent_diff.join("etc")).unwrap(); + std::fs::write(parent_diff.join("etc/parent.txt"), b"parent content!").unwrap(); + std::fs::write( + root.join("overlay").join(parent_id).join("link"), + parent_link, + ) + .unwrap(); + + std::fs::write( + root.join("overlay").join(child_id).join("lower"), + format!("l/{}", parent_link), + ) + .unwrap(); + + let l_dir = root.join("overlay").join("l"); + std::fs::create_dir_all(&l_dir).unwrap(); + std::os::unix::fs::symlink(format!("../{}/diff", child_id), l_dir.join(child_link)) + .unwrap(); + std::os::unix::fs::symlink(format!("../{}/diff", parent_id), l_dir.join(parent_link)) + .unwrap(); + + let ndjson = concat!( + r#"{"type":1,"name":"./etc/child.txt","size":14}"#, + "\n", + r#"{"type":1,"name":"./etc/parent.txt","size":15}"#, + "\n", + ); + let gz_path = root + .join("overlay-layers") + .join(format!("{}.tar-split.gz", child_id)); + let gz_file = std::fs::File::create(&gz_path).unwrap(); + let mut encoder = GzEncoder::new(gz_file, Compression::default()); + encoder.write_all(ndjson.as_bytes()).unwrap(); + encoder.finish().unwrap(); + + Storage::open(root).unwrap() + } + + // Deterministic constants for "child-layer-001". + // + // These derive from the layer-id seed independently of the sparse-slot + // *placement* (which comes from a seeded shuffle): the number of dummy slots, + // extra lifetime fds, and transport frames are all fixed for this seed. The + // real-layer slot indices themselves are read from the layout at runtime. + const LAYER_SEED: u64 = 9_551_015_030_439_334_514; + const EXPECTED_DIR_COUNT: u32 = 5; + const EXPECTED_N_EXTRA_LIFETIME: usize = 2; + const EXPECTED_N_FRAMES: usize = 4; + + /// Build the fd layout for a layer under test and return the full layout + /// plus the `link_id → sparse-slot-index` map. 
+ /// + /// Uses a `/dev/null` fd as a throwaway pipe_read (tests don't drain the + /// data pipe; they call `produce_splitdirfdstream` separately into a vec). + fn build_test_layout( + storage: &Storage, + layer: &Layer, + seed: u64, + ) -> ( + composefs_splitdirfdstream::LayerFdLayout, + HashMap, + std::collections::HashSet, + ) { + let (real_fds, link_ids) = open_layer_diff_fds(storage, layer).unwrap(); + let dummy_pipe_read = composefs_splitdirfdstream::open_devnull().unwrap(); + let layout = build_layer_fd_layout(dummy_pipe_read, real_fds, seed).unwrap(); + let real_indices: std::collections::HashSet = + layout.real_indices.iter().copied().collect(); + let map: HashMap = link_ids + .into_iter() + .zip(layout.real_indices.iter().copied()) + .collect(); + (layout, map, real_indices) + } + + // ── C1: sparse layout + producer ──────────────────────────────────────── + + #[test] + fn test_produce_splitdirfdstream_chunk_sequence() { + let tmp = TempDir::new().unwrap(); + let storage = create_two_layer_mock(tmp.path()); + let layer = Layer::open(&storage, "child-layer-001").unwrap(); + + let seed = seed_from_id("child-layer-001"); + assert_eq!(seed, LAYER_SEED, "seed must match precomputed value"); + + let (layout, link_id_to_dirfd, real_indices) = build_test_layout(&storage, &layer, seed); + + let child_link = "CHILDLINKID000000000000000"; + let parent_link = "PARENTLINKID000000000000000"; + + // The real-layer slot positions come from the seeded shuffle; rather + // than pin them to specific values, take them from the layout map and + // assert the produced chunks reference those same slots. + let child_idx = link_id_to_dirfd[child_link]; + let parent_idx = link_id_to_dirfd[parent_link]; + + // dir_count = total slot count including dummies. 
+ assert_eq!(layout.dir_count, EXPECTED_DIR_COUNT); + assert!(child_idx < EXPECTED_DIR_COUNT && parent_idx < EXPECTED_DIR_COUNT); + assert_ne!(child_idx, parent_idx, "real layers occupy distinct slots"); + + let mut buf = Vec::::new(); + produce_splitdirfdstream(&storage, &layer, &link_id_to_dirfd, &real_indices, &mut buf) + .unwrap(); + + let mut reader = SplitdirfdstreamReader::new(buf.as_slice()); + + // child.txt → FileBackedData at child's sparse slot. + match reader.next_chunk().unwrap().expect("chunk 1") { + composefs_splitdirfdstream::Chunk::FileBackedData { + dirfd_index, + length, + filename, + } => { + assert_eq!(dirfd_index, child_idx); + assert_eq!(length, 14); + assert_eq!(filename, b"etc/child.txt"); + } + other => panic!("expected FileBackedData, got {other:?}"), + } + + // 498 bytes padding. + match reader.next_chunk().unwrap().expect("chunk 2") { + composefs_splitdirfdstream::Chunk::Metadata(data) => { + assert_eq!(data.len(), 498); + assert!(data.iter().all(|&b| b == 0)); + } + other => panic!("expected Metadata (padding), got {other:?}"), + } + + // parent.txt → FileBackedData at parent's sparse slot. + match reader.next_chunk().unwrap().expect("chunk 3") { + composefs_splitdirfdstream::Chunk::FileBackedData { + dirfd_index, + length, + filename, + } => { + assert_eq!(dirfd_index, parent_idx); + assert_eq!(length, 15); + assert_eq!(filename, b"etc/parent.txt"); + } + other => panic!("expected FileBackedData, got {other:?}"), + } + + // 497 bytes padding. 
+ match reader.next_chunk().unwrap().expect("chunk 4") { + composefs_splitdirfdstream::Chunk::Metadata(data) => { + assert_eq!(data.len(), 497); + assert!(data.iter().all(|&b| b == 0)); + } + other => panic!("expected Metadata (padding), got {other:?}"), + } + + assert!(reader.next_chunk().unwrap().is_none(), "expected EOF"); + } + + #[test] + fn test_non_world_readable_file_uses_file_content() { + use std::os::unix::fs::PermissionsExt as _; + + let tmp = TempDir::new().unwrap(); + let storage = create_two_layer_mock(tmp.path()); + + let child_txt = tmp + .path() + .join("overlay/child-layer-001/diff/etc/child.txt"); + std::fs::set_permissions(&child_txt, std::fs::Permissions::from_mode(0o640)).unwrap(); + + let layer = Layer::open(&storage, "child-layer-001").unwrap(); + let seed = seed_from_id("child-layer-001"); + let (_, link_id_to_dirfd, real_indices) = build_test_layout(&storage, &layer, seed); + let mut buf = Vec::::new(); + produce_splitdirfdstream(&storage, &layer, &link_id_to_dirfd, &real_indices, &mut buf) + .unwrap(); + + let mut reader = SplitdirfdstreamReader::new(buf.as_slice()); + let mut saw_file_content = false; + while let Some(chunk) = reader.next_chunk().unwrap() { + match chunk { + composefs_splitdirfdstream::Chunk::Metadata(_) => {} + composefs_splitdirfdstream::Chunk::InlineData(data) => { + assert_eq!(data, b"child content!"); + saw_file_content = true; + break; + } + composefs_splitdirfdstream::Chunk::FileBackedData { filename, .. 
} => { + panic!( + "non-world-readable must produce InlineData, got FileBackedData({:?})", + std::str::from_utf8(filename).unwrap_or("") + ); + } + } + } + assert!(saw_file_content, "expected InlineData chunk for child.txt"); + } + + #[test] + fn test_per_layer_diff_fds() { + let tmp = TempDir::new().unwrap(); + let storage = create_two_layer_mock(tmp.path()); + let layer = Layer::open(&storage, "child-layer-001").unwrap(); + + let seed = seed_from_id("child-layer-001"); + let (layout, link_id_to_dirfd, _real_indices) = build_test_layout(&storage, &layer, seed); + + // dir_count counts the full sparse region (real + dummy slots). + assert_eq!(layout.dir_count, EXPECTED_DIR_COUNT); + + // Real-layer slots come from the seeded shuffle; read them from the map. + let child_idx = link_id_to_dirfd["CHILDLINKID000000000000000"]; + let parent_idx = link_id_to_dirfd["PARENTLINKID000000000000000"]; + assert_ne!(child_idx, parent_idx, "real layers occupy distinct slots"); + + // fds_all layout: [dummy_pipe, dirfds region (dir_count), keepalive_write, extras] + // dirfds region starts at index 1. + let child_fd = layout.fds_all[1 + child_idx as usize].as_fd(); + assert!( + composefs_splitdirfdstream::open_beneath(child_fd, b"etc/child.txt").is_ok(), + "child diff-dir must allow opening etc/child.txt" + ); + + let parent_fd = layout.fds_all[1 + parent_idx as usize].as_fd(); + assert!( + composefs_splitdirfdstream::open_beneath(parent_fd, b"etc/parent.txt").is_ok(), + "parent diff-dir must allow opening etc/parent.txt" + ); + + // Every other slot in the dirfds region is a dummy and must not open + // any real file. 
+ for dummy_slot in (0..EXPECTED_DIR_COUNT).filter(|s| *s != child_idx && *s != parent_idx) { + assert!( + composefs_splitdirfdstream::open_beneath( + layout.fds_all[1 + dummy_slot as usize].as_fd(), + b"etc/child.txt" + ) + .is_err(), + "dummy slot {dummy_slot} must not open real files" + ); + } + } + + #[test] + fn test_reconstruct_produces_file_content() { + let tmp = TempDir::new().unwrap(); + let storage = create_two_layer_mock(tmp.path()); + let layer = Layer::open(&storage, "child-layer-001").unwrap(); + + let seed = seed_from_id("child-layer-001"); + let (layout, link_id_to_dirfd, real_indices) = build_test_layout(&storage, &layer, seed); + assert_eq!(layout.dir_count, EXPECTED_DIR_COUNT); + + let mut buf = Vec::::new(); + produce_splitdirfdstream(&storage, &layer, &link_id_to_dirfd, &real_indices, &mut buf) + .unwrap(); + + // The dirfds region in fds_all starts at index 1. + let dir_count = layout.dir_count as usize; + let borrowed_fds: Vec> = layout.fds_all[1..=dir_count] + .iter() + .map(|f| f.as_fd()) + .collect(); + let mut out = Vec::::new(); + let bytes_written = + composefs_splitdirfdstream::reconstruct(buf.as_slice(), &borrowed_fds, &mut out) + .unwrap(); + + assert!(bytes_written > 0); + assert!( + out.windows(b"child content!".len()) + .any(|w| w == b"child content!") + ); + assert!( + out.windows(b"parent content!".len()) + .any(|w| w == b"parent content!") + ); + assert_eq!(bytes_written, 1024); + } + + // ── C2/C3: CstorLayerService in-process GetLayer round-trip ───────────── + + /// Proxy trait for the org.composefs.Oci interface (cstor client side). 
+ #[zlink::proxy(interface = "org.composefs.Oci")] + trait CstorOciProxy { + async fn get_info( + &mut self, + ) -> zlink::Result>; + + #[zlink(more, return_fds)] + async fn get_layer( + &mut self, + handle: u64, + params: GetLayerParams, + ) -> zlink::Result< + impl zlink::futures_util::Stream< + Item = zlink::Result<( + std::result::Result, + Vec, + )>, + >, + >; + } + + /// Collect all frames from a streaming get_layer call. + async fn collect_get_layer( + client: &mut zlink::unix::Connection, + storage_path: &str, + layer_id: &str, + ) -> (GetLayerReply, Vec, usize) { + use zlink::futures_util::StreamExt as _; + + let params = GetLayerParams { + diff_id: None, + storage: Some(StorageLocator { + storage_path: storage_path.to_owned(), + layer_id: layer_id.to_owned(), + }), + }; + + let mut stream = std::pin::pin!(client.get_layer(0, params).await.unwrap()); + let mut all_fds: Vec = Vec::new(); + let mut last_reply: Option = None; + let mut frame_count = 0usize; + + while let Some(item) = stream.next().await { + let (result, fds) = item.unwrap(); + last_reply = Some(result.unwrap()); + all_fds.extend(fds); + frame_count += 1; + } + + (last_reply.expect("no frames"), all_fds, frame_count) + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_cstor_oci_get_layer_in_process() { + let tmp = TempDir::new().unwrap(); + let _ = create_two_layer_mock(tmp.path()); + let storage_path = tmp.path().to_str().unwrap().to_string(); + + let (mut client, server) = spawn_in_process(CstorLayerService).unwrap(); + + // GetInfo. + let info = client.get_info().await.unwrap().unwrap(); + assert!(info.features.contains(&"splitdirfdstream-v0".to_string())); + assert!( + info.features + .contains(&"source-containers-storage".to_string()) + ); + + // GetLayer — full round-trip. 
+ let (reply, all_fds, frame_count) = + collect_get_layer(&mut client, &storage_path, "child-layer-001").await; + + assert_eq!(frame_count, EXPECTED_N_FRAMES); + assert_eq!(reply.dir_count, EXPECTED_DIR_COUNT); + + let expected_total = 1 + EXPECTED_DIR_COUNT as usize + 1 + EXPECTED_N_EXTRA_LIFETIME; + assert_eq!(all_fds.len(), expected_total); + + let mut it = all_fds.into_iter(); + let pipe_fd = it.next().unwrap(); + let dir_fds: Vec = it.by_ref().take(reply.dir_count as usize).collect(); + let lifetime_fds: Vec = it.collect(); + + // Verify the real diff-dirs are present somewhere in the sparse region. + // The exact slots come from the seeded shuffle and are not known to the + // client; the stream reconstruction below proves the producer used the + // right indices. + assert!( + dir_fds + .iter() + .any( + |fd| composefs_splitdirfdstream::open_beneath(fd.as_fd(), b"etc/child.txt") + .is_ok() + ), + "some dir slot must open etc/child.txt" + ); + assert!( + dir_fds + .iter() + .any( + |fd| composefs_splitdirfdstream::open_beneath(fd.as_fd(), b"etc/parent.txt") + .is_ok() + ), + "some dir slot must open etc/parent.txt" + ); + + // Read and reconstruct the stream. + let pipe_dup = rustix::io::dup(pipe_fd.as_fd()).unwrap(); + drop(pipe_fd); + let borrowed: Vec> = dir_fds.iter().map(|f| f.as_fd()).collect(); + + let mut stream_bytes = Vec::new(); + std::fs::File::from(pipe_dup) + .read_to_end(&mut stream_bytes) + .unwrap(); + + let mut reconstructed = Vec::new(); + let n = composefs_splitdirfdstream::reconstruct( + stream_bytes.as_slice(), + &borrowed, + &mut reconstructed, + ) + .unwrap(); + assert_eq!(n, 1024); + assert!( + reconstructed + .windows(b"child content!".len()) + .any(|w| w == b"child content!") + ); + assert!( + reconstructed + .windows(b"parent content!".len()) + .any(|w| w == b"parent content!") + ); + + // Drop lifetime fds → signals producer (keepalive EOF). 
+ drop(lifetime_fds); + + drop(client); + server.shutdown().await; + } + + /// Regression test: the self-reaping producer must not deadlock. + #[test] + fn test_in_process_teardown_does_not_deadlock() { + let tmp = TempDir::new().unwrap(); + let _ = create_two_layer_mock(tmp.path()); + let storage_path = tmp.path().to_str().unwrap().to_string(); + + let worker = std::thread::spawn(move || { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build() + .unwrap(); + + rt.block_on(async move { + let (mut client, server) = spawn_in_process(CstorLayerService).unwrap(); + + let (reply, all_fds, frame_count) = + collect_get_layer(&mut client, &storage_path, "child-layer-001").await; + + assert_eq!(frame_count, EXPECTED_N_FRAMES); + assert_eq!(reply.dir_count, EXPECTED_DIR_COUNT); + + let mut it = all_fds.into_iter(); + let pipe_fd = it.next().unwrap(); + let _dir_fds: Vec = it.by_ref().take(reply.dir_count as usize).collect(); + let lifetime_fds: Vec = it.collect(); + + let pipe_dup = rustix::io::dup(pipe_fd.as_fd()).unwrap(); + drop(pipe_fd); + + tokio::task::spawn_blocking(move || { + let _lifetime_fds = lifetime_fds; + let mut f = std::fs::File::from(pipe_dup); + let mut buf = Vec::new(); + f.read_to_end(&mut buf).unwrap(); + assert!(!buf.is_empty()); + }) + .await + .unwrap(); + + drop(client); + server.shutdown().await; + }); + }); + + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(5); + while !worker.is_finished() { + if std::time::Instant::now() >= deadline { + panic!("producer deadlocked: worker did not finish within 5s"); + } + std::thread::sleep(std::time::Duration::from_millis(10)); + } + worker.join().expect("worker thread panicked"); + } +} diff --git a/crates/composefs-storage/src/layer.rs b/crates/composefs-storage/src/layer.rs index 9a05a6b9..3f877d16 100644 --- a/crates/composefs-storage/src/layer.rs +++ b/crates/composefs-storage/src/layer.rs @@ -155,33 +155,21 @@ impl Layer { /// /// 
Returns layers in order: [self, parent, grandparent, ..., base] /// + /// The overlay `lower` file already lists the complete ordered ancestor + /// chain (immediate parent first, base last), so the chain is simply + /// `self` followed by each resolved parent. No recursive expansion needed. + /// /// # Errors /// - /// Returns an error if the layer chain exceeds the maximum depth of 500 layers. + /// Returns an error if any parent layer cannot be resolved or opened. pub fn layer_chain(self, storage: &Storage) -> Result> { - let mut chain = vec![self]; - let mut current_idx = 0; - - // Maximum depth to prevent infinite loops - const MAX_DEPTH: usize = 500; - - while current_idx < chain.len() && chain.len() < MAX_DEPTH { - let parent_ids = chain[current_idx].parents(storage)?; - - // Add all parents to the chain - for parent_id in parent_ids { - chain.push(Layer::open(storage, &parent_id)?); - } - - current_idx += 1; - } - - if chain.len() >= MAX_DEPTH { - return Err(StorageError::InvalidStorage( - "Layer chain exceeds maximum depth of 500".to_string(), - )); + // Call parents() before moving self into the vec. + let parent_ids = self.parents(storage)?; + let mut chain = Vec::with_capacity(1 + parent_ids.len()); + chain.push(self); + for id in parent_ids { + chain.push(Layer::open(storage, &id)?); } - Ok(chain) } @@ -309,6 +297,81 @@ mod tests { Layer::open(&storage, layer_id).unwrap() } + // --- layer_chain tests --- + + /// Create a 3-layer mock: A → B → C (A's lower lists B and C). + /// + /// The overlay `lower` file for layer A is `l/:l/`, meaning + /// B is the immediate parent and C is the base. `layer_chain` should return + /// exactly [A, B, C]. 
+ fn create_three_layer_mock(root: &Path) -> Storage { + for d in ["overlay", "overlay/l", "overlay-layers", "overlay-images"] { + std::fs::create_dir_all(root.join(d)).unwrap(); + } + + let layers = [ + ("layer-a", "AAAAAAAAAAAAAAAAAAAAAAAAAA"), + ("layer-b", "BBBBBBBBBBBBBBBBBBBBBBBBBB"), + ("layer-c", "CCCCCCCCCCCCCCCCCCCCCCCCCC"), + ]; + + for (id, link) in &layers { + let layer_dir = root.join("overlay").join(id); + std::fs::create_dir_all(layer_dir.join("diff")).unwrap(); + std::fs::write(layer_dir.join("link"), link).unwrap(); + // Create symlink overlay/l/ → ..//diff + std::os::unix::fs::symlink( + format!("../{}/diff", id), + root.join("overlay/l").join(link), + ) + .unwrap(); + } + + // Layer A's lower: B is immediate parent, C is base. + std::fs::write( + root.join("overlay/layer-a/lower"), + format!( + "l/{}:l/{}", + layers[1].1, // B link + layers[2].1, // C link + ), + ) + .unwrap(); + // Layer B has C as its parent (single entry). + std::fs::write( + root.join("overlay/layer-b/lower"), + format!("l/{}", layers[2].1), + ) + .unwrap(); + // Layer C is the base — no lower file. + + Storage::open(root).unwrap() + } + + /// Regression test: `layer_chain` must return exactly [A, B, C] for a + /// 3-layer chain where A's `lower` already lists both B and C. + /// + /// The old BFS expansion would open B, then expand B's parents (C), giving + /// [A, B, C, C] — duplicating C and growing exponentially for real images. 
+ #[test] + fn test_layer_chain_no_bfs_expansion() { + let dir = tempfile::tempdir().unwrap(); + let storage = create_three_layer_mock(dir.path()); + + let layer_a = Layer::open(&storage, "layer-a").unwrap(); + let chain = layer_a.layer_chain(&storage).unwrap(); + + assert_eq!( + chain.len(), + 3, + "layer_chain must return exactly 3 layers [A, B, C], got {}", + chain.len() + ); + assert_eq!(chain[0].id(), "layer-a"); + assert_eq!(chain[1].id(), "layer-b"); + assert_eq!(chain[2].id(), "layer-c"); + } + // --- has_whiteout tests --- #[test] diff --git a/crates/composefs-storage/src/lib.rs b/crates/composefs-storage/src/lib.rs index b6a5dbbb..83c9f83f 100644 --- a/crates/composefs-storage/src/lib.rs +++ b/crates/composefs-storage/src/lib.rs @@ -55,27 +55,35 @@ pub mod config; pub mod error; pub mod image; pub mod layer; +pub mod lock; pub mod storage; pub mod tar_split; +// Stateless CstorLayerService implementing org.composefs.Oci +#[cfg(feature = "layer-transfer")] +pub mod cstor_service; + // User namespace support for rootless access pub mod userns; -#[cfg(feature = "userns-helper")] +#[cfg(feature = "layer-transfer")] pub mod userns_helper; // Re-export commonly used types pub use config::{AdditionalLayerStore, StorageConfig}; +#[cfg(feature = "layer-transfer")] +pub use cstor_service::{ + CstorLayerService, InProcessServer as CstorInProcessServer, + spawn_in_process as spawn_cstor_in_process, +}; pub use error::{Result, StorageError}; pub use image::Image; pub use layer::Layer; +pub use lock::LayerStoreLock; pub use storage::{LayerMetadata, Storage}; pub use tar_split::{TarHeader, TarSplitFdStream, TarSplitItem}; pub use userns::can_bypass_file_permissions; -#[cfg(feature = "userns-helper")] -pub use userns_helper::{ - GetImageResult, HelperError, ImageInfo, ProxiedLayerStream, ProxiedTarSplitItem, StorageProxy, - init_if_helper, -}; +#[cfg(feature = "layer-transfer")] +pub use userns_helper::{HelperError, StorageProxy, init_if_helper}; // Re-export OCI spec 
types for convenience pub use oci_spec::image::{Descriptor, ImageConfiguration, ImageManifest}; diff --git a/crates/composefs-storage/src/lock.rs b/crates/composefs-storage/src/lock.rs new file mode 100644 index 00000000..c11e203b --- /dev/null +++ b/crates/composefs-storage/src/lock.rs @@ -0,0 +1,150 @@ +//! Layer-store locking for safe concurrent access. +//! +//! containers-storage (podman/buildah) uses an `flock(2)`-based protocol on +//! `/overlay-layers/layers.lock` to coordinate readers and +//! writers. Taking a shared (read) lock here ensures that a concurrent +//! `podman rmi` cannot delete a layer's diff directory while we are +//! streaming it to the consumer. +//! +//! # Why flock? +//! +//! * **Interoperability**: containers-storage uses `flock(2)` on this exact +//! path; taking the same lock type gives correct mutual exclusion with all +//! c/storage users. +//! * **Auto-release on process death**: the kernel closes all fds and +//! releases any flocks when the process exits, even on SIGKILL. This is +//! critical for the userns helper subprocess: if the parent kills it mid- +//! transfer, the lock is released automatically, unblocking podman. +//! * **RAII**: the guard is just a wrapped `OwnedFd`; the lock releases when +//! the guard drops (fd close). + +use std::os::fd::OwnedFd; + +use rustix::fs::{FlockOperation, Mode, OFlags, flock}; + +use crate::storage::Storage; + +/// A shared (read) lock on the containers-storage layer store. +/// +/// The lock is held for as long as this guard exists; it releases +/// automatically when the guard is dropped (or when the process exits). +/// +/// Acquire via [`Storage::lock_layers_shared`]. +/// +/// The lock is released by closing the fd, which happens automatically when +/// this guard is dropped (the [`OwnedFd`] closes itself) or when the process +/// exits. No explicit unlock is needed. 
+#[derive(Debug)] +pub struct LayerStoreLock(#[allow(dead_code)] OwnedFd); + +impl Storage { + /// Acquire a shared (read) flock on `overlay-layers/layers.lock`. + /// + /// Blocks until the lock is available. A concurrent writer (e.g. + /// `podman rmi`) holds an exclusive lock while removing layers; we block + /// until it finishes. Multiple readers may hold shared locks simultaneously. + /// + /// The lock is released when the returned [`LayerStoreLock`] is dropped. + /// + /// # Errors + /// + /// Returns an error if `overlay-layers/layers.lock` cannot be opened + /// or if the `flock` syscall fails. + pub fn lock_layers_shared(&self) -> crate::Result { + use crate::error::StorageError; + + // Open (or create) the lock file with O_RDWR so that a freshly + // created file gets the right mode for a later root podman that would + // open it O_RDWR. O_RDONLY|O_CREAT would create a 0-byte file with + // mode 0o644 but the file descriptor would be read-only, which is + // fine for flock but would break a privileged writer that re-opens it + // expecting read-write access. O_RDWR is the safe choice here. + use std::os::fd::AsFd as _; + let root_fd = self.root_dir().as_fd(); + let fd = rustix::fs::openat( + root_fd, + "overlay-layers/layers.lock", + OFlags::RDWR | OFlags::CLOEXEC | OFlags::CREATE, + Mode::from_bits_truncate(0o644), + ) + .map_err(|e| StorageError::Io(std::io::Error::from(e)))?; + + // Acquire a shared (read) lock; block if an exclusive (write) lock is + // held by another process. 
+ flock(&fd, FlockOperation::LockShared) + .map_err(|e| StorageError::Io(std::io::Error::from(e)))?; + + Ok(LayerStoreLock(fd)) + } +} + +#[cfg(test)] +mod tests { + use tempfile::TempDir; + + use super::*; + use crate::storage::Storage; + + fn create_test_storage(tmp: &TempDir) -> Storage { + for d in ["overlay", "overlay-layers", "overlay-images"] { + std::fs::create_dir_all(tmp.path().join(d)).unwrap(); + } + Storage::open(tmp.path()).unwrap() + } + + /// Acquiring a shared lock should succeed. + #[test] + fn test_shared_lock_succeeds() { + let tmp = TempDir::new().unwrap(); + let storage = create_test_storage(&tmp); + + let lock = storage.lock_layers_shared().expect("shared lock"); + // Lock file must exist now. + assert!(tmp.path().join("overlay-layers/layers.lock").exists()); + drop(lock); // releases + } + + /// Two shared locks from the same process succeed simultaneously (shared + /// locking is not exclusive against other shared lockers). + #[test] + fn test_two_shared_locks() { + let tmp = TempDir::new().unwrap(); + let storage = create_test_storage(&tmp); + + let lock1 = storage.lock_layers_shared().expect("first shared lock"); + let lock2 = storage.lock_layers_shared().expect("second shared lock"); + drop(lock1); + drop(lock2); + } + + /// A non-blocking exclusive lock attempt fails while a shared lock is held. + /// + /// This exercises the mutual-exclusion between our shared reader lock and + /// a concurrent exclusive writer (e.g. podman rmi). We can't test the + /// blocking case in a unit test without spawning threads, but we can + /// assert that the non-blocking attempt fails with EWOULDBLOCK. + #[test] + fn test_exclusive_blocked_by_shared() { + let tmp = TempDir::new().unwrap(); + let storage = create_test_storage(&tmp); + + let _shared = storage.lock_layers_shared().expect("shared lock"); + + // Open the lock file and try a non-blocking exclusive lock — must fail. 
+ use std::os::fd::AsFd as _; + let root_fd = storage.root_dir().as_fd(); + let fd = rustix::fs::openat( + root_fd, + "overlay-layers/layers.lock", + OFlags::RDWR | OFlags::CLOEXEC | OFlags::CREATE, + Mode::from_bits_truncate(0o644), + ) + .expect("open lock file"); + + let result = flock(&fd, FlockOperation::NonBlockingLockExclusive); + assert!( + result.is_err(), + "exclusive non-blocking lock must fail while a shared lock is held" + ); + } +} diff --git a/crates/composefs-storage/src/tar_split.rs b/crates/composefs-storage/src/tar_split.rs index 5f6a418c..704f2209 100644 --- a/crates/composefs-storage/src/tar_split.rs +++ b/crates/composefs-storage/src/tar_split.rs @@ -19,7 +19,7 @@ //! - CRC64-ISO algorithm for checksums use std::io::{BufRead, BufReader, Read, Seek}; -use std::os::fd::OwnedFd; +use std::os::fd::{AsFd as _, OwnedFd}; use base64::prelude::*; use cap_std::fs::{Dir, File}; @@ -34,6 +34,48 @@ use crate::storage::Storage; /// CRC64-ISO implementation for verifying file checksums. const CRC64_ISO: Crc = Crc::::new(&CRC_64_GO_ISO); +/// Overlay metacopy xattr names — checked in priority order. +/// +/// Root mode uses `trusted.overlay.metacopy`; rootless mode (with the +/// `userxattr` overlay mount option) uses `user.overlay.metacopy`. +const OVERLAY_METACOPY_XATTRS: &[&str] = &["trusted.overlay.metacopy", "user.overlay.metacopy"]; + +/// Returns `true` if `file` is an overlay metacopy stub. +/// +/// A metacopy file is a zero-byte upper-layer stub whose content lives in a +/// lower layer. The kernel signals this by attaching a +/// `trusted.overlay.metacopy` (or `user.overlay.metacopy` in rootless mode) +/// extended attribute to the file. Reading data from the stub via a plain +/// file descriptor returns EOF immediately, so the producer must detect and +/// skip it, falling back to the lower layer that holds the real data. 
+/// +/// Returns `Err` only for unexpected I/O errors; `ENODATA` / `ENOATTR` +/// (xattr not present) is treated as `Ok(false)`. +fn is_overlay_metacopy(file: &File) -> Result { + let fd = file.as_fd(); + for name in OVERLAY_METACOPY_XATTRS { + // Pass a 1-byte buffer: we only care whether the xattr exists, not + // its value. The kernel returns Ok(n) when the value fits and ERANGE + // when the buffer is too small — both mean the xattr is present. + match rustix::fs::fgetxattr(fd, *name, &mut [0u8; 1]) { + Ok(_) => return Ok(true), + // ERANGE: buffer too small, but the xattr is present. + Err(rustix::io::Errno::RANGE) => return Ok(true), + // ENODATA / ENOATTR: xattr not present on this file. + Err(rustix::io::Errno::NODATA) => continue, + // ENOATTR is the BSD spelling; some libc glue maps it the same way. + #[cfg(not(target_os = "linux"))] + Err(rustix::io::Errno::NOATTR) => continue, + Err(e) => { + return Err(StorageError::Io(std::io::Error::from_raw_os_error( + e.raw_os_error(), + ))); + } + } + } + Ok(false) +} + /// Item returned from tar-split stream iteration. #[derive(Debug)] pub enum TarSplitItem { @@ -58,6 +100,13 @@ pub enum TarSplitItem { /// This is the path as recorded in the original tar archive /// (e.g., "./etc/hosts"). name: String, + /// Short link ID of the layer in whose diff directory this file was found. + /// + /// This is the value of `overlay/l/` — the short symlink name + /// created by containers/storage that points at `..//diff`. + /// The consumer uses it to build filenames of the form + /// `l//` relative to the `overlay/` directory. + link_id: String, }, } @@ -292,6 +341,19 @@ pub struct TarSplitFdStream { /// Entry counter for debugging and error messages. entry_count: usize, + + /// Short link IDs for each layer in the chain. + /// + /// Index 0 is the target layer's own link ID; subsequent entries are the + /// ancestor layers in the same depth-first order produced by + /// [`Layer::layer_chain`]. 
Each link ID corresponds to a symlink under + /// `overlay/l/` that points to the layer's diff directory. + chain_link_ids: Vec, + + /// Layer IDs parallel to `chain_link_ids`. + /// + /// `chain_ids[i]` is the layer ID whose link ID is `chain_link_ids[i]`. + chain_ids: Vec, } impl TarSplitFdStream { @@ -319,28 +381,65 @@ impl TarSplitFdStream { let gz_decoder = GzDecoder::new(file); let reader = BufReader::new(gz_decoder); - // Open the layer for on-demand file lookups + // Build the ordered chain up front. + // layer_chain() consumes a Layer, so open a fresh one for that purpose; + // then open another fresh copy for the on-demand lookups below. + let chain_layer = Layer::open(storage, layer.id())?; + let chain = chain_layer.layer_chain(storage)?; + + let mut chain_link_ids = Vec::with_capacity(chain.len()); + let mut chain_ids = Vec::with_capacity(chain.len()); + for l in &chain { + chain_link_ids.push(l.link_id().to_string()); + chain_ids.push(l.id().to_string()); + } + + // Open the layer for on-demand file lookups (kept for the existing + // parent-search helpers that still operate on Layer objects). let layer = Layer::open(storage, layer.id())?; // Clone storage root dir for on-demand parent layer access let storage_root = storage.root_dir().try_clone()?; + // `chain` is no longer needed. + drop(chain); + Ok(Self { layer, storage_root, reader, entry_count: 0, + chain_link_ids, + chain_ids, }) } /// Open a file in the layer chain, trying current layer first then parents. - fn open_file_in_chain(&self, path: &str) -> Result { + /// + /// Returns the opened file together with the link ID of the layer in whose + /// diff directory the file was found (i.e. `overlay/l/` points + /// to that layer's diff dir). 
+ /// + /// Overlay metacopy stubs (files whose content lives in a lower layer, + /// signalled by a `trusted.overlay.metacopy` or `user.overlay.metacopy` + /// xattr) are transparently skipped: finding one in a layer is treated as + /// "not found here" and the search continues down the parent chain until + /// the layer that contains the real file data is found. + fn open_file_in_chain(&self, path: &str) -> Result<(cap_std::fs::File, String)> { // Normalize path (remove leading ./) let normalized_path = path.strip_prefix("./").unwrap_or(path); - // Try to open in current layer first + // Try to open in current layer first (chain index 0 = own diff dir). + // Skip overlay metacopy stubs — they are zero-byte upper-layer files + // whose actual content lives in a lower layer. match self.layer.diff_dir().open(normalized_path) { - Ok(file) => return Ok(file), + Ok(file) if !is_overlay_metacopy(&file)? => { + let link_id = self.chain_link_ids[0].clone(); + return Ok((file, link_id)); + } + Ok(_) => { + // File exists but is a metacopy stub; fall through to parents. + } Err(e) if e.kind() == std::io::ErrorKind::NotFound => { // Continue to search parent layers } @@ -352,12 +451,15 @@ impl TarSplitFdStream { } /// Recursively search parent layers for a file. + /// + /// Returns the opened file together with the link ID of the layer in whose + /// diff directory the file was found. 
fn search_parent_layers( &self, current_layer: &Layer, path: &str, depth: usize, - ) -> Result { + ) -> Result<(cap_std::fs::File, String)> { const MAX_DEPTH: usize = 500; if depth >= MAX_DEPTH { @@ -377,11 +479,14 @@ impl TarSplitFdStream { // Try to open file directly in parent's diff directory match self.open_file_in_layer(&parent_id, path) { - Ok(file) => return Ok(file), + Ok(file) => { + let found_link_id = self.link_id_for(&parent_id)?; + return Ok((file, found_link_id)); + } Err(StorageError::Io(e)) if e.kind() == std::io::ErrorKind::NotFound => { // File not in this parent, recursively search its parents match self.search_by_layer_id(&parent_id, path, depth + 1) { - Ok(file) => return Ok(file), + Ok(result) => return Ok(result), Err(StorageError::TarSplitError(_)) => continue, // File not found in this branch, try next parent Err(e) => return Err(e), } @@ -397,12 +502,15 @@ impl TarSplitFdStream { } /// Search for a file starting from a layer ID. + /// + /// Returns the opened file together with the link ID of the layer in whose + /// diff directory the file was found. 
fn search_by_layer_id( &self, layer_id: &str, path: &str, depth: usize, - ) -> Result { + ) -> Result<(cap_std::fs::File, String)> { const MAX_DEPTH: usize = 500; if depth >= MAX_DEPTH { @@ -414,7 +522,10 @@ impl TarSplitFdStream { // Try to open file in this layer match self.open_file_in_layer(layer_id, path) { - Ok(file) => return Ok(file), + Ok(file) => { + let found_link_id = self.link_id_for(layer_id)?; + return Ok((file, found_link_id)); + } Err(StorageError::Io(e)) if e.kind() == std::io::ErrorKind::NotFound => { // File not found, check parents } @@ -428,7 +539,7 @@ impl TarSplitFdStream { for link_id in parent_links { let parent_id = self.resolve_link_direct(&link_id)?; match self.search_by_layer_id(&parent_id, path, depth + 1) { - Ok(file) => return Ok(file), + Ok(result) => return Ok(result), Err(StorageError::TarSplitError(_)) => continue, // File not found in this branch, try next parent Err(e) => return Err(e), } @@ -466,11 +577,22 @@ impl TarSplitFdStream { } /// Open a file in a specific layer's diff directory. + /// + /// Returns `Err(StorageError::Io(NotFound))` both when the file is absent + /// and when it is present but is an overlay metacopy stub, so that callers + /// uniformly fall back to the next lower layer in either case. fn open_file_in_layer(&self, layer_id: &str, path: &str) -> Result { let overlay_dir = self.storage_root.open_dir("overlay")?; let layer_dir = overlay_dir.open_dir(layer_id)?; let diff_dir = layer_dir.open_dir("diff")?; - diff_dir.open(path).map_err(StorageError::Io) + let file = diff_dir.open(path).map_err(StorageError::Io)?; + if is_overlay_metacopy(&file)? { + return Err(StorageError::Io(std::io::Error::new( + std::io::ErrorKind::NotFound, + "overlay metacopy stub — real data is in a lower layer", + ))); + } + Ok(file) } /// Read parent link IDs from a layer's lower file. @@ -490,6 +612,37 @@ impl TarSplitFdStream { } } + /// Look up the link ID for a layer by its layer ID. + /// + /// Returns the link ID (i.e. 
the short name under `overlay/l/`) for the + /// layer identified by `layer_id`. This should always succeed because the + /// on-demand search and the precomputed chain are built from the same + /// storage; a miss means the store mutated under us or the image is + /// malformed, so we fail closed. + fn link_id_for(&self, layer_id: &str) -> Result { + self.chain_ids + .iter() + .position(|id| id == layer_id) + .map(|idx| self.chain_link_ids[idx].clone()) + .ok_or_else(|| { + StorageError::TarSplitError(format!( + "resolved layer {} is not in the precomputed chain", + layer_id + )) + }) + } + + /// Return the ordered link-ID chain for this stream. + /// + /// Index 0 is the target layer's own link ID; subsequent entries are + /// ancestor layers in the same depth-first order produced by + /// [`Layer::layer_chain`]. The `link_id` field on every + /// [`TarSplitItem::FileContent`] yielded by [`Self::next`] is one of the + /// values in this slice. + pub fn chain_link_ids(&self) -> &[String] { + &self.chain_link_ids + } + /// Verify CRC64-ISO checksum of a file. fn verify_crc64( &self, @@ -604,7 +757,7 @@ impl TarSplitFdStream { ) })?; - let mut file = self.open_file_in_chain(path)?; + let (mut file, link_id) = self.open_file_in_chain(path)?; // Verify CRC64 if provided if let Some(ref crc64_b64) = crc64 { @@ -621,6 +774,7 @@ impl TarSplitFdStream { fd: owned_fd, size: file_size, name: path.clone(), + link_id, })); } // Empty file or directory - header already in preceding Segment @@ -645,7 +799,211 @@ impl TarSplitFdStream { #[cfg(test)] mod tests { + use std::io::Write as _; + + use flate2::Compression; + use flate2::write::GzEncoder; + use super::*; + use crate::storage::Storage; + + // --------------------------------------------------------------------------- + // Helper: build a minimal two-layer mock storage on disk. 
+ // + // Layout created: + // <root>/ + // overlay/ + // l/ + // CHILDLINKID000000000000000 -> ../child-layer-001/diff + // PARENTLINKID000000000000000 -> ../parent-layer-001/diff + // child-layer-001/ + // diff/ + // etc/ + // child.txt ("child content!") + // link ("CHILDLINKID000000000000000") + // lower ("l/PARENTLINKID000000000000000") + // parent-layer-001/ + // diff/ + // etc/ + // parent.txt ("parent content!") + // link ("PARENTLINKID000000000000000") + // overlay-layers/ + // child-layer-001.tar-split.gz (two File entries, no CRC) + // overlay-images/ (empty, required by Storage::open) + // + // The tar-split for child-layer-001 references: + // - "./etc/child.txt" (size 14, found in child diff → dirfd_index 0) + // - "./etc/parent.txt" (size 15, found in parent diff → dirfd_index 1) + // --------------------------------------------------------------------------- + fn create_two_layer_mock(root: &std::path::Path) -> Storage { + // Required top-level dirs + for d in ["overlay", "overlay-layers", "overlay-images"] { + std::fs::create_dir_all(root.join(d)).unwrap(); + } + + // --- child layer --- + let child_id = "child-layer-001"; + let child_link = "CHILDLINKID000000000000000"; + let child_diff = root.join("overlay").join(child_id).join("diff"); + std::fs::create_dir_all(child_diff.join("etc")).unwrap(); + std::fs::write(child_diff.join("etc/child.txt"), b"child content!").unwrap(); // 14 bytes + std::fs::write(root.join("overlay").join(child_id).join("link"), child_link).unwrap(); + + // --- parent layer --- + let parent_id = "parent-layer-001"; + let parent_link = "PARENTLINKID000000000000000"; + let parent_diff = root.join("overlay").join(parent_id).join("diff"); + std::fs::create_dir_all(parent_diff.join("etc")).unwrap(); + std::fs::write(parent_diff.join("etc/parent.txt"), b"parent content!").unwrap(); // 15 bytes + std::fs::write( + root.join("overlay").join(parent_id).join("link"), + parent_link, + ) + .unwrap(); + + // child's lower file: 
references parent via its link + std::fs::write( + root.join("overlay").join(child_id).join("lower"), + format!("l/{}", parent_link), + ) + .unwrap(); + + // overlay/l/ symlinks: target is relative from overlay/l/ to overlay/<id>/diff + // i.e. "../../overlay/<id>/diff" won't work since overlay/l is already inside overlay/ + // The actual format used by containers/storage is: "../<id>/diff" + let l_dir = root.join("overlay").join("l"); + std::fs::create_dir_all(&l_dir).unwrap(); + std::os::unix::fs::symlink(format!("../{}/diff", child_id), l_dir.join(child_link)) + .unwrap(); + std::os::unix::fs::symlink(format!("../{}/diff", parent_id), l_dir.join(parent_link)) + .unwrap(); + + // --- tar-split for child layer --- + // Two File entries (no CRC, so no checksum is verified). + // No Segment entries are emitted, so the stream yields exactly the two + // FileContent items for these files and then ends. + let ndjson = concat!( + r#"{"type":1,"name":"./etc/child.txt","size":14}"#, + "\n", + r#"{"type":1,"name":"./etc/parent.txt","size":15}"#, + "\n", + ); + + let gz_path = root + .join("overlay-layers") + .join(format!("{}.tar-split.gz", child_id)); + let gz_file = std::fs::File::create(&gz_path).unwrap(); + let mut encoder = GzEncoder::new(gz_file, Compression::default()); + encoder.write_all(ndjson.as_bytes()).unwrap(); + encoder.finish().unwrap(); + + Storage::open(root).unwrap() + } + + #[test] + fn test_link_id_two_layer_chain() { + let tmp = tempfile::tempdir().unwrap(); + let storage = create_two_layer_mock(tmp.path()); + let layer = Layer::open(&storage, "child-layer-001").unwrap(); + let mut stream = TarSplitFdStream::new(&storage, &layer).unwrap(); + + // chain_link_ids should have exactly 2 entries (child + parent) + assert_eq!( + stream.chain_link_ids().len(), + 2, + "expected chain length 2 (child + parent)" + ); + + let child_link = "CHILDLINKID000000000000000"; + let parent_link = "PARENTLINKID000000000000000"; + + // First item: ./etc/child.txt — present only 
in child diff + match stream.next().unwrap().expect("expected FileContent item") { + TarSplitItem::FileContent { name, link_id, .. } => { + assert_eq!(name, "./etc/child.txt"); + assert_eq!( + link_id, child_link, + "./etc/child.txt should be found in the child layer" + ); + } + TarSplitItem::Segment(_) => panic!("expected FileContent, got Segment"), + } + + // Second item: ./etc/parent.txt — present only in parent diff + match stream.next().unwrap().expect("expected FileContent item") { + TarSplitItem::FileContent { name, link_id, .. } => { + assert_eq!(name, "./etc/parent.txt"); + assert_eq!( + link_id, parent_link, + "./etc/parent.txt should be found in the parent layer" + ); + } + TarSplitItem::Segment(_) => panic!("expected FileContent, got Segment"), + } + + // Stream should be exhausted + assert!(stream.next().unwrap().is_none()); + } + + #[test] + fn test_link_id_single_layer() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + + // Minimal single-layer storage (no lower file → no parents) + for d in ["overlay", "overlay-layers", "overlay-images"] { + std::fs::create_dir_all(root.join(d)).unwrap(); + } + + let layer_id = "base-layer-001"; + let link_id = "BASELINKID0000000000000000"; + let diff = root.join("overlay").join(layer_id).join("diff"); + std::fs::create_dir_all(&diff).unwrap(); + std::fs::write(diff.join("hello.txt"), b"hello").unwrap(); // 5 bytes + std::fs::write(root.join("overlay").join(layer_id).join("link"), link_id).unwrap(); + + // overlay/l/ symlink (not strictly needed for single layer, but good hygiene) + let l_dir = root.join("overlay").join("l"); + std::fs::create_dir_all(&l_dir).unwrap(); + std::os::unix::fs::symlink(format!("../{}/diff", layer_id), l_dir.join(link_id)).unwrap(); + + // tar-split with one file entry + let ndjson = concat!(r#"{"type":1,"name":"./hello.txt","size":5}"#, "\n"); + let gz_path = root + .join("overlay-layers") + .join(format!("{}.tar-split.gz", layer_id)); + let gz_file = 
std::fs::File::create(&gz_path).unwrap(); + let mut encoder = GzEncoder::new(gz_file, Compression::default()); + encoder.write_all(ndjson.as_bytes()).unwrap(); + encoder.finish().unwrap(); + + let storage = Storage::open(root).unwrap(); + let layer = Layer::open(&storage, layer_id).unwrap(); + let mut stream = TarSplitFdStream::new(&storage, &layer).unwrap(); + + assert_eq!( + stream.chain_link_ids().len(), + 1, + "single layer → chain len 1" + ); + + match stream.next().unwrap().expect("expected item") { + TarSplitItem::FileContent { + name, + link_id: found_link_id, + .. + } => { + assert_eq!(name, "./hello.txt"); + assert_eq!( + found_link_id, link_id, + "base layer file must have its own link_id" + ); + } + TarSplitItem::Segment(_) => panic!("expected FileContent"), + } + + assert!(stream.next().unwrap().is_none()); + } #[test] fn test_tar_header_type_checks() { @@ -708,4 +1066,139 @@ mod tests { let result = TarSplitEntry::from_raw(raw); assert!(result.is_err()); } + + /// Build a two-layer mock where the child layer holds a metacopy stub for + /// `shared.txt` and the real content lives only in the parent layer. + /// + /// Uses `/var/tmp` as the temp-dir root because `user.*` extended + /// attributes are supported there (tmpfs, no `nosuid`/`nouser_xattr` + /// mount restrictions). If `/var/tmp` is unavailable or the filesystem + /// does not support `user.*` xattrs, the test is skipped. + /// + /// Layout: + /// ```text + /// /overlay/ + /// child-layer-001/diff/shared.txt — 0-byte metacopy stub + /// (user.overlay.metacopy xattr set) + /// parent-layer-001/diff/shared.txt — "real content" (12 bytes) + /// ``` + #[test] + fn test_metacopy_stub_falls_back_to_parent() { + // Use /var/tmp so user.* xattrs are available on tmpfs. + let tmp = match tempfile::Builder::new().tempdir_in("/var/tmp") { + Ok(d) => d, + Err(_) => { + // /var/tmp unavailable — skip rather than fail. 
+ return; + } + }; + let root = + cap_std::fs::Dir::open_ambient_dir(tmp.path(), cap_std::ambient_authority()).unwrap(); + + // Required top-level dirs. + for d in ["overlay", "overlay-layers", "overlay-images"] { + root.create_dir_all(d).unwrap(); + } + + let child_id = "child-layer-001"; + let child_link = "CHILDLINKID000000000000000"; + let parent_id = "parent-layer-001"; + let parent_link = "PARENTLINKID000000000000000"; + + // Parent layer: real file content. + root.create_dir_all(format!("overlay/{parent_id}/diff")) + .unwrap(); + root.write( + format!("overlay/{parent_id}/diff/shared.txt"), + b"real content", // 12 bytes + ) + .unwrap(); + root.write(format!("overlay/{parent_id}/link"), parent_link) + .unwrap(); + + // Child layer: zero-byte metacopy stub with user.overlay.metacopy xattr. + root.create_dir_all(format!("overlay/{child_id}/diff")) + .unwrap(); + root.write(format!("overlay/{child_id}/diff/shared.txt"), b"") + .unwrap(); // 0 bytes — the stub + + // Set the metacopy xattr. If the filesystem refuses user.* xattrs, + // bail out with a skip rather than a spurious failure. + let stub_file = root + .open(format!("overlay/{child_id}/diff/shared.txt")) + .unwrap(); + match rustix::fs::fsetxattr( + stub_file.as_fd(), + "user.overlay.metacopy", + b"", + rustix::fs::XattrFlags::CREATE, + ) { + Ok(()) => {} + Err(rustix::io::Errno::OPNOTSUPP) => { + // Filesystem does not support xattrs — skip. + return; + } + Err(e) => panic!("unexpected fsetxattr error: {e}"), + } + + root.write(format!("overlay/{child_id}/link"), child_link) + .unwrap(); + root.write( + format!("overlay/{child_id}/lower"), + format!("l/{parent_link}"), + ) + .unwrap(); + + // overlay/l/ symlinks. 
+ root.create_dir_all("overlay/l").unwrap(); + root.symlink( + format!("../{child_id}/diff"), + format!("overlay/l/{child_link}"), + ) + .unwrap(); + root.symlink( + format!("../{parent_id}/diff"), + format!("overlay/l/{parent_link}"), + ) + .unwrap(); + + // tar-split: one file entry referencing shared.txt (size 12). + let ndjson = concat!(r#"{"type":1,"name":"./shared.txt","size":12}"#, "\n"); + let gz_file = root + .open_with( + format!("overlay-layers/{child_id}.tar-split.gz"), + cap_std::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true), + ) + .unwrap(); + let mut encoder = GzEncoder::new(gz_file.into_std(), Compression::default()); + encoder.write_all(ndjson.as_bytes()).unwrap(); + encoder.finish().unwrap(); + + let storage = Storage::open(tmp.path()).unwrap(); + let layer = Layer::open(&storage, child_id).unwrap(); + let mut stream = TarSplitFdStream::new(&storage, &layer).unwrap(); + + assert_eq!( + stream.chain_link_ids().len(), + 2, + "expected chain length 2 (child + parent)" + ); + + match stream.next().unwrap().expect("expected FileContent item") { + TarSplitItem::FileContent { name, link_id, .. } => { + assert_eq!(name, "./shared.txt"); + assert_eq!( + link_id, parent_link, + "./shared.txt metacopy stub in child must be skipped; \ + real data is at parent (link_id={parent_link})" + ); + } + TarSplitItem::Segment(_) => panic!("expected FileContent, got Segment"), + } + + assert!(stream.next().unwrap().is_none()); + } } diff --git a/crates/composefs-storage/src/userns_helper.rs b/crates/composefs-storage/src/userns_helper.rs index 0b50a606..55174258 100644 --- a/crates/composefs-storage/src/userns_helper.rs +++ b/crates/composefs-storage/src/userns_helper.rs @@ -3,8 +3,8 @@ //! This module provides a mechanism for unprivileged processes to access //! containers-storage content that has restrictive permissions. It works by //! spawning a helper process inside a user namespace (via `podman unshare`) -//! 
that can read any file, and communicating with it via JSON-RPC over a -//! Unix socket with fd-passing. +//! that can read any file, and communicating with it via the zlink +//! `org.composefs.Oci` service (`CstorLayerService`) over a Unix socket. //! //! # Why This Is Needed //! @@ -30,9 +30,8 @@ //! │ │ /proc/self/exe │ //! │ │ (child's stdin=socket) │ //! │ │ │ -//! │ proxy.stream_layer() ───────────► │ -//! │ │ │ -//! │ ◄─── receives OwnedFd via SCM_RIGHTS│ +//! │ proxy.connection() ──────────────►│ +//! │ │ (OciProxy/CstorLayerSvc) │ //! └─────────────────────────────────────┘ //! ``` //! @@ -49,152 +48,19 @@ //! // Normal application code continues here... //! ``` -use std::os::fd::AsFd; +use std::os::fd::AsFd as _; use std::os::unix::io::OwnedFd; use std::os::unix::net::UnixStream as StdUnixStream; -use std::path::Path; use std::process::{Child, Command, Stdio}; -use base64::prelude::*; -use jsonrpc_fdpass::transport::UnixSocketTransport; -use jsonrpc_fdpass::{JsonRpcMessage, JsonRpcRequest, JsonRpcResponse, MessageWithFds}; use rustix::io::dup; use rustix::process::{Signal, set_parent_process_death_signal}; -use serde::{Deserialize, Serialize}; -use tokio::net::UnixStream as TokioUnixStream; -use crate::layer::Layer; -use crate::storage::Storage; -use crate::tar_split::{TarSplitFdStream, TarSplitItem}; use crate::userns::can_bypass_file_permissions; /// Environment variable that indicates this process is a userns helper. const HELPER_ENV: &str = "__CSTORAGE_USERNS_HELPER"; -/// JSON-RPC 2.0 error codes. -/// -/// These codes follow the JSON-RPC 2.0 specification: -/// - Standard errors: -32700 to -32600 -/// - Server errors: -32099 to -32000 (implementation-defined) -mod error_codes { - /// Invalid params - the params passed to a method are invalid. - pub const INVALID_PARAMS: i32 = -32602; - - /// Method not found - the requested method does not exist. 
- pub const METHOD_NOT_FOUND: i32 = -32601; - - /// Resource not found - the requested resource (image, layer, etc.) was not found. - pub const RESOURCE_NOT_FOUND: i32 = -32000; - - /// Internal error - a server-side error occurred (I/O, storage access, etc.). - pub const INTERNAL_ERROR: i32 = -32003; -} - -/// JSON-RPC method names. -mod methods { - /// Open a file and return its fd. - pub const OPEN_FILE: &str = "userns.openFile"; - /// Shutdown the helper process. - pub const SHUTDOWN: &str = "userns.shutdown"; - /// List images in storage. - pub const LIST_IMAGES: &str = "userns.listImages"; - /// Get image metadata. - pub const GET_IMAGE: &str = "userns.getImage"; - /// Stream layer as tar-split entries with fds. - pub const STREAM_LAYER: &str = "userns.streamLayer"; -} - -/// Parameters for the open_file method. -#[derive(Debug, Serialize, Deserialize)] -pub struct OpenFileParams { - /// Path to open. - pub path: String, -} - -/// Result for the open_file method. -#[derive(Debug, Serialize, Deserialize)] -pub struct OpenFileResult { - /// True if successful (fd is passed out-of-band). - pub success: bool, -} - -/// Parameters for list_images method. -#[derive(Debug, Serialize, Deserialize)] -pub struct ListImagesParams { - /// Storage root path. - pub storage_path: String, -} - -/// Image info returned by list_images. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ImageInfo { - /// Image ID. - pub id: String, - /// Image names/tags. - pub names: Vec, -} - -/// Result for list_images method. -#[derive(Debug, Serialize, Deserialize)] -pub struct ListImagesResult { - /// List of images. - pub images: Vec, -} - -/// Parameters for get_image method. -#[derive(Debug, Serialize, Deserialize)] -pub struct GetImageParams { - /// Storage root path. - pub storage_path: String, - /// Image ID or name. - pub image_ref: String, -} - -/// Result for get_image method. -#[derive(Debug, Serialize, Deserialize)] -pub struct GetImageResult { - /// Image ID. 
- pub id: String, - /// Image names. - pub names: Vec, - /// Layer diff IDs (sha256:...). - pub layer_diff_ids: Vec, - /// Storage layer IDs (internal IDs used by containers-storage). - pub storage_layer_ids: Vec, -} - -/// Parameters for stream_layer method. -#[derive(Debug, Serialize, Deserialize)] -pub struct StreamLayerParams { - /// Storage root path. - pub storage_path: String, - /// Layer ID (storage layer ID, not diff ID). - pub layer_id: String, -} - -/// Streaming notification for a segment. -#[derive(Debug, Serialize, Deserialize)] -pub struct StreamSegmentNotification { - /// Base64-encoded segment data. - pub data: String, -} - -/// Streaming notification for a file (fd is passed out-of-band). -#[derive(Debug, Serialize, Deserialize)] -pub struct StreamFileNotification { - /// File path in the tar. - pub name: String, - /// File size. - pub size: u64, -} - -/// Result for stream_layer method (sent after all notifications). -#[derive(Debug, Serialize, Deserialize)] -pub struct StreamLayerResult { - /// Number of items streamed. - pub items_sent: usize, -} - /// Error type for userns helper operations. #[derive(Debug, thiserror::Error)] pub enum HelperError { @@ -202,890 +68,219 @@ pub enum HelperError { #[error("failed to create socket: {0}")] Socket(#[source] std::io::Error), + /// `podman` binary was not found on `PATH`. + /// + /// The userns helper requires `podman unshare` to set up a user + /// namespace. Install `podman` and ensure it is on `PATH`. + #[error("podman not found on PATH (required for user-namespace helper): {0}")] + PodmanNotFound(#[source] std::io::Error), + /// Failed to spawn helper process. #[error("failed to spawn helper process: {0}")] Spawn(#[source] std::io::Error), - /// IPC error. + /// IPC / transport error. #[error("IPC error: {0}")] Ipc(String), - /// Helper returned an error. - #[error("helper error: {0}")] - HelperError(String), - /// I/O error. 
#[error("I/O error: {0}")] Io(#[from] std::io::Error), - - /// JSON-RPC error from the helper. - #[error("RPC error: code={code}, message={message}")] - RpcError { - /// JSON-RPC error code. - code: i32, - /// Error message. - message: String, - }, } -/// Check if this process was spawned as a userns helper and run the helper loop if so. +/// Check if this process was spawned as a userns helper and serve the +/// `org.composefs.Oci` (`CstorLayerService`) zlink service if so. /// -/// This function **must** be called early in `main()`, before any other cstorage -/// operations. If this process was spawned as a helper, this function will: +/// This function **must** be called early in `main()`, before any other +/// `composefs_storage` operations. If the `__CSTORAGE_USERNS_HELPER` +/// environment variable is set this function will: /// -/// 1. Read from stdin (which is a Unix socket from the parent) -/// 2. Serve JSON-RPC requests for file operations -/// 3. Exit when the parent closes the connection +/// 1. Set a parent-death signal so the helper exits when the parent dies. +/// 2. Duplicate `stdin` into an owned [`StdUnixStream`]. +/// 3. Call [`crate::cstor_service::serve_on_socket_blocking`] to serve the +/// `org.composefs.Oci` zlink service until the parent closes the connection. +/// 4. Exit the process (0 on success, 1 on error). /// -/// If this is not a helper process, this function returns immediately. +/// If the environment variable is **not** set, this function returns +/// immediately and the caller continues normal execution. // // Runs in a forked helper subprocess whose stdin is the IPC socket; there is // no logging facility or error channel back to the parent, so fatal // diagnostics are written to stderr before exiting. 
#[allow(clippy::print_stderr)] pub fn init_if_helper() { - // Check if we're a helper via environment variable if std::env::var(HELPER_ENV).is_err() { - return; // Not a helper, continue normal execution + return; // Not a helper — continue normal execution. } - // Ensure we exit if parent dies (avoids orphan helper processes) + // Request a signal when our immediate parent exits, as a best-effort net + // against orphaned helpers. + // + // The PRIMARY shutdown mechanism is explicit: `StorageProxy` (in the parent + // cfsctl process) `kill()`s this helper when the import finishes or when the + // proxy is dropped. PDEATHSIG is only a fallback for abnormal parent death. + // + // CAVEAT: when spawned via `podman unshare <cmd>`, the helper's PPID is the + // intermediate `podman` process, NOT cfsctl. So PDEATHSIG actually fires on + // *podman's* death, not cfsctl's; it does not track cfsctl through the + // podman intermediary. In the normal flow `podman unshare` waits on this + // helper, so cfsctl killing the helper (or podman) tears the whole chain + // down. This signal is purely a backstop. if let Err(e) = set_parent_process_death_signal(Some(Signal::TERM)) { - eprintln!("cstorage helper: failed to set parent death signal: {}", e); - // Continue anyway - this is a nice-to-have, not critical + eprintln!("cstorage helper: failed to set parent death signal: {e}"); + // Continue anyway — explicit kill() from the parent is the real guard. } - // We're a helper - stdin is our IPC socket. - // Use dup() to get a new owned fd from stdin (fd 0). - // This is safe because: - // 1. We were spawned with stdin set to a socket - // 2. dup() gives us a new fd that we own - // 3. We use std::io::stdin().as_fd() which is the safe way to get the fd - let stdin_fd = match dup(std::io::stdin().as_fd()) { + // stdin is our IPC socket. dup() it so we hold an owned fd. 
+ let stdin_fd: OwnedFd = match dup(std::io::stdin().as_fd()) { Ok(fd) => fd, Err(e) => { - eprintln!("cstorage helper: failed to dup stdin: {}", e); + eprintln!("cstorage helper: failed to dup stdin: {e}"); std::process::exit(1); } }; let std_socket = StdUnixStream::from(stdin_fd); - // Run the helper loop (never returns on success) - if let Err(e) = run_helper_loop_blocking(std_socket) { - eprintln!("cstorage helper: error in helper loop: {}", e); + if let Err(e) = crate::cstor_service::serve_on_socket_blocking(std_socket) { + eprintln!("cstorage helper: server error: {e}"); std::process::exit(1); } std::process::exit(0); } -/// Run the helper loop synchronously by creating a tokio runtime. -fn run_helper_loop_blocking(std_socket: StdUnixStream) -> std::result::Result<(), HelperError> { - // Set non-blocking for tokio - std_socket.set_nonblocking(true)?; - - // Create a tokio runtime for the helper - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .map_err(|e| HelperError::Ipc(format!("failed to create tokio runtime: {}", e)))?; - - rt.block_on(run_helper_loop_async(std_socket)) -} - -/// Run the helper loop, serving requests from the parent. 
-async fn run_helper_loop_async(std_socket: StdUnixStream) -> std::result::Result<(), HelperError> { - // Convert std socket to tokio socket - let tokio_socket = TokioUnixStream::from_std(std_socket) - .map_err(|e| HelperError::Ipc(format!("failed to convert socket: {}", e)))?; - - let transport = UnixSocketTransport::new(tokio_socket); - let (mut sender, mut receiver) = transport.split(); - - tracing::debug!("userns helper: starting request loop"); - - loop { - let msg_with_fds = match receiver.receive().await { - Ok(m) => m, - Err(jsonrpc_fdpass::Error::ConnectionClosed) => { - tracing::debug!("userns helper: connection closed"); - return Ok(()); - } - Err(e) => { - return Err(HelperError::Ipc(format!( - "failed to receive message: {}", - e - ))); - } - }; - - match msg_with_fds.message { - JsonRpcMessage::Request(request) => { - let id = request.id.clone(); - - // Handle stream_layer specially since it needs to send multiple messages - if request.method == methods::STREAM_LAYER { - if let Err((code, msg)) = handle_stream_layer(&request, &mut sender).await { - let error = jsonrpc_fdpass::JsonRpcError::owned(code, msg, None::<()>); - let response = JsonRpcResponse::error(error, id); - let message = - MessageWithFds::new(JsonRpcMessage::Response(response), vec![]); - sender.send(message).await.map_err(|e| { - HelperError::Ipc(format!("failed to send error response: {}", e)) - })?; - } - // Success response is sent by handle_stream_layer - continue; - } - - let (result, fds) = handle_request(&request); - - match result { - Ok(response_value) => { - let response = JsonRpcResponse::success(response_value, id); - let message = MessageWithFds::new(JsonRpcMessage::Response(response), fds); - sender.send(message).await.map_err(|e| { - HelperError::Ipc(format!("failed to send response: {}", e)) - })?; - } - Err((code, message_str)) => { - let error = - jsonrpc_fdpass::JsonRpcError::owned(code, message_str, None::<()>); - let response = JsonRpcResponse::error(error, id); - 
let message = - MessageWithFds::new(JsonRpcMessage::Response(response), vec![]); - sender.send(message).await.map_err(|e| { - HelperError::Ipc(format!("failed to send error response: {}", e)) - })?; - } - } - - // Check for shutdown request (handle after sending response) - if request.method == methods::SHUTDOWN { - tracing::debug!("userns helper: received shutdown request"); - return Ok(()); - } - } - JsonRpcMessage::Notification(notif) => { - if notif.method == methods::SHUTDOWN { - tracing::debug!("userns helper: received shutdown notification"); - return Ok(()); - } - // Ignore other notifications - } - JsonRpcMessage::Response(_) => { - // Unexpected response - ignore - } - } - } -} - -/// Handle stream_layer request - sends multiple notifications with fds. -async fn handle_stream_layer( - request: &JsonRpcRequest, - sender: &mut jsonrpc_fdpass::transport::Sender, -) -> std::result::Result<(), (i32, String)> { - let params: StreamLayerParams = request - .params - .as_ref() - .and_then(|p| serde_json::from_value(p.clone()).ok()) - .ok_or(( - error_codes::INVALID_PARAMS, - "invalid params for streamLayer".to_string(), - ))?; - - let storage = Storage::open(¶ms.storage_path).map_err(|e| { - ( - error_codes::INTERNAL_ERROR, - format!("failed to open storage: {}", e), - ) - })?; - - let layer = Layer::open(&storage, ¶ms.layer_id).map_err(|e| { - ( - error_codes::RESOURCE_NOT_FOUND, - format!("layer not found: {}", e), - ) - })?; - - let mut stream = TarSplitFdStream::new(&storage, &layer).map_err(|e| { - ( - error_codes::INTERNAL_ERROR, - format!("failed to create tar-split stream: {}", e), - ) - })?; - - let mut items_sent = 0usize; - - // Stream all items as notifications - while let Some(item) = stream - .next() - .map_err(|e| (error_codes::INTERNAL_ERROR, format!("stream error: {}", e)))? 
- { - match item { - TarSplitItem::Segment(bytes) => { - // Send segment as base64-encoded notification - let params = StreamSegmentNotification { - data: BASE64_STANDARD.encode(&bytes), - }; - let notif = jsonrpc_fdpass::JsonRpcNotification::new( - "stream.segment".to_string(), - Some(serde_json::to_value(¶ms).unwrap()), - ); - let message = MessageWithFds::new(JsonRpcMessage::Notification(notif), vec![]); - sender.send(message).await.map_err(|e| { - ( - error_codes::INTERNAL_ERROR, - format!("failed to send segment: {}", e), - ) - })?; - items_sent += 1; - } - TarSplitItem::FileContent { fd, size, name } => { - // Send file notification with fd - let params = StreamFileNotification { name, size }; - let notif = jsonrpc_fdpass::JsonRpcNotification::new( - "stream.file".to_string(), - Some(serde_json::to_value(¶ms).unwrap()), - ); - let message = MessageWithFds::new(JsonRpcMessage::Notification(notif), vec![fd]); - sender.send(message).await.map_err(|e| { - ( - error_codes::INTERNAL_ERROR, - format!("failed to send file: {}", e), - ) - })?; - items_sent += 1; - } - } - } - - // Send success response - let result = StreamLayerResult { items_sent }; - let response = - JsonRpcResponse::success(serde_json::to_value(result).unwrap(), request.id.clone()); - let message = MessageWithFds::new(JsonRpcMessage::Response(response), vec![]); - sender.send(message).await.map_err(|e| { - ( - error_codes::INTERNAL_ERROR, - format!("failed to send response: {}", e), - ) - })?; - - Ok(()) -} - -/// Handle a JSON-RPC request. 
-fn handle_request( - request: &JsonRpcRequest, -) -> ( - std::result::Result, - Vec, -) { - match request.method.as_str() { - methods::OPEN_FILE => { - let params: OpenFileParams = match request - .params - .as_ref() - .and_then(|p| serde_json::from_value(p.clone()).ok()) - { - Some(p) => p, - None => { - return ( - Err(( - error_codes::INVALID_PARAMS, - "invalid params: missing 'path' field".to_string(), - )), - vec![], - ); - } - }; - - match std::fs::File::open(¶ms.path) { - Ok(file) => { - let fd: OwnedFd = file.into(); - let result = OpenFileResult { success: true }; - (Ok(serde_json::to_value(result).unwrap()), vec![fd]) - } - Err(e) => ( - Err(( - error_codes::INTERNAL_ERROR, - format!("failed to open file: {}", e), - )), - vec![], - ), - } - } - methods::LIST_IMAGES => handle_list_images(request), - methods::GET_IMAGE => handle_get_image(request), - methods::SHUTDOWN => { - // Just return success - the loop will exit after sending the response - (Ok(serde_json::json!({"success": true})), vec![]) - } - _ => ( - Err(( - error_codes::METHOD_NOT_FOUND, - format!("method not found: {}", request.method), - )), - vec![], - ), - } -} - -/// Handle list_images request. 
-fn handle_list_images( - request: &JsonRpcRequest, -) -> ( - std::result::Result, - Vec, -) { - let params: ListImagesParams = match request - .params - .as_ref() - .and_then(|p| serde_json::from_value(p.clone()).ok()) - { - Some(p) => p, - None => { - return ( - Err(( - error_codes::INVALID_PARAMS, - "invalid params for listImages".to_string(), - )), - vec![], - ); - } - }; - - let storage = match Storage::open(¶ms.storage_path) { - Ok(s) => s, - Err(e) => { - return ( - Err(( - error_codes::INTERNAL_ERROR, - format!("failed to open storage: {}", e), - )), - vec![], - ); - } - }; - - let images = match storage.list_images() { - Ok(imgs) => imgs, - Err(e) => { - return ( - Err(( - error_codes::INTERNAL_ERROR, - format!("failed to list images: {}", e), - )), - vec![], - ); - } - }; - - let image_infos: Vec = images - .iter() - .map(|img| ImageInfo { - id: img.id().to_string(), - names: img.names(&storage).unwrap_or_default(), - }) - .collect(); - - let result = ListImagesResult { - images: image_infos, - }; - (Ok(serde_json::to_value(result).unwrap()), vec![]) -} - -/// Handle get_image request. 
-fn handle_get_image( - request: &JsonRpcRequest, -) -> ( - std::result::Result, - Vec, -) { - let params: GetImageParams = match request - .params - .as_ref() - .and_then(|p| serde_json::from_value(p.clone()).ok()) - { - Some(p) => p, - None => { - return ( - Err(( - error_codes::INVALID_PARAMS, - "invalid params for getImage".to_string(), - )), - vec![], - ); - } - }; - - let storage = match Storage::open(¶ms.storage_path) { - Ok(s) => s, - Err(e) => { - return ( - Err(( - error_codes::INTERNAL_ERROR, - format!("failed to open storage: {}", e), - )), - vec![], - ); - } - }; - - // Try by ID first, then by name - let image = match crate::image::Image::open(&storage, ¶ms.image_ref) { - Ok(img) => img, - Err(_) => match storage.find_image_by_name(¶ms.image_ref) { - Ok(img) => img, - Err(e) => { - return ( - Err(( - error_codes::RESOURCE_NOT_FOUND, - format!("image not found: {}", e), - )), - vec![], - ); - } - }, - }; - - let config = match image.config() { - Ok(cfg) => cfg, - Err(e) => { - return ( - Err(( - error_codes::INTERNAL_ERROR, - format!("failed to read config: {}", e), - )), - vec![], - ); - } - }; - - let diff_ids: Vec = config - .rootfs() - .diff_ids() - .iter() - .map(|s| s.parse().expect("config diff_id should be valid digest")) - .collect(); - - let storage_layer_ids = match image.storage_layer_ids(std::slice::from_ref(&storage)) { - Ok(ids) => ids, - Err(e) => { - return ( - Err(( - error_codes::INTERNAL_ERROR, - format!("failed to get storage layer IDs: {}", e), - )), - vec![], - ); - } - }; - - let result = GetImageResult { - id: image.id().to_string(), - names: image.names(&storage).unwrap_or_default(), - layer_diff_ids: diff_ids, - storage_layer_ids, - }; - (Ok(serde_json::to_value(result).unwrap()), vec![]) -} - -/// Proxy for accessing files via the userns helper process. +/// Proxy for accessing storage content via the userns helper process. 
+/// +/// When the caller cannot bypass file permissions, `StorageProxy::spawn()` +/// starts a helper process inside a user namespace using `podman unshare` and +/// returns a connected [`zlink::unix::Connection`] the caller can use with the +/// [`composefs_oci::varlink_types::OciProxy`] trait to stream layers. /// -/// This spawns a helper process (via `podman unshare`) that runs inside a -/// user namespace and can read files with restrictive permissions. File -/// descriptors are passed back via SCM_RIGHTS. +/// # Dependency on `podman` +/// +/// This type requires the `podman` binary to be present on `PATH`. If +/// `podman` is not found, [`StorageProxy::spawn`] returns +/// [`HelperError::PodmanNotFound`]. +/// +/// # Liveness +/// +/// The helper subprocess is shut down explicitly: [`StorageProxy::shutdown`] +/// (and the `Drop` impl) `kill()` the child once the caller is done. A +/// best-effort `PR_SET_PDEATHSIG` in the helper is a fallback for abnormal +/// parent death (see `init_if_helper`). +/// +/// Dropping this struct kills the child process. pub struct StorageProxy { - child: Child, - sender: jsonrpc_fdpass::transport::Sender, - receiver: jsonrpc_fdpass::transport::Receiver, - next_id: u64, + /// The spawned helper child process. Wrapped in `Option` so that + /// `shutdown` can take ownership and call `wait()` without fighting the + /// `Drop` impl. + child: Option, + conn: zlink::unix::Connection, } impl std::fmt::Debug for StorageProxy { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("StorageProxy") - .field("child_pid", &self.child.id()) + .field("child_pid", &self.child.as_ref().map(|c| c.id())) .finish_non_exhaustive() } } impl StorageProxy { - /// Spawn a userns helper process. + /// Spawn a userns helper process if required. /// - /// If the current process can already bypass file permissions (running as - /// root or has CAP_DAC_OVERRIDE), this returns `Ok(None)` since no helper - /// is needed. 
- pub async fn spawn() -> std::result::Result, HelperError> { - // Check if we even need a helper + /// Returns `Ok(None)` when the current process can already bypass file + /// permissions (running as root, or has `CAP_DAC_OVERRIDE`), since no + /// helper is needed in that case. + pub async fn spawn() -> Result, HelperError> { if can_bypass_file_permissions() { return Ok(None); } - Self::spawn_helper().await.map(Some) } - /// Spawn the helper unconditionally. - async fn spawn_helper() -> std::result::Result { + /// Spawn the helper unconditionally using `/proc/self/exe`. + async fn spawn_helper() -> Result { let exe = std::fs::read_link("/proc/self/exe").map_err(HelperError::Io)?; Self::spawn_helper_with_binary(exe).await } - /// Spawn the helper with a specific binary path. + /// Spawn the helper with an explicit binary path. /// - /// This is used when the default /proc/self/exe is not suitable, - /// such as when running from a test harness. - async fn spawn_helper_with_binary( - exe: std::path::PathBuf, - ) -> std::result::Result { - // Create a socket pair - one end for us, one for the child's stdin + /// Useful when `/proc/self/exe` is not the right choice (e.g. test + /// harnesses that run under a wrapper binary). + async fn spawn_helper_with_binary(exe: std::path::PathBuf) -> Result { + // Create a socket pair — one end becomes the child's stdin, the other + // stays in the parent for the zlink connection. let (parent_sock, child_sock) = StdUnixStream::pair().map_err(HelperError::Socket)?; - // Spawn via podman unshare, with child_sock as the child's stdin. - // We use `env` to set the HELPER_ENV because podman unshare doesn't - // propagate the parent's environment to the inner command. + // Spawn via `podman unshare`; set HELPER_ENV because podman unshare + // does not automatically forward the parent's environment. + // + // `podman` must be on PATH — see [`HelperError::PodmanNotFound`]. 
let child = Command::new("podman") .arg("unshare") .arg("env") - .arg(format!("{}=1", HELPER_ENV)) + .arg(format!("{HELPER_ENV}=1")) .arg(&exe) .stdin(Stdio::from(OwnedFd::from(child_sock))) .stdout(Stdio::inherit()) .stderr(Stdio::inherit()) .spawn() - .map_err(HelperError::Spawn)?; + .map_err(|e| { + if e.kind() == std::io::ErrorKind::NotFound { + HelperError::PodmanNotFound(e) + } else { + HelperError::Spawn(e) + } + })?; - // Convert our socket to async + // Wrap the parent socket end as a zlink Connection. parent_sock.set_nonblocking(true)?; - let tokio_socket = TokioUnixStream::from_std(parent_sock) - .map_err(|e| HelperError::Ipc(format!("failed to convert socket: {}", e)))?; - - let transport = UnixSocketTransport::new(tokio_socket); - let (sender, receiver) = transport.split(); + let tok = tokio::net::UnixStream::from_std(parent_sock) + .map_err(|e| HelperError::Ipc(format!("failed to convert socket: {e}")))?; + let zs = zlink::unix::Stream::from(tok); + let conn = zlink::Connection::from(zs); Ok(Self { - child, - sender, - receiver, - next_id: 1, + child: Some(child), + conn, }) } - /// Open a file via the helper, returning its fd. - /// - /// # Arguments - /// - /// * `path` - The path to open (should be absolute) + /// Return a mutable reference to the underlying zlink connection. /// - /// # Returns - /// - /// The opened file descriptor, which can be used for reading. 
- pub async fn open_file( - &mut self, - path: impl AsRef, - ) -> std::result::Result { - let params = OpenFileParams { - path: path.as_ref().to_string_lossy().to_string(), - }; - - let id = self.next_id; - self.next_id += 1; - - let request = JsonRpcRequest::new( - methods::OPEN_FILE.to_string(), - Some(serde_json::to_value(¶ms).unwrap()), - serde_json::Value::Number(id.into()), - ); - - let message = MessageWithFds::new(JsonRpcMessage::Request(request), vec![]); - self.sender - .send(message) - .await - .map_err(|e| HelperError::Ipc(format!("failed to send request: {}", e)))?; - - // Receive response - let response = self - .receiver - .receive() - .await - .map_err(|e| HelperError::Ipc(format!("failed to receive response: {}", e)))?; - - match response.message { - JsonRpcMessage::Response(resp) => { - if let Some(error) = resp.error { - return Err(HelperError::RpcError { - code: error.code(), - message: error.message().to_string(), - }); - } - - // The fd should be in the response - if response.file_descriptors.is_empty() { - return Err(HelperError::Ipc( - "response missing file descriptor".to_string(), - )); - } - - Ok(response.file_descriptors.into_iter().next().unwrap()) - } - other => Err(HelperError::Ipc(format!( - "unexpected message type: {:?}", - other - ))), - } - } - - /// Shutdown the helper process gracefully. - pub async fn shutdown(mut self) -> std::result::Result<(), HelperError> { - let id = self.next_id; - - let request = JsonRpcRequest::new( - methods::SHUTDOWN.to_string(), - None, - serde_json::Value::Number(id.into()), - ); - - let message = MessageWithFds::new(JsonRpcMessage::Request(request), vec![]); - // Ignore send errors - the child may have already exited - let _ = self.sender.send(message).await; - - // Wait for the child to exit - let _ = self.child.wait(); - - Ok(()) - } - - /// List images in storage via the helper. 
- pub async fn list_images( - &mut self, - storage_path: &str, - ) -> std::result::Result, HelperError> { - let params = ListImagesParams { - storage_path: storage_path.to_string(), - }; - let result: ListImagesResult = self.call(methods::LIST_IMAGES, ¶ms).await?; - Ok(result.images) + /// Callers use this with the `OciProxy` trait (from + /// `composefs_oci::varlink_types`) to drive `GetLayer` calls over the + /// helper connection. + pub fn connection(&mut self) -> &mut zlink::unix::Connection { + &mut self.conn } - /// Get image information via the helper. - pub async fn get_image( - &mut self, - storage_path: &str, - image_ref: &str, - ) -> std::result::Result { - let params = GetImageParams { - storage_path: storage_path.to_string(), - image_ref: image_ref.to_string(), - }; - self.call(methods::GET_IMAGE, ¶ms).await - } - - /// Start streaming a layer's tar-split content. - /// - /// Returns a stream that yields `ProxiedTarSplitItem`s. The helper sends - /// notifications with file descriptors for each file in the layer. - pub async fn stream_layer( - &mut self, - storage_path: &str, - layer_id: &str, - ) -> std::result::Result, HelperError> { - let params = StreamLayerParams { - storage_path: storage_path.to_string(), - layer_id: layer_id.to_string(), - }; - - let id = self.next_id; - self.next_id += 1; - - let request = JsonRpcRequest::new( - methods::STREAM_LAYER.to_string(), - Some(serde_json::to_value(¶ms).unwrap()), - serde_json::Value::Number(id.into()), - ); - - let message = MessageWithFds::new(JsonRpcMessage::Request(request), vec![]); - self.sender - .send(message) - .await - .map_err(|e| HelperError::Ipc(format!("failed to send stream_layer request: {}", e)))?; - - Ok(ProxiedLayerStream { - receiver: &mut self.receiver, - request_id: id, - finished: false, - }) - } - - /// Make an RPC call and parse the response. 
- async fn call Deserialize<'de>>( - &mut self, - method: &str, - params: &P, - ) -> std::result::Result { - let id = self.next_id; - self.next_id += 1; - - let request = JsonRpcRequest::new( - method.to_string(), - Some(serde_json::to_value(params).unwrap()), - serde_json::Value::Number(id.into()), - ); - - let message = MessageWithFds::new(JsonRpcMessage::Request(request), vec![]); - self.sender - .send(message) - .await - .map_err(|e| HelperError::Ipc(format!("failed to send request: {}", e)))?; - - // Receive response - let response = self - .receiver - .receive() - .await - .map_err(|e| HelperError::Ipc(format!("failed to receive response: {}", e)))?; - - match response.message { - JsonRpcMessage::Response(resp) => { - if let Some(error) = resp.error { - return Err(HelperError::RpcError { - code: error.code(), - message: error.message().to_string(), - }); - } - - let result = resp - .result - .ok_or_else(|| HelperError::Ipc("response missing result".to_string()))?; - - serde_json::from_value(result) - .map_err(|e| HelperError::Ipc(format!("failed to parse result: {}", e))) - } - other => Err(HelperError::Ipc(format!( - "unexpected message type: {:?}", - other - ))), - } - } -} - -/// Item received from a proxied layer stream. -#[derive(Debug)] -pub enum ProxiedTarSplitItem { - /// Raw segment bytes (tar header/padding). - Segment(Vec), - /// File content with metadata and fd. - FileContent { - /// File descriptor for the content. - fd: OwnedFd, - /// File size. - size: u64, - /// File name/path. - name: String, - }, -} - -/// Stream of tar-split items received via the helper proxy. 
-pub struct ProxiedLayerStream<'a> { - receiver: &'a mut jsonrpc_fdpass::transport::Receiver, - request_id: u64, - finished: bool, -} - -impl std::fmt::Debug for ProxiedLayerStream<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("ProxiedLayerStream") - .field("request_id", &self.request_id) - .field("finished", &self.finished) - .finish_non_exhaustive() - } -} - -impl<'a> ProxiedLayerStream<'a> { - /// Get the next item from the stream. + /// Shut down the helper. /// - /// Returns `None` when the stream is complete. - pub async fn next(&mut self) -> std::result::Result, HelperError> { - if self.finished { - return Ok(None); - } - - let msg_with_fds = match self.receiver.receive().await { - Ok(m) => m, - Err(jsonrpc_fdpass::Error::ConnectionClosed) => { - self.finished = true; - return Ok(None); - } - Err(e) => { - return Err(HelperError::Ipc(format!("failed to receive: {}", e))); - } - }; - - let mut fds = msg_with_fds.file_descriptors; - - match msg_with_fds.message { - JsonRpcMessage::Notification(notif) => { - let params = notif.params.unwrap_or(serde_json::Value::Null); - - match notif.method.as_str() { - "stream.segment" => { - let seg: StreamSegmentNotification = serde_json::from_value(params) - .map_err(|e| { - HelperError::Ipc(format!("invalid segment params: {}", e)) - })?; - - let bytes = BASE64_STANDARD.decode(&seg.data).map_err(|e| { - HelperError::Ipc(format!("failed to decode segment: {}", e)) - })?; - - Ok(Some(ProxiedTarSplitItem::Segment(bytes))) - } - "stream.file" => { - let file: StreamFileNotification = serde_json::from_value(params) - .map_err(|e| HelperError::Ipc(format!("invalid file params: {}", e)))?; - - if fds.is_empty() { - return Err(HelperError::Ipc( - "file notification missing fd".to_string(), - )); - } - - let fd = fds.remove(0); - Ok(Some(ProxiedTarSplitItem::FileContent { - fd, - size: file.size, - name: file.name, - })) - } - other => Err(HelperError::Ipc(format!( - "unknown 
notification method: {}", - other - ))), - } - } - JsonRpcMessage::Response(resp) => { - // Final response - stream is complete - self.finished = true; - - if let Some(error) = resp.error { - return Err(HelperError::RpcError { - code: error.code(), - message: error.message().to_string(), - }); - } - - Ok(None) - } - JsonRpcMessage::Request(_) => Err(HelperError::Ipc( - "unexpected request from helper".to_string(), - )), + /// Kills the child process and waits for it to exit. Dropping this + /// struct also kills the child (via `Drop`), but `shutdown` additionally + /// waits for the process to be reaped so no zombie is left behind. This + /// explicit `kill()` is the helper's primary shutdown mechanism. + pub async fn shutdown(mut self) -> Result<(), HelperError> { + // Take the child out so Drop sees None and skips the redundant kill(). + if let Some(mut child) = self.child.take() { + drop(self); // drop conn first + let _ = tokio::task::spawn_blocking(move || { + let _ = child.kill(); + child.wait() + }) + .await; } + Ok(()) } } impl Drop for StorageProxy { fn drop(&mut self) { - // Try to kill the child if it's still running - let _ = self.child.kill(); + // Best-effort: kill the child if it is still running. + if let Some(ref mut child) = self.child { + let _ = child.kill(); + } } } diff --git a/crates/composefs/src/splitstream.rs b/crates/composefs/src/splitstream.rs index 2f827749..fdb65b90 100644 --- a/crates/composefs/src/splitstream.rs +++ b/crates/composefs/src/splitstream.rs @@ -1051,6 +1051,39 @@ impl SplitStreamReader { } } + /// Walk the stream one chunk at a time, invoking `callback` for each + /// inline span and each external object reference, in stream order. + /// + /// This is the reference-preserving analogue of [`Self::cat`]: where `cat` + /// resolves external objects by copying their bytes, this yields the + /// object id so callers can re-emit a reference instead. + /// + /// The callback receives a [`SplitStreamData`] value for each chunk. 
+ /// Inline chunks carry their raw bytes; external chunks carry the object + /// identifier so the caller can look up the object by whatever means it + /// prefers (e.g. emitting a filename reference into a `splitdirfdstream`). + #[context("Walking splitstream chunks")] + pub fn for_each_chunk( + &mut self, + mut callback: impl FnMut(SplitStreamData) -> Result<()>, + ) -> Result<()> { + let mut buffer = vec![]; + + loop { + match self.ensure_chunk(true, true, 0)? { + ChunkType::Eof => break Ok(()), + ChunkType::Inline => { + read_into_vec(&mut self.decoder, &mut buffer, self.inline_bytes)?; + self.inline_bytes = 0; + callback(SplitStreamData::Inline(buffer[..].into()))?; + } + ChunkType::External(id) => { + callback(SplitStreamData::External(id))?; + } + } + } + } + /// Traverses the split stream and calls the callback for each object reference. /// /// This includes both references from the digest map and external references in the stream.