diff --git a/Cargo.toml b/Cargo.toml
index 7086d4b3..fda6eef0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,7 +15,7 @@ default-members = [
     "crates/composefs-setup-root",
     "crates/composefs-storage",
     "crates/composefs-erofs-debug",
-    "crates/composefs-splitfdstream",
+    "crates/composefs-splitdirfdstream",
 ]
 resolver = "2"
 
@@ -43,8 +43,6 @@ composefs-ostree = { version = "0.7.0", path = "crates/composefs-ostree", defaul
 cap-std-ext = "5.1.2"
 ocidir = "0.7.2"
 
-# JSON-RPC with FD passing for userns helper
-jsonrpc-fdpass = { version = "0.1.0", default-features = false }
 zlink = { version = "0.5", default-features = false, features = ["tokio", "introspection", "proxy", "server", "service", "tracing"] }
 zlink-core = { version = "0.5", default-features = false, features = ["introspection", "std", "tracing"] }
 
diff --git a/crates/composefs-ctl/Cargo.toml b/crates/composefs-ctl/Cargo.toml
index b7155319..27ea8d4e 100644
--- a/crates/composefs-ctl/Cargo.toml
+++ b/crates/composefs-ctl/Cargo.toml
@@ -19,7 +19,7 @@ path = "src/main.rs"
 [features]
 default = ['pre-6.15', 'oci', 'containers-storage', 'ostree']
 http = ['composefs-http']
-oci = ['composefs-oci', 'composefs-oci/varlink']
+oci = ['composefs-oci', 'composefs-oci/varlink', 'composefs-splitdirfdstream']
 containers-storage = ['composefs-oci/containers-storage', 'cstorage']
 ostree = ['composefs-ostree']
 rhel9 = ['composefs/rhel9']
@@ -33,19 +33,26 @@ comfy-table = { version = "7.1", default-features = false }
 composefs = { workspace = true, features = ["varlink"] }
 composefs-boot = { workspace = true }
 composefs-oci = { workspace = true, optional = true, features = ["boot"] }
+composefs-splitdirfdstream = { path = "../composefs-splitdirfdstream", version = "0.7.0", optional = true }
 composefs-http = { workspace = true, optional = true }
-cstorage = { package = "composefs-storage", path = "../composefs-storage", version = "0.7.0", features = ["userns-helper"], optional = true }
+cstorage = { package = "composefs-storage", path = "../composefs-storage", version = "0.7.0", features = ["layer-transfer"], optional = true }
 composefs-ostree = { workspace = true, optional = true }
 env_logger = { version = "0.11.0", default-features = false }
 hex = { version = "0.4.0", default-features = false }
 indicatif = { version = "0.17.0", default-features = false }
 libsystemd = { version = "0.7" }
 log = { version = "0.4", default-features = false }
-rustix = { version = "1.0.0", default-features = false, features = ["fs", "process"] }
+rustix = { version = "1.0.0", default-features = false, features = ["fs", "net", "pipe", "process"] }
 serde = { version = "1.0", default-features = false, features = ["derive"] }
 serde_json = { version = "1.0", default-features = false, features = ["std"] }
-tokio = { version = "1.24.2", default-features = false, features = ["io-std", "io-util", "net", "rt", "sync"] }
+tokio = { version = "1.24.2", default-features = false, features = ["io-std", "io-util", "net", "rt", "rt-multi-thread", "sync"] }
 zlink = { workspace = true }
 
+[dev-dependencies]
+similar-asserts = "1.7.0"
+tar = { version = "0.4.38", default-features = false }
+tempfile = "3.8.0"
+tokio = { version = "1.24.2", default-features = false, features = ["macros", "rt-multi-thread"] }
+
 [lints]
 workspace = true
diff --git a/crates/composefs-ctl/src/lib.rs b/crates/composefs-ctl/src/lib.rs
index 9a8c592e..d8af0db7 100644
--- a/crates/composefs-ctl/src/lib.rs
+++ b/crates/composefs-ctl/src/lib.rs
@@ -264,7 +264,7 @@ impl From<ErofsVersion> for composefs::erofs::format::FormatVersion {
 /// start with `@`.
 #[cfg(feature = "oci")]
 #[derive(Debug, Clone)]
-pub(crate) enum OciReference {
+pub enum OciReference {
     /// A content-addressable digest such as `sha256:abcdef…`.
     Digest(composefs_oci::OciDigest),
     /// A named ref resolved through the repository's ref tree, typically
@@ -377,6 +377,31 @@ enum OciCommand {
         #[arg(long, value_enum, default_value_t = LocalFetchCli::Disabled)]
         local_fetch: LocalFetchCli,
     },
+    /// Copy an OCI image (and its layers) from another composefs repository
+    /// into this repository.
+    ///
+    /// The destination repository is selected by the global `--repo`/`--user`/
+    /// `--system` flags. The source is `--from`.
+    ///
+    /// Pass `--zerocopy` to attempt reflink (then hardlink) instead of copying
+    /// object data.  This requires both repositories to be on the same
+    /// filesystem, to use the same hash algorithm, and the caller to have
+    /// `CAP_DAC_READ_SEARCH` (i.e. root).
+    /// Without `--zerocopy`, objects are always copied, which is safe on any
+    /// filesystem and across repositories using different hash algorithms.
+    Copy {
+        /// Image to copy (tag name or `@digest`).
+        image: OciReference,
+        /// Path to the source composefs repository.
+        #[clap(long)]
+        from: PathBuf,
+        /// Tag to assign to the image in the destination repository.
+        #[clap(long)]
+        name: Option<String>,
+        /// Use reflink/hardlink zero-copy transfer (requires same filesystem, same hash algorithm, and root).
+        #[clap(long)]
+        zerocopy: bool,
+    },
     /// List all tagged OCI images in the repository
     #[clap(name = "images")]
     ListImages {
@@ -940,12 +965,18 @@ pub async fn run_if_socket_activated() -> Result<bool> {
     if std::env::args_os().len() != 1 {
         return Ok(false);
     }
-    let Some(listener) = crate::varlink::try_activated_listener()? else {
-        return Ok(false);
-    };
     let service = crate::varlink::CfsctlService::activated();
-    crate::varlink::serve_activated(service, listener).await?;
-    Ok(true)
+    match crate::varlink::try_activated_listener()? {
+        Some(crate::varlink::ActivatedSocket::Connected(l)) => {
+            crate::varlink::serve_activated(service, l).await?;
+            Ok(true)
+        }
+        Some(crate::varlink::ActivatedSocket::Listening(listener)) => {
+            crate::varlink::serve_on_listener(service, listener).await?;
+            Ok(true)
+        }
+        None => Ok(false),
+    }
 }
 
 /// Top-level dispatch: handle init specially, otherwise open repo and run.
@@ -1147,6 +1178,111 @@ where
     Ok(repo)
 }
 
+/// Copy an OCI image (and all its layers) from one repository to another using varlink connections.
+#[cfg(feature = "oci")]
+pub async fn copy_image(
+    conn_src: &mut zlink::unix::Connection,
+    conn_dest: &mut zlink::unix::Connection,
+    handle_src: u64,
+    handle_dest: u64,
+    image: &OciReference,
+    name: Option<&str>,
+    zerocopy: bool,
+) -> Result<crate::varlink::layer_sync::FinalizeImageReply> {
+    use crate::varlink::layer_sync::LayerRef;
+    use crate::varlink::oci::OciError;
+    use crate::varlink::proxy::{GetLayerParams, OciProxy};
+    use anyhow::ensure;
+    use zlink::futures_util::StreamExt as _;
+
+    let image_str = image.to_string();
+    let inspect = conn_src
+        .inspect(handle_src, &image_str)
+        .await
+        .context("zlink transport error calling Inspect")?
+        .map_err(|e: OciError| anyhow::anyhow!("Inspect failed: {e:?}"))?;
+
+    ensure!(
+        !inspect.manifest.is_empty(),
+        "inspect returned empty manifest"
+    );
+    ensure!(!inspect.config.is_empty(), "inspect returned empty config");
+
+    // Extract ordered layer identifiers via the shared helper that handles
+    // both container images (rootfs.diff_ids) and OCI artifacts (manifest
+    // layer digests).
+    let diff_ids_ordered = composefs_oci::extract_layer_ids(&inspect.manifest, &inspect.config)
+        .context("extracting layer identifiers")?;
+
+    let mut layer_refs: Vec<LayerRef> = Vec::with_capacity(diff_ids_ordered.len());
+
+    for diff_id in &diff_ids_ordered {
+        let has = conn_dest
+            .has_layer(handle_dest, diff_id)
+            .await
+            .context("zlink transport error calling HasLayer")?
+            .map_err(|e: OciError| anyhow::anyhow!("HasLayer failed: {e:?}"))?;
+
+        let layer_verity = if has.present {
+            has.layer_verity
+                .context("HasLayer returned present=true but no layer_verity")?
+        } else {
+            let get_params = GetLayerParams {
+                diff_id: Some(diff_id.to_string()),
+                storage: None,
+            };
+            let mut get_stream = std::pin::pin!(
+                conn_src
+                    .get_layer(handle_src, get_params)
+                    .await
+                    .context("zlink transport error calling GetLayer")?
+            );
+            let mut all_fds: Vec<std::os::fd::OwnedFd> = Vec::new();
+            let mut get_reply = None;
+            while let Some(item) = get_stream.next().await {
+                let (result, fds) = item.context("GetLayer stream frame error")?;
+                let reply =
+                    result.map_err(|e: OciError| anyhow::anyhow!("GetLayer failed: {e:?}"))?;
+                get_reply = Some(reply);
+                all_fds.extend(fds);
+            }
+            let get_reply = get_reply.context("GetLayer returned empty stream")?;
+            let dir_count = get_reply.dir_count as usize;
+
+            let pipe_and_dirfds_len = 1 + dir_count;
+            let lifetime_fds = all_fds.split_off(pipe_and_dirfds_len);
+
+            let put_reply = conn_dest
+                .put_layer(handle_dest, diff_id, zerocopy, all_fds)
+                .await
+                .context("zlink transport error calling PutLayer")?
+                .map_err(|e: OciError| anyhow::anyhow!("PutLayer failed: {e:?}"))?;
+            drop(lifetime_fds);
+
+            put_reply.layer_verity
+        };
+
+        layer_refs.push(LayerRef {
+            diff_id: diff_id.clone(),
+            layer_verity,
+        });
+    }
+
+    let finalize = conn_dest
+        .finalize_image(
+            handle_dest,
+            &inspect.manifest,
+            &inspect.config,
+            layer_refs,
+            name,
+        )
+        .await
+        .context("zlink transport error calling FinalizeImage")?
+        .map_err(|e: OciError| anyhow::anyhow!("FinalizeImage failed: {e:?}"))?;
+
+    Ok(finalize)
+}
+
 /// Resolve an [`OciReference`] to an [`OciImage`].
 #[cfg(feature = "oci")]
 pub(crate) fn resolve_oci_image<ObjectID: FsVerityHashValue>(
@@ -1350,6 +1486,8 @@ where
     ObjectID: FsVerityHashValue,
 {
     let repo = Arc::new(repo);
+    #[cfg(feature = "oci")]
+    let dest_path = resolve_repo_path(&args)?;
     match args.cmd {
         Command::Init { .. } => {
             // Handled in run_app before we get here
@@ -1453,6 +1591,81 @@ where
                     println!("Boot image: {}", image_verity.to_hex());
                 }
             }
+            OciCommand::Copy {
+                ref image,
+                ref from,
+                ref name,
+                zerocopy,
+            } => {
+                use crate::varlink::proxy::RepositoryProxy;
+
+                let src_hash = resolve_hash_type(from, args.hash, !args.no_upgrade)
+                    .with_context(|| format!("opening source repository {}", from.display()))?;
+                let dest_hash = resolve_hash_type(&dest_path, args.hash, !args.no_upgrade)
+                    .with_context(|| {
+                        format!("opening destination repository {}", dest_path.display())
+                    })?;
+
+                if zerocopy && src_hash != dest_hash {
+                    anyhow::bail!(
+                        "--zerocopy requires matching hash algorithms; \
+                         source uses {src_hash:?} but destination uses {dest_hash:?}"
+                    );
+                }
+
+                let from_str = from.to_str().context("source path is not valid UTF-8")?;
+                let dest_str = dest_path
+                    .to_str()
+                    .context("destination path is not valid UTF-8")?;
+
+                let service_src = crate::varlink::CfsctlService::new();
+                let service_dest = crate::varlink::CfsctlService::new();
+
+                let (mut conn_src, _srv_src) = crate::varlink::spawn_in_process(service_src)
+                    .context("spawning source in-process service")?;
+                let (mut conn_dest, _srv_dest) = crate::varlink::spawn_in_process(service_dest)
+                    .context("spawning destination in-process service")?;
+
+                let handle_src = conn_src
+                    .open_repository(Some(from_str), None, None)
+                    .await
+                    .context("zlink transport error calling OpenRepository on source")?
+                    .map_err(|e| anyhow::anyhow!("OpenRepository failed on source: {e:?}"))?
+                    .handle;
+
+                let handle_dest = conn_dest
+                    .open_repository(Some(dest_str), None, None)
+                    .await
+                    .context("zlink transport error calling OpenRepository on destination")?
+                    .map_err(|e| anyhow::anyhow!("OpenRepository failed on destination: {e:?}"))?
+                    .handle;
+
+                let finalize_reply = copy_image(
+                    &mut conn_src,
+                    &mut conn_dest,
+                    handle_src,
+                    handle_dest,
+                    image,
+                    name.as_deref(),
+                    zerocopy,
+                )
+                .await?;
+
+                let tag_info = if let Some(n) = name {
+                    format!(", tagged as {n}")
+                } else {
+                    String::new()
+                };
+                println!(
+                    "Copied image {image} from {} to destination repo{}",
+                    from.display(),
+                    tag_info
+                );
+                println!("Manifest digest: {}", finalize_reply.manifest_digest);
+                println!("Manifest verity: {}", finalize_reply.manifest_verity);
+                println!("Config digest:   {}", finalize_reply.config_digest);
+                println!("Config verity:   {}", finalize_reply.config_verity);
+            }
             OciCommand::ListImages { json } => {
                 let images = composefs_oci::oci_image::list_images(&repo)?;
 
diff --git a/crates/composefs-ctl/src/varlink.rs b/crates/composefs-ctl/src/varlink.rs
index fedff358..ae610127 100644
--- a/crates/composefs-ctl/src/varlink.rs
+++ b/crates/composefs-ctl/src/varlink.rs
@@ -136,11 +136,44 @@ pub enum RepositoryError {
     },
 }
 
-/// Reply carrying an opaque repository handle.
+/// Reply carrying an opaque repository handle and basic repository metadata.
+///
+/// The `hash_algorithm` and `objects_device_id` fields let a client making a
+/// cross-repository copy decide whether zero-copy (reflink / hardlink)
+/// transfer is viable for a given source–destination pair:
+///
+/// * **`hash_algorithm`** — `"sha256"` or `"sha512"`. Hardlink (zero-copy)
+///   requires both repositories to use the same algorithm, because fs-verity
+///   is enabled on the *shared* inode. Reflink and regular copy work across
+///   algorithms (each produces a fresh inode re-digested under the
+///   destination's algorithm).
+///
+/// * **`objects_device_id`** — the `st_dev` of the repository's objects
+///   directory. Both reflink (`FICLONE`) and hardlink (`linkat`) require
+///   source and destination to reside on the same filesystem; comparing
+///   `objects_device_id` from both sides lets the client detect this up front.
+///   Note: `st_dev` is only meaningful when both servers share a mount
+///   namespace (the typical same-host deployment). If they do not, the
+///   worst case is a failed `PutLayer` (EXDEV), not silent data corruption.
 #[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
 pub struct OpenRepositoryReply {
     /// The opaque handle to pass to subsequent repository methods.
     pub handle: u64,
+
+    /// The fs-verity hash algorithm used by this repository (`"sha256"` or
+    /// `"sha512"`).
+    ///
+    /// `None` on old servers that do not report this field (serde default).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub hash_algorithm: Option<String>,
+
+    /// The `st_dev` of the repository's objects directory, as a decimal u64.
+    ///
+    /// Clients comparing two repositories should treat matching values as
+    /// "likely same filesystem" (and thus eligible for reflink/hardlink).
+    /// `None` on old servers.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub objects_device_id: Option<u64>,
 }
 
 /// Reply from initializing a repository.
@@ -163,6 +196,27 @@ pub(crate) enum OpenRepo {
     Sha512(Arc<Repository<Sha512HashValue>>),
 }
 
+impl OpenRepo {
+    /// The fs-verity hash algorithm name for this repository.
+    fn hash_algorithm(&self) -> &'static str {
+        match self {
+            OpenRepo::Sha256(_) => "sha256",
+            OpenRepo::Sha512(_) => "sha512",
+        }
+    }
+
+    /// The `st_dev` of the repository's objects directory, if available.
+    fn objects_device_id(&self) -> Option<u64> {
+        let stat_it = |fd: &std::os::fd::OwnedFd| -> Option<u64> {
+            rustix::fs::fstat(fd).ok().map(|s| s.st_dev)
+        };
+        match self {
+            OpenRepo::Sha256(r) => r.objects_dir().ok().and_then(stat_it),
+            OpenRepo::Sha512(r) => r.objects_dir().ok().and_then(stat_it),
+        }
+    }
+}
+
 /// A single entry in the service's open-repository table.
 #[derive(Debug)]
 struct HandleEntry {
@@ -225,6 +279,12 @@ pub(crate) struct CfsctlService {
     open_opts: OpenOptions,
 }
 
+impl Default for CfsctlService {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl CfsctlService {
     /// Construct an empty service with the given repository open options.
     ///
@@ -256,6 +316,24 @@ impl CfsctlService {
         Self::with_open_opts(OpenOptions::default())
     }
 
+    /// Construct a service with default open options.
+    pub(crate) fn new() -> Self {
+        Self::with_open_opts(OpenOptions::default())
+    }
+
+    /// Construct an insecure service for in-process tests.
+    ///
+    /// The `insecure` flag disables fs-verity requirements so that tests can
+    /// use repositories created on tmpfs or without verity support.
+    #[cfg(test)]
+    pub(crate) fn insecure_for_test() -> Self {
+        Self::with_open_opts(OpenOptions {
+            insecure: true,
+            require_verity: false,
+            no_upgrade: false,
+        })
+    }
+
     /// Allocate a fresh, never-reused handle. Starts at `1` (`0` is "none").
     fn next_handle(&mut self) -> u64 {
         self.next_handle += 1;
@@ -285,7 +363,8 @@ impl CfsctlService {
             .ok_or(oci::OciError::InvalidHandle { handle })
     }
 
-    /// Resolve, open and register a repository at `path`, returning its handle.
+    /// Resolve, open and register a repository at `path`, returning the reply
+    /// with the handle and repository metadata.
     ///
     /// The digest algorithm is detected from the repository metadata; both
     /// resolution and open failures are reported as
@@ -294,7 +373,7 @@ impl CfsctlService {
         &mut self,
         path: &Path,
         owner: Option<usize>,
-    ) -> std::result::Result<u64, RepositoryError> {
+    ) -> std::result::Result<OpenRepositoryReply, RepositoryError> {
         let hash_type = resolve_hash_type(path, None, !self.open_opts.no_upgrade).map_err(|e| {
             RepositoryError::RepoNotFound {
                 message: format!("{e:#}"),
@@ -325,8 +404,14 @@ impl CfsctlService {
             )),
         };
         let handle = self.next_handle();
+        let hash_algorithm = Some(repo.hash_algorithm().to_string());
+        let objects_device_id = repo.objects_device_id();
         self.repos.insert(handle, HandleEntry { repo, owner });
-        Ok(handle)
+        Ok(OpenRepositoryReply {
+            handle,
+            hash_algorithm,
+            objects_device_id,
+        })
     }
 
     /// Resolve a repository selector (`path`/`user`/`system`) to a path.
@@ -794,8 +879,7 @@ mod service_impl {
             #[zlink(connection)] conn: &mut zlink::Connection<Sock>,
         ) -> std::result::Result<OpenRepositoryReply, RepositoryError> {
             let selected = Self::resolve_selector(path, user, system)?;
-            let handle = self.do_open(&selected, Some(conn.id()))?;
-            Ok(OpenRepositoryReply { handle })
+            self.do_open(&selected, Some(conn.id()))
         }
 
         /// Close a previously opened repository handle.
@@ -893,6 +977,9 @@ mod service_impl {
 mod service_impl {
     #![allow(missing_docs)]
 
+    use super::layer_sync::{
+        FinalizeImageReply, GetInfoReply, GetLayerReply, HasLayerReply, LayerRef, PutLayerReply,
+    };
     use super::oci::{
         ListImagesReply, OciComputeIdReply, OciError, OciFsckReply, OciInspectReply, PullProgress,
         parse_local_fetch, pull_stream,
@@ -903,7 +990,10 @@ mod service_impl {
         run_gc, run_image_objects, run_init_repository, run_inspect, run_list_images, run_mount,
         run_oci_fsck, run_oci_mount, run_tag, run_untag,
     };
-    use composefs::fsverity::{Algorithm, Sha256HashValue, Sha512HashValue};
+    use composefs::fsverity::{Algorithm, FsVerityHashValue, Sha256HashValue, Sha512HashValue};
+    use composefs_oci::layer_transport::{RepoLayerSource, serve_get_layer};
+    use composefs_oci::varlink_types::GetLayerParams;
+    use composefs_splitdirfdstream::seed_from_id;
 
     #[zlink::service(
         interface = "org.composefs.Repository",
@@ -952,8 +1042,7 @@ mod service_impl {
             #[zlink(connection)] conn: &mut zlink::Connection<Sock>,
         ) -> std::result::Result<OpenRepositoryReply, RepositoryError> {
             let selected = Self::resolve_selector(path, user, system)?;
-            let handle = self.do_open(&selected, Some(conn.id()))?;
-            Ok(OpenRepositoryReply { handle })
+            self.do_open(&selected, Some(conn.id()))
         }
 
         /// Close a previously opened repository handle.
@@ -1158,7 +1247,7 @@ mod service_impl {
             bootable: bool,
         ) -> impl zlink::futures_util::Stream<
             Item = std::result::Result<zlink::Reply<PullProgress>, OciError>,
-        > + Send {
+        > {
             let lf = parse_local_fetch(&local_fetch);
             let sr = storage_root.map(std::path::PathBuf::from);
             // Resolve the handle synchronously and clone an owned Arc out so the
@@ -1213,6 +1302,423 @@ mod service_impl {
                 Err(e) => (Err(e), vec![]),
             }
         }
+
+        // --- org.composefs.Oci (layer-sync methods) ---
+        //
+        // These methods were previously under org.composefs.LayerSync but have
+        // been folded into the Oci interface. Each carries an explicit `interface`
+        // annotation so the wire names land under the correct interface namespace.
+
+        /// Return the capability tokens supported by this service.
+        ///
+        /// Currently advertises `"splitdirfdstream-v0"`.
+        #[zlink(interface = "org.composefs.Oci")]
+        async fn get_info(&self) -> std::result::Result<GetInfoReply, OciError> {
+            Ok(GetInfoReply {
+                features: vec!["splitdirfdstream-v0".into()],
+            })
+        }
+
+        /// Check whether the layer splitstream for `diff_id` is present.
+        ///
+        /// Returns `present = true` and the hex verity if found; `present =
+        /// false` and `layer_verity = None` if not.
+        #[zlink(interface = "org.composefs.Oci")]
+        async fn has_layer(
+            &self,
+            handle: u64,
+            diff_id: String,
+        ) -> std::result::Result<HasLayerReply, OciError> {
+            let diff_id_parsed: composefs_oci::OciDigest =
+                diff_id.parse().map_err(|e| OciError::InvalidDigest {
+                    message: format!("{e}"),
+                })?;
+            let content_id = composefs_oci::layer_content_id(&diff_id_parsed);
+
+            fn check<ObjectID: FsVerityHashValue>(
+                repo: &composefs::repository::Repository<ObjectID>,
+                content_id: &str,
+            ) -> std::result::Result<HasLayerReply, OciError> {
+                match repo
+                    .has_stream(content_id)
+                    .map_err(|e| OciError::InternalError {
+                        message: format!("{e:#}"),
+                    })? {
+                    Some(verity) => Ok(HasLayerReply {
+                        present: true,
+                        layer_verity: Some(verity.to_hex()),
+                    }),
+                    None => Ok(HasLayerReply {
+                        present: false,
+                        layer_verity: None,
+                    }),
+                }
+            }
+
+            match self.lookup_oci(handle)? {
+                OpenRepo::Sha256(ref r) => check::<Sha256HashValue>(r, &content_id),
+                OpenRepo::Sha512(ref r) => check::<Sha512HashValue>(r, &content_id),
+            }
+        }
+
+        /// Stream the layer as a `splitdirfdstream` over a pipe, with the full
+        /// hardened streaming fd-transport contract.
+        ///
+        /// This is a **streaming** method (`more`): it yields multiple frames,
+        /// each carrying a batch of FDs.  The client must concatenate FD batches
+        /// from all frames to reconstruct the logical array:
+        ///
+        /// ```text
+        /// [ pipe_read | <dirfds region: dir_count fds> | <lifetime fds: keepalive + extras> ]
+        /// ```
+        ///
+        /// The dirfds region uses sparse placement (hash-determined slot assignment);
+        /// lifetime fds are opaque tokens the client must hold until done reading.
+        ///
+        /// **Non-streaming** (`more=false`): all fds in a single frame; returns
+        /// `FdLimitExceeded` if the total exceeds `MAX_FDS_PER_FRAME` (retry with
+        /// `more=true`).
+        ///
+        /// The producer runs on `spawn_blocking` so the async task is never blocked.
+        /// For the repo case there is no external lock to release, so `keepalive_read`
+        /// is moved into the producer closure and dropped when the producer finishes.
+        #[zlink(interface = "org.composefs.Oci", more, return_fds)]
+        async fn get_layer(
+            &self,
+            more: bool,
+            handle: u64,
+            params: GetLayerParams,
+            #[zlink(fds)] _fds: Vec<std::os::fd::OwnedFd>,
+        ) -> impl zlink::futures_util::Stream<
+            Item = (
+                std::result::Result<zlink::Reply<GetLayerReply>, OciError>,
+                Vec<std::os::fd::OwnedFd>,
+            ),
+        > + Unpin {
+            use zlink::futures_util::stream::{self, StreamExt as _};
+
+            type StreamItem = (
+                std::result::Result<zlink::Reply<GetLayerReply>, OciError>,
+                Vec<std::os::fd::OwnedFd>,
+            );
+
+            macro_rules! err_stream {
+                ($e:expr) => {
+                    return stream::iter(std::iter::once::<StreamItem>((Err($e), vec![])))
+                        .left_stream()
+                };
+            }
+
+            // ── Extract diff_id from params (repo service requires it) ─────────
+            let diff_id = match params.diff_id {
+                Some(d) => d,
+                None => err_stream!(OciError::InvalidRequest {
+                    message: "GetLayer: diff_id is required for the repo service".into(),
+                }),
+            };
+
+            // ── Parse diff_id ─────────────────────────────────────────────────
+            let diff_id_parsed: composefs_oci::OciDigest = match diff_id.parse() {
+                Ok(d) => d,
+                Err(e) => err_stream!(OciError::InvalidDigest {
+                    message: format!("{e}"),
+                }),
+            };
+            let content_id = composefs_oci::layer_content_id(&diff_id_parsed);
+
+            // ── Drive serve_get_layer via the LayerSource trait ───────────────
+            fn do_serve_get_layer<ObjectID: FsVerityHashValue>(
+                repo: &std::sync::Arc<composefs::repository::Repository<ObjectID>>,
+                content_id: &str,
+                diff_id_str: &str,
+                more: bool,
+            ) -> std::result::Result<composefs_oci::layer_transport::GetLayerFrames, OciError>
+            {
+                let verity = repo
+                    .has_stream(content_id)
+                    .map_err(|e| OciError::InternalError {
+                        message: format!("{e:#}"),
+                    })?
+                    .ok_or_else(|| OciError::NoSuchLayer {
+                        diff_id: diff_id_str.to_string(),
+                    })?;
+
+                let seed = seed_from_id(content_id);
+                let source = RepoLayerSource {
+                    repo: repo.clone(),
+                    layer_verity: verity,
+                };
+
+                serve_get_layer(source, seed, more).map_err(|e| match e {
+                    composefs_oci::layer_transport::ServeGetLayerError::FdLimitExceeded(e) => {
+                        OciError::FdLimitExceeded {
+                            fd_count: e.fd_count as u64,
+                            max_per_frame: e.max_per_frame as u64,
+                        }
+                    }
+                    composefs_oci::layer_transport::ServeGetLayerError::Other(e) => {
+                        OciError::InternalError {
+                            message: format!("{e:#}"),
+                        }
+                    }
+                })
+            }
+
+            let frames = match self.lookup_oci(handle) {
+                Ok(OpenRepo::Sha256(ref r)) => {
+                    do_serve_get_layer::<Sha256HashValue>(r, &content_id, &diff_id, more)
+                }
+                Ok(OpenRepo::Sha512(ref r)) => {
+                    do_serve_get_layer::<Sha512HashValue>(r, &content_id, &diff_id, more)
+                }
+                Err(e) => Err(e),
+            };
+
+            let frames = match frames {
+                Ok(f) => f,
+                Err(e) => err_stream!(e),
+            };
+
+            let dir_count = frames.dir_count;
+            let batches = frames.batches;
+            let n_frames = batches.len();
+            let reply = GetLayerReply { dir_count };
+
+            stream::iter(batches.into_iter().enumerate().map(move |(i, batch)| {
+                let is_last = i == n_frames - 1;
+                (
+                    Ok(zlink::Reply::new(Some(reply.clone())).set_continues(Some(!is_last))),
+                    batch,
+                )
+            }))
+            .right_stream()
+        }
+
+        /// Receive a layer as a `splitdirfdstream` from the client and import
+        /// it into the server's repository, verifying content integrity.
+        ///
+        /// The client supplies:
+        /// * `fds[0]` — read end of a pipe carrying the `splitdirfdstream` bytes.
+        /// * `fds[1..]` — source object directories (the splitdirfdstream's
+        ///   `dirfd_index` selects among them; objects dir is index 0).
+        ///
+        /// The server runs the verified drain on a `spawn_blocking` thread so the
+        /// async task is not blocked while data flows through the pipe.  The layer
+        /// content is only committed if its reconstructed sha256 matches `diff_id`;
+        /// on mismatch [`OciError::DiffIdMismatch`] is returned and no stream
+        /// is committed.
+        ///
+        /// The server always drains the pipe to avoid wedging the client's writer
+        /// even if the layer is already present — the import is idempotent.
+        #[zlink(interface = "org.composefs.Oci")]
+        async fn put_layer(
+            &self,
+            handle: u64,
+            diff_id: String,
+            zerocopy: bool,
+            #[zlink(fds)] fds: Vec<std::os::fd::OwnedFd>,
+        ) -> std::result::Result<PutLayerReply, OciError> {
+            // Validate the fd count: fds[0] = pipe read, fds[1..] = dir fds.
+            if fds.len() < 2 {
+                return Err(OciError::InvalidRequest {
+                    message: format!(
+                        "expected at least 2 fds (1 pipe + >=1 dir fd), got {}",
+                        fds.len()
+                    ),
+                });
+            }
+
+            let diff_id_parsed: composefs_oci::OciDigest =
+                diff_id.parse().map_err(|e| OciError::InvalidDigest {
+                    message: format!("{e}"),
+                })?;
+
+            let content_id = composefs_oci::layer_content_id(&diff_id_parsed);
+
+            // Check whether the layer is already present (for the reply flag).
+            // We still proceed with the drain regardless to avoid wedging the
+            // client's writer if it is already producing.
+            let already_present = match self.lookup_oci(handle)? {
+                OpenRepo::Sha256(ref r) => r
+                    .has_stream(&content_id)
+                    .map_err(|e| OciError::InternalError {
+                        message: format!("{e:#}"),
+                    })?
+                    .is_some(),
+                OpenRepo::Sha512(ref r) => r
+                    .has_stream(&content_id)
+                    .map_err(|e| OciError::InternalError {
+                        message: format!("{e:#}"),
+                    })?
+                    .is_some(),
+            };
+
+            // Split fds: pipe_read + dir_fds.
+            let mut fds = fds;
+            let pipe_read = fds.remove(0);
+            let dir_fds = fds; // remaining fds are the dir fds
+
+            async fn run_put_layer<ObjectID: FsVerityHashValue>(
+                repo: std::sync::Arc<composefs::repository::Repository<ObjectID>>,
+                pipe_read: std::os::fd::OwnedFd,
+                dir_fds: Vec<std::os::fd::OwnedFd>,
+                diff_id: composefs_oci::OciDigest,
+                zerocopy: bool,
+                already_present: bool,
+            ) -> std::result::Result<PutLayerReply, OciError> {
+                tokio::task::spawn_blocking(move || {
+                    composefs_oci::layer_sync::drain_splitdirfdstream_verified(
+                        repo,
+                        pipe_read,
+                        dir_fds,
+                        &diff_id,
+                        zerocopy,
+                        composefs::repository::ImportContext::default(),
+                    )
+                })
+                .await
+                .map_err(|e| OciError::InternalError {
+                    message: format!("spawn_blocking panic: {e}"),
+                })?
+                .map(|(verity, stats, _ctx)| PutLayerReply {
+                    layer_verity: verity.to_hex(),
+                    already_present,
+                    objects_reflinked: stats.objects_reflinked,
+                    objects_hardlinked: stats.objects_hardlinked,
+                    objects_copied: stats.objects_copied,
+                    objects_already_present: stats.objects_already_present,
+                })
+                .map_err(|e| match e {
+                    composefs_oci::layer_sync::VerifiedDrainError::DiffIdMismatch {
+                        expected,
+                        actual,
+                    } => OciError::DiffIdMismatch { expected, actual },
+                    composefs_oci::layer_sync::VerifiedDrainError::Other(err) => {
+                        OciError::InternalError {
+                            message: format!("{err:#}"),
+                        }
+                    }
+                })
+            }
+
+            match self.lookup_oci(handle)? {
+                OpenRepo::Sha256(ref r) => {
+                    run_put_layer::<Sha256HashValue>(
+                        r.clone(),
+                        pipe_read,
+                        dir_fds,
+                        diff_id_parsed,
+                        zerocopy,
+                        already_present,
+                    )
+                    .await
+                }
+                OpenRepo::Sha512(ref r) => {
+                    run_put_layer::<Sha512HashValue>(
+                        r.clone(),
+                        pipe_read,
+                        dir_fds,
+                        diff_id_parsed,
+                        zerocopy,
+                        already_present,
+                    )
+                    .await
+                }
+            }
+        }
+
+        /// Finalize an OCI image after all layers have been imported.
+        ///
+        /// Given the raw manifest and config JSON bytes and the ordered list of
+        /// `(diff_id, layer_verity)` pairs (as returned by `PutLayer`), this
+        /// method writes the config and manifest splitstreams, generates the
+        /// composefs EROFS image, and optionally tags the manifest. Idempotent.
+        ///
+        /// Returns the digest and verity strings for both the manifest and config
+        /// splitstreams.
+        #[zlink(interface = "org.composefs.Oci")]
+        async fn finalize_image(
+            &self,
+            handle: u64,
+            manifest_json: String,
+            config_json: String,
+            layers: Vec<LayerRef>,
+            name: Option<String>,
+        ) -> std::result::Result<FinalizeImageReply, OciError> {
+            async fn run_finalize<ObjectID: FsVerityHashValue>(
+                repo: std::sync::Arc<composefs::repository::Repository<ObjectID>>,
+                manifest_json: String,
+                config_json: String,
+                layers: Vec<LayerRef>,
+                name: Option<String>,
+            ) -> std::result::Result<FinalizeImageReply, OciError> {
+                // Parse each LayerRef into (OciDigest, ObjectID).
+                let mut layer_refs: Vec<(composefs_oci::OciDigest, ObjectID)> =
+                    Vec::with_capacity(layers.len());
+                for lr in &layers {
+                    let diff_id: composefs_oci::OciDigest =
+                        lr.diff_id.parse().map_err(|e| OciError::InvalidDigest {
+                            message: format!("diff_id {:?}: {e}", lr.diff_id),
+                        })?;
+                    let verity = ObjectID::from_hex(&lr.layer_verity).map_err(|e| {
+                        OciError::InvalidDigest {
+                            message: format!("layer_verity {:?}: {e}", lr.layer_verity),
+                        }
+                    })?;
+                    layer_refs.push((diff_id, verity));
+                }
+
+                tokio::task::spawn_blocking(move || {
+                    composefs_oci::layer_sync::finalize_oci_image(
+                        &repo,
+                        manifest_json.as_bytes(),
+                        config_json.as_bytes(),
+                        &layer_refs,
+                        name.as_deref(),
+                    )
+                })
+                .await
+                .map_err(|e| OciError::InternalError {
+                    message: format!("spawn_blocking panic: {e}"),
+                })?
+                .map(
+                    |((manifest_digest, manifest_verity), (config_digest, config_verity))| {
+                        FinalizeImageReply {
+                            manifest_digest: manifest_digest.to_string(),
+                            manifest_verity: manifest_verity.to_hex(),
+                            config_digest: config_digest.to_string(),
+                            config_verity: config_verity.to_hex(),
+                        }
+                    },
+                )
+                .map_err(|e| OciError::InternalError {
+                    message: format!("{e:#}"),
+                })
+            }
+
+            match self.lookup_oci(handle)? {
+                OpenRepo::Sha256(ref r) => {
+                    run_finalize::<Sha256HashValue>(
+                        r.clone(),
+                        manifest_json,
+                        config_json,
+                        layers,
+                        name,
+                    )
+                    .await
+                }
+                OpenRepo::Sha512(ref r) => {
+                    run_finalize::<Sha512HashValue>(
+                        r.clone(),
+                        manifest_json,
+                        config_json,
+                        layers,
+                        name,
+                    )
+                    .await
+                }
+            }
+        }
     }
 }
 
@@ -1239,14 +1745,31 @@ impl zlink::Listener for ActivatedListener {
     }
 }
 
-/// Try to build an [`ActivatedListener`] from a socket-activated fd.
+/// An inherited socket-activation fd, classified by its listening state.
+pub(crate) enum ActivatedSocket {
+    /// A pre-connected stream (`varlinkctl exec:` transport): one connection
+    /// on fd 3. Served via [`ActivatedListener`].
+    Connected(ActivatedListener),
+    /// A listening socket (systemd `.socket` with `Accept=no`, or the test
+    /// harness): served with a normal accept loop.
+    Listening(zlink::unix::Listener),
+}
+
+/// Try to classify a socket-activation fd inherited from the service manager.
+///
+/// Uses `libsystemd` to receive file descriptors (checks `LISTEN_FDS`/
+/// `LISTEN_PID` and clears the env vars). Returns `None` when the process
+/// was not socket-activated.
 ///
-/// Uses `libsystemd` to receive file descriptors passed by the service
-/// manager (checks `LISTEN_FDS`/`LISTEN_PID` and clears the env vars).
-/// Returns `None` when the process was not socket-activated.
+/// When a fd is present its socket type is inspected via `SO_ACCEPTCONN`:
+/// - **Listening**: the fd is a bound, listening socket (e.g. passed by the
+///   test harness or a systemd `.socket` unit with `Accept=no`) — wrapped as
+///   [`ActivatedSocket::Listening`].
+/// - **Connected**: the fd is an already-connected stream (e.g. `varlinkctl
+///   exec:`) — wrapped as [`ActivatedSocket::Connected`].
 #[allow(unsafe_code)]
-pub(crate) fn try_activated_listener() -> Result<Option<ActivatedListener>> {
-    use std::os::fd::{FromRawFd as _, IntoRawFd as _};
+pub(crate) fn try_activated_listener() -> Result<Option<ActivatedSocket>> {
+    use std::os::fd::{FromRawFd as _, IntoRawFd as _, OwnedFd};
 
     let fds = libsystemd::activation::receive_descriptors(true)
         .map_err(|e| anyhow::anyhow!("Failed to receive activation fds: {e}"))?;
@@ -1256,56 +1779,100 @@ pub(crate) fn try_activated_listener() -> Result<Option<ActivatedListener>> {
         None => return Ok(None),
     };
 
-    // SAFETY: `libsystemd::activation::receive_descriptors(true)` validated
-    // the fd and transferred ownership. `into_raw_fd()` consumes the
-    // `FileDescriptor` wrapper, giving us sole ownership of a valid fd.
-    let std_stream = unsafe { std::os::unix::net::UnixStream::from_raw_fd(fd.into_raw_fd()) };
-    std_stream
-        .set_nonblocking(true)
-        .context("setting systemd socket to non-blocking")?;
-    let tokio_stream = tokio::net::UnixStream::from_std(std_stream)
-        .context("converting systemd UnixStream to tokio")?;
-    let zlink_stream = zlink::unix::Stream::from(tokio_stream);
-    let conn = zlink::Connection::from(zlink_stream);
-    Ok(Some(ActivatedListener { conn: Some(conn) }))
+    // SAFETY: `receive_descriptors` validated the fd and transferred ownership
+    // via `IntoRawFd`.  We immediately re-wrap the raw integer as an `OwnedFd`
+    // so that Rust's ownership rules track the fd from this point forward.
+    let owned: OwnedFd = unsafe { OwnedFd::from_raw_fd(fd.into_raw_fd()) };
+
+    // Query SO_ACCEPTCONN to distinguish a pre-connected stream (varlinkctl
+    // `exec:`) from a listening socket (systemd socket unit / test harness).
+    let is_listening = rustix::net::sockopt::socket_acceptconn(&owned)
+        .context("querying SO_ACCEPTCONN on activation fd")?;
+
+    if is_listening {
+        // The fd is a bound, listening Unix socket.  Hand it to zlink's
+        // Listener adapter, which calls set_nonblocking and wraps it in tokio.
+        let listener = zlink::unix::Listener::try_from(owned)
+            .context("converting listening activation fd to zlink Listener")?;
+        Ok(Some(ActivatedSocket::Listening(listener)))
+    } else {
+        // The fd is an already-connected stream (e.g. varlinkctl exec:).
+        // `From<OwnedFd>` for `UnixStream` is safe — ownership is transferred.
+        let std_stream = std::os::unix::net::UnixStream::from(owned);
+        std_stream
+            .set_nonblocking(true)
+            .context("setting systemd socket to non-blocking")?;
+        let tokio_stream = tokio::net::UnixStream::from_std(std_stream)
+            .context("converting systemd UnixStream to tokio")?;
+        let zlink_stream = zlink::unix::Stream::from(tokio_stream);
+        let conn = zlink::Connection::from(zlink_stream);
+        Ok(Some(ActivatedSocket::Connected(ActivatedListener {
+            conn: Some(conn),
+        })))
+    }
 }
 
-/// Serve `service` on an already-obtained socket-activated listener.
+/// Serve `service` on an already-obtained socket-activated connected listener.
 ///
 /// Status is logged, never written to stdout: under socket activation (e.g.
 /// varlinkctl's `exec:` transport) the parent may treat our stdout as part of
 /// the protocol handshake, and any stray bytes there reset the connection.
+///
+/// The server loop runs inside a [`tokio::task::LocalSet`] so request handlers
+/// can `spawn_local` `!Send` work (see [`pull_stream`]).  Both serve paths
+/// wrap exactly one `LocalSet`.
 pub(crate) async fn serve_activated<S>(service: S, listener: ActivatedListener) -> Result<()>
 where
     S: zlink::Service<zlink::unix::Stream>,
 {
     log::info!("Listening on systemd-activated socket");
     let server = zlink::Server::new(listener, service);
-    server
-        .run()
+    tokio::task::LocalSet::new()
+        .run_until(server.run())
         .await
         .context("running varlink server (activated)")
 }
 
-/// Serve `service` either on a systemd-activated connected socket or by
-/// binding a fresh Unix socket at `address`.
+/// Serve `service` on a listening [`zlink::unix::Listener`] inside a
+/// [`tokio::task::LocalSet`].
+///
+/// Used for both the socket-activated listening fd path and the normal
+/// `bind`-a-fresh-socket path (see [`serve`]).
+pub(crate) async fn serve_on_listener<S>(service: S, listener: zlink::unix::Listener) -> Result<()>
+where
+    S: zlink::Service<zlink::unix::Stream>,
+{
+    let server = zlink::Server::new(listener, service);
+    tokio::task::LocalSet::new()
+        .run_until(server.run())
+        .await
+        .context("running varlink server")
+}
+
+/// Serve `service` on the appropriate socket, auto-detecting the source.
 ///
-/// Socket activation takes precedence: if the process was started with a
-/// connected activation socket, `address` is ignored. Otherwise `address`
-/// must be `Some`, or this returns an error.
+/// Resolution order:
+/// 1. A socket-activation fd inherited from the service manager:
+///    - If listening (`SO_ACCEPTCONN`): serve with a normal accept loop.
+///    - If connected (`varlinkctl exec:`): serve single-shot.
+/// 2. A freshly bound socket at `address` (which must be `Some`).
 pub(crate) async fn serve<S>(service: S, address: Option<&Path>) -> Result<()>
 where
     S: zlink::Service<zlink::unix::Stream>,
 {
-    if let Some(activated) = try_activated_listener()? {
-        return serve_activated(service, activated).await;
+    match try_activated_listener()? {
+        Some(ActivatedSocket::Connected(l)) => return serve_activated(service, l).await,
+        Some(ActivatedSocket::Listening(listener)) => {
+            log::info!("Listening on systemd-activated socket");
+            return serve_on_listener(service, listener).await;
+        }
+        None => {}
     }
     let address = address.context("no --address given and not socket-activated")?;
     let listener = zlink::unix::bind(address)
         .with_context(|| format!("binding varlink socket at {}", address.display()))?;
     log::info!("Listening on {}", address.display());
-    let server = zlink::Server::new(listener, service);
-    server.run().await.context("running varlink server")
+    serve_on_listener(service, listener).await
 }
 
 /// Varlink support for the OCI interface (`org.composefs.Oci`).
@@ -1689,6 +2256,10 @@ pub mod oci {
     /// When `more` is `false` the client asked for a single reply, so no progress
     /// reporter is attached and the stream yields only the terminal `completed`
     /// frame (or an error).
+    ///
+    /// The pull task uses [`tokio::task::spawn_local`], not [`tokio::spawn`]:
+    /// `composefs_oci::pull` is `!Send` (the `get_layer` zlink proxy returns a
+    /// `!Send` `ReplyStream`), and the server loop runs inside a `LocalSet`.
     #[allow(clippy::too_many_arguments)]
     pub(crate) fn pull_stream<ObjectID: FsVerityHashValue>(
         repo: Arc<Repository<ObjectID>>,
@@ -1702,7 +2273,7 @@ pub mod oci {
         Box<
             dyn zlink::futures_util::Stream<
                     Item = std::result::Result<zlink::Reply<PullProgress>, OciError>,
-                > + Send,
+                >,
         >,
     > {
         use zlink::futures_util::stream;
@@ -1720,7 +2291,7 @@ pub mod oci {
         // and sends it through the channel before the sender drops.  Pull errors
         // are carried out via the task's `JoinHandle` return value.
         let task_tx = tx.clone();
-        let handle = tokio::spawn(async move {
+        let handle = tokio::task::spawn_local(async move {
             let opts = composefs_oci::PullOptions {
                 local_fetch,
                 storage_root: storage_root.as_deref(),
@@ -1848,6 +2419,164 @@ pub mod oci {
             /// Description of the failure.
             message: String,
         },
+        /// The requested layer (by diff-id) is not present in the repository.
+        NoSuchLayer {
+            /// The diff-id that was not found.
+            diff_id: String,
+        },
+        /// A supplied digest/diff-id string was malformed.
+        InvalidDigest {
+            /// Human-readable description of the parse failure.
+            message: String,
+        },
+        /// Received layer content did not hash to the declared diff-id.
+        ///
+        /// The stream was NOT committed; the client must retry with correct data.
+        DiffIdMismatch {
+            /// The diff_id that was declared by the client.
+            expected: String,
+            /// The sha256 digest of the data that was actually received.
+            actual: String,
+        },
+        /// The request was malformed (e.g. wrong fd count).
+        InvalidRequest {
+            /// Human-readable description of what was wrong.
+            message: String,
+        },
+        /// The total fd count exceeds [`MAX_FDS_PER_FRAME`] for a `more=false` call.
+        ///
+        /// The client must retry with `more=true` (streaming mode).
+        FdLimitExceeded {
+            /// Total number of fds that would be sent.
+            fd_count: u64,
+            /// The per-frame cap that was exceeded.
+            max_per_frame: u64,
+        },
+    }
+}
+
+/// Reply types for the layer-sync methods of the `org.composefs.Oci` interface,
+/// gated behind the `oci` feature (they depend on [`composefs_oci::layer_sync`]).
+///
+/// The four layer-sync methods (`GetInfo`, `HasLayer`, `GetLayer`, `PutLayer`)
+/// are part of `org.composefs.Oci`; this module merely collects their reply
+/// structs to keep them separate from the rest of the OCI wire types.
+#[cfg(feature = "oci")]
+pub mod layer_sync {
+    use super::*;
+
+    /// Reply from `GetInfo`: capability tokens supported by this service.
+    #[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
+    pub struct GetInfoReply {
+        /// Capability tokens advertised by this service instance.
+        ///
+        /// Currently only `"splitdirfdstream-v0"` is defined.
+        pub features: Vec<String>,
+    }
+
+    /// Reply from `HasLayer`: whether the layer is present in the repository.
+    #[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
+    pub struct HasLayerReply {
+        /// Whether the layer splitstream for the given diff-id is present.
+        pub present: bool,
+        /// Hex-encoded fs-verity hash of the layer splitstream, if present.
+        pub layer_verity: Option<String>,
+    }
+
+    /// Reply from `GetLayer`: the number of diff-directory slots in the logical FD array.
+    ///
+    /// `GetLayer` is a **streaming** method (`more`): it yields multiple frames,
+    /// each carrying a batch of FDs.  The client MUST concatenate the FD batches
+    /// from all frames (in arrival order) to reconstruct the full logical FD array:
+    ///
+    /// - `fds[0]` — data pipe read end (carries the `splitdirfdstream` bytes).
+    /// - `fds[1..=dir_count]` — the dirfds region (`dir_count` slots total).  The
+    ///   real objects-directory fd sits at a sparse, hash-determined index within
+    ///   this region; the remaining (gap) slots hold inert dummy fds that
+    ///   `reconstruct` never dereferences.  The sparse placement is encoded in each
+    ///   `FileBackedData` chunk's `dirfd_index`; the client passes the whole region
+    ///   to `drain_splitdirfdstream` / `reconstruct` unchanged and must NOT assume
+    ///   the dir is at a fixed index.
+    /// - `fds[dir_count+1..]` — opaque lifetime FDs.  The client MUST hold every
+    ///   one of these open until it has finished reading and processing all dir fds,
+    ///   then close them all to signal completion to the server.  The count of
+    ///   trailing FDs is unspecified by contract; the client keeps open whatever it
+    ///   does not otherwise recognise.  This lifetime-FD convention is part of the
+    ///   `splitdirfdstream-v0` feature.
+    ///
+    /// Each transport frame carries at most `MAX_FDS_PER_FRAME` (240) fds, safely
+    /// below the kernel `SCM_MAX_FD` (253) limit.  Every frame carries the same
+    /// `dir_count`; the client should use the value from any frame (they are all
+    /// identical).  The stream terminates when a frame with `continues=false` is
+    /// received.
+    ///
+    /// A non-streaming (`more=false`) call delivers all fds in a single frame; if
+    /// the layer requires more than `MAX_FDS_PER_FRAME` fds the call returns
+    /// `FdLimitExceeded` and the client must retry with `more=true`.
+    #[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
+    pub struct GetLayerReply {
+        /// Number of diff-directory file descriptors in the full logical FD array
+        /// (i.e. `fds[1..=dir_count]` after concatenating all frames' batches).
+        pub dir_count: u32,
+    }
+
+    /// Reply from `PutLayer`: the verity hash of the imported layer, whether
+    /// it was already present, and per-object transfer statistics.
+    ///
+    /// The object-count fields let the client verify that zero-copy transfer
+    /// actually took place (e.g. assert `objects_reflinked > 0` in tests) and
+    /// accumulate aggregate stats for user-facing output.
+    #[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
+    pub struct PutLayerReply {
+        /// Hex-encoded fs-verity hash of the committed layer splitstream.
+        pub layer_verity: String,
+        /// `true` if the layer was already present before this call.
+        ///
+        /// The server always drains the pipe regardless (to avoid wedging
+        /// the client's writer), so the stream is re-imported idempotently.
+        pub already_present: bool,
+
+        /// Number of objects that were reflinked (FICLONE) into the
+        /// destination. Non-zero only when source and dest share a filesystem.
+        #[serde(default)]
+        pub objects_reflinked: u64,
+        /// Number of objects hardlinked into the destination (zerocopy mode).
+        #[serde(default)]
+        pub objects_hardlinked: u64,
+        /// Number of objects byte-copied into the destination.
+        #[serde(default)]
+        pub objects_copied: u64,
+        /// Number of objects already present in the destination (skipped).
+        #[serde(default)]
+        pub objects_already_present: u64,
+    }
+
+    /// A single (diff_id, layer_verity) pair passed to `FinalizeImage`.
+    ///
+    /// The client builds this list from the `PutLayer` replies it received while
+    /// copying layers to the destination repository.  The order must match the
+    /// manifest layer order.
+    #[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
+    pub struct LayerRef {
+        /// OCI diff-id of the layer (e.g. `"sha256:abcd..."`).
+        pub diff_id: String,
+        /// Hex-encoded fs-verity hash of the layer splitstream in the destination
+        /// repository, as returned by `PutLayer`.
+        pub layer_verity: String,
+    }
+
+    /// Reply from `FinalizeImage`: digest and verity strings for the manifest
+    /// and config splitstreams that were written (or already existed).
+    #[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
+    pub struct FinalizeImageReply {
+        /// OCI digest of the manifest (e.g. `"sha256:abcd..."`).
+        pub manifest_digest: String,
+        /// Hex-encoded fs-verity hash of the manifest splitstream.
+        pub manifest_verity: String,
+        /// OCI digest of the config (e.g. `"sha256:abcd..."`).
+        pub config_digest: String,
+        /// Hex-encoded fs-verity hash of the config splitstream.
+        pub config_verity: String,
     }
 }
 
@@ -1859,6 +2588,10 @@ pub mod oci {
 pub mod proxy {
     #![allow(missing_docs)]
 
+    #[cfg(feature = "oci")]
+    use super::layer_sync::{
+        FinalizeImageReply, GetInfoReply, GetLayerReply, HasLayerReply, LayerRef, PutLayerReply,
+    };
     #[cfg(feature = "oci")]
     use super::oci::{
         ListImagesReply, OciComputeIdReply, OciError, OciFsckReply, OciInspectReply, PullProgress,
@@ -1868,6 +2601,8 @@ pub mod proxy {
         RepositoryError,
     };
     #[cfg(feature = "oci")]
+    pub use composefs_oci::varlink_types::GetLayerParams;
+    #[cfg(feature = "oci")]
     use zlink::futures_util::Stream;
 
     /// Typed client for the `org.composefs.Repository` interface.
@@ -1975,8 +2710,755 @@ pub mod proxy {
             storage_root: Option<&str>,
             bootable: bool,
         ) -> zlink::Result<impl Stream<Item = zlink::Result<Result<PullProgress, OciError>>>>;
+
+        /// Query capability tokens supported by the service.
+        async fn get_info(&mut self) -> zlink::Result<Result<GetInfoReply, OciError>>;
+
+        /// Check whether a layer is present in the repository.
+        async fn has_layer(
+            &mut self,
+            handle: u64,
+            diff_id: &str,
+        ) -> zlink::Result<Result<HasLayerReply, OciError>>;
+
+        /// Stream the layer as a `splitdirfdstream` with full hardened fd-transport
+        /// contract (sparse dirfds, keepalive, lifetime fds, multi-frame).
+        ///
+        /// Drive the returned stream to completion (until `continues=false`),
+        /// concatenating each frame's fd batch in order to reconstruct the full
+        /// logical FD array `[pipe_read, dirfds.., lifetime_fds..]`.
+        #[zlink(more, return_fds)]
+        async fn get_layer(
+            &mut self,
+            handle: u64,
+            params: GetLayerParams,
+        ) -> zlink::Result<
+            impl zlink::futures_util::Stream<
+                Item = zlink::Result<(Result<GetLayerReply, OciError>, Vec<std::os::fd::OwnedFd>)>,
+            >,
+        >;
+
+        /// Receive a layer as a `splitdirfdstream` from the client and import
+        /// it into the server's repository with diff_id verification.
+        ///
+        /// `fds[0]` is the pipe read end; `fds[1..]` are source object dirs.
+        async fn put_layer(
+            &mut self,
+            handle: u64,
+            diff_id: &str,
+            zerocopy: bool,
+            #[zlink(fds)] fds: Vec<std::os::fd::OwnedFd>,
+        ) -> zlink::Result<Result<PutLayerReply, OciError>>;
+
+        /// Finalize an OCI image after all layers have been imported.
+        ///
+        /// `layers` must be in manifest layer order; each entry pairs the layer's
+        /// OCI diff-id with the hex verity returned by `PutLayer`.  `name` is the
+        /// tag to assign (optional). Idempotent.
+        async fn finalize_image(
+            &mut self,
+            handle: u64,
+            manifest_json: &str,
+            config_json: &str,
+            layers: Vec<LayerRef>,
+            name: Option<&str>,
+        ) -> zlink::Result<Result<FinalizeImageReply, OciError>>;
     }
 }
 
 #[cfg(feature = "oci")]
 pub(crate) use oci::*;
+
+/// Spawn a `CfsctlService` in-process over a Unix socket pair for testing.
+///
+/// Returns a connected client [`zlink::unix::Connection`] and a
+/// [`std::thread::JoinHandle`] for the server thread.
+///
+/// Mirrors the pattern in `composefs-storage`'s `spawn_in_process`: the zlink
+/// server is `!Send` so it runs on a dedicated OS thread with its own
+/// current-thread Tokio runtime and [`tokio::task::LocalSet`].
+///
+/// The server thread exits when the client connection is closed.
+#[cfg(feature = "oci")]
+pub(crate) fn spawn_in_process(
+    service: CfsctlService,
+) -> std::io::Result<(zlink::unix::Connection, std::thread::JoinHandle<()>)> {
+    let (client_std, server_std) = std::os::unix::net::UnixStream::pair()?;
+    client_std.set_nonblocking(true)?;
+    server_std.set_nonblocking(true)?;
+
+    let client_stream = tokio::net::UnixStream::from_std(client_std)?;
+    let client_zlink = zlink::unix::Stream::from(client_stream);
+    let client_conn = zlink::Connection::from(client_zlink);
+
+    let handle = std::thread::Builder::new()
+        .name("cfsctl-service-server".into())
+        .spawn(move || {
+            let rt = match tokio::runtime::Builder::new_current_thread()
+                .enable_all()
+                .build()
+            {
+                Ok(rt) => rt,
+                Err(e) => {
+                    log::error!("CfsctlService server runtime build failed: {e:#?}");
+                    return;
+                }
+            };
+            let local = tokio::task::LocalSet::new();
+            local.block_on(&rt, async move {
+                let server_stream = match tokio::net::UnixStream::from_std(server_std) {
+                    Ok(s) => s,
+                    Err(e) => {
+                        log::error!("CfsctlService server stream conversion failed: {e:#?}");
+                        return;
+                    }
+                };
+                let server_zlink = zlink::unix::Stream::from(server_stream);
+                let listener = zlink::ReadyListener::new(server_zlink);
+                let server = zlink::Server::new(listener, service);
+                if let Err(e) = server.run().await {
+                    log::warn!("CfsctlService in-process server error: {e:#?}");
+                }
+            });
+        })?;
+
+    Ok((client_conn, handle))
+}
+
+#[cfg(all(test, feature = "oci"))]
+mod layer_sync_tests {
+    //! In-process round-trip tests for the layer-sync methods of the
+    //! `org.composefs.Oci` interface.
+    //!
+    //! These mirror the in-process transport test in
+    //! `composefs-storage`'s `cstor_service.rs`.
+
+    use std::io::Read as _;
+    use std::os::fd::AsFd as _;
+    use std::sync::Arc;
+
+    use composefs::fsverity::{FsVerityHashValue as _, Sha256HashValue};
+    use composefs::repository::{Repository, RepositoryConfig};
+    use composefs_splitdirfdstream::reconstruct;
+
+    use super::layer_sync::GetLayerReply;
+    use super::oci::OciError;
+    use super::proxy::{OciProxy, RepositoryProxy as _};
+    use super::{CfsctlService, spawn_in_process};
+    use composefs_oci::varlink_types::GetLayerParams;
+
+    /// Drive a streaming `get_layer` call to completion, collecting all FDs.
+    ///
+    /// Returns `(reply, all_fds)` where `all_fds` is the concatenated FD vector
+    /// from all frames in arrival order:
+    /// ```text
+    /// [ pipe_read | dirfds region (dir_count) | lifetime fds ]
+    /// ```
+    async fn collect_get_layer<C>(
+        client: &mut C,
+        handle: u64,
+        diff_id: &str,
+    ) -> Result<(GetLayerReply, Vec<std::os::fd::OwnedFd>), OciError>
+    where
+        C: OciProxy,
+    {
+        use zlink::futures_util::StreamExt as _;
+
+        let params = GetLayerParams {
+            diff_id: Some(diff_id.to_owned()),
+            storage: None,
+        };
+        let mut stream = std::pin::pin!(
+            client
+                .get_layer(handle, params)
+                .await
+                .expect("get_layer transport error")
+        );
+
+        let mut all_fds: Vec<std::os::fd::OwnedFd> = Vec::new();
+        let mut last_reply: Option<GetLayerReply> = None;
+
+        while let Some(item) = stream.next().await {
+            let (result, fds) = item.expect("get_layer stream error");
+            match result {
+                Ok(reply) => {
+                    last_reply = Some(reply);
+                }
+                Err(e) => return Err(e),
+            }
+            all_fds.extend(fds);
+        }
+
+        Ok((last_reply.expect("get_layer stream was empty"), all_fds))
+    }
+
+    /// Like `collect_get_layer` but splits the fd array into:
+    /// - `pipe_and_dirfds`: `fds[0..=dir_count]` (pipe + dirfds region)
+    /// - `lifetime_fds`: `fds[dir_count+1..]` (keepalive + extras)
+    ///
+    /// Returns `(dir_count, pipe_and_dirfds, lifetime_fds)`.
+    async fn collect_get_layer_split<C>(
+        client: &mut C,
+        handle: u64,
+        diff_id: &str,
+    ) -> (
+        GetLayerReply,
+        Vec<std::os::fd::OwnedFd>,
+        Vec<std::os::fd::OwnedFd>,
+    )
+    where
+        C: OciProxy,
+    {
+        let (reply, mut all_fds) = collect_get_layer(client, handle, diff_id)
+            .await
+            .expect("get_layer failed");
+        let dir_count = reply.dir_count as usize;
+        // pipe_and_dirfds = fds[0..=dir_count] (1 + dir_count)
+        let pipe_and_dirfds_len = 1 + dir_count;
+        assert!(
+            all_fds.len() >= pipe_and_dirfds_len,
+            "expected at least {pipe_and_dirfds_len} fds, got {}",
+            all_fds.len()
+        );
+        let lifetime_fds = all_fds.split_off(pipe_and_dirfds_len);
+        (reply, all_fds, lifetime_fds)
+    }
+
+    /// Build a trivial tar stream with one file at `size` bytes and return the
+    /// raw bytes.  Content is deterministic (repeating `i % 251`).
+    fn build_tar_layer(file_size: usize) -> Vec<u8> {
+        let content: Vec<u8> = (0..file_size).map(|i| (i % 251) as u8).collect();
+        let mut builder = ::tar::Builder::new(vec![]);
+        let mut header = ::tar::Header::new_ustar();
+        header.set_uid(0);
+        header.set_gid(0);
+        header.set_mode(0o644);
+        header.set_entry_type(::tar::EntryType::Regular);
+        header.set_size(file_size as u64);
+        builder
+            .append_data(&mut header, format!("file_{file_size}"), &content[..])
+            .unwrap();
+        builder.into_inner().unwrap()
+    }
+
+    /// Create an insecure test repo.
+    fn create_test_repo() -> (Arc<Repository<Sha256HashValue>>, tempfile::TempDir) {
+        let tempdir = tempfile::TempDir::new().unwrap();
+        let (repo, _) = Repository::init_path(
+            rustix::fs::CWD,
+            tempdir.path().join("repo"),
+            RepositoryConfig::default().set_insecure(),
+        )
+        .unwrap();
+        (Arc::new(repo), tempdir)
+    }
+
+    #[tokio::test(flavor = "multi_thread")]
+    async fn test_layer_sync_in_process() {
+        // --- set up a repo and import a synthetic layer ---
+        let (repo, _tempdir) = create_test_repo();
+
+        // Build a tar layer that has one large (>64-byte = external) file.
+        let tar_bytes = build_tar_layer(128 * 1024); // 128 KiB — external object
+        let diff_id = composefs_oci::sha256_content_digest(&tar_bytes);
+        let (verity, _stats) =
+            composefs_oci::import_layer(&repo, &diff_id, None, tar_bytes.as_slice())
+                .await
+                .expect("import_layer");
+
+        // Record expected cat() output for comparison later.
+        let mut expected = Vec::<u8>::new();
+        {
+            let mut reader = repo
+                .open_stream("", Some(&verity), Some(composefs_oci::LAYER_CONTENT_TYPE))
+                .expect("open_stream for cat");
+            reader.cat(&repo, &mut expected).expect("cat");
+        }
+
+        let repo_path = _tempdir.path().join("repo").to_str().unwrap().to_string();
+
+        // --- build and start the in-process service ---
+        let service = CfsctlService::insecure_for_test();
+        let (mut client, _server_handle) = spawn_in_process(service).unwrap();
+
+        // OpenRepository to get a handle + metadata.
+        let open_reply = client
+            .open_repository(Some(&repo_path), None, None)
+            .await
+            .unwrap()
+            .expect("open_repository");
+        let handle = open_reply.handle;
+
+        // Validate the new metadata fields.
+        assert_eq!(
+            open_reply.hash_algorithm.as_deref(),
+            Some("sha256"),
+            "hash_algorithm must be sha256 for a Sha256HashValue repo"
+        );
+        assert!(
+            open_reply.objects_device_id.is_some(),
+            "objects_device_id must be reported"
+        );
+
+        // --- GetInfo ---
+        let info = client.get_info().await.unwrap().expect("get_info");
+        assert!(
+            info.features.contains(&"splitdirfdstream-v0".to_string()),
+            "expected splitdirfdstream-v0 in features"
+        );
+
+        // --- HasLayer: present ---
+        let has = client
+            .has_layer(handle, diff_id.as_ref())
+            .await
+            .unwrap()
+            .expect("has_layer");
+        assert!(has.present, "layer must be present");
+        assert_eq!(
+            has.layer_verity.as_deref(),
+            Some(verity.to_hex().as_str()),
+            "verity mismatch"
+        );
+
+        // --- HasLayer: absent ---
+        let fake_digest = "sha256:0000000000000000000000000000000000000000000000000000000000000000";
+        let has_absent = client
+            .has_layer(handle, fake_digest)
+            .await
+            .unwrap()
+            .expect("has_layer absent");
+        assert!(!has_absent.present, "absent layer must not be present");
+        assert!(has_absent.layer_verity.is_none());
+
+        // --- GetLayer: e2e round-trip ---
+        // The new streaming form: collect all frames' fds, then split into
+        // pipe+dirfds (wire positions 0..=dir_count) and lifetime fds (rest).
+        let (get_reply, pipe_and_dirfds, lifetime_fds) =
+            collect_get_layer_split(&mut client, handle, diff_id.as_ref()).await;
+        let dir_count = get_reply.dir_count as usize;
+
+        // Keep lifetime fds alive until we are done reading the stream.
+        let _lifetime_fds = lifetime_fds;
+
+        // fds[0] = pipe read; fds[1..=dir_count] = dirfds region (sparse).
+        let pipe_fd = pipe_and_dirfds[0].as_fd();
+        let dir_fds: Vec<_> = pipe_and_dirfds[1..=dir_count]
+            .iter()
+            .map(|f| f.as_fd())
+            .collect();
+
+        // Read the splitdirfdstream from the pipe to EOF.
+        let pipe_owned = rustix::io::dup(pipe_fd).expect("dup pipe read");
+        let mut pipe_file = std::fs::File::from(pipe_owned);
+        let mut stream_bytes = Vec::new();
+        pipe_file.read_to_end(&mut stream_bytes).unwrap();
+        assert!(!stream_bytes.is_empty(), "stream must be non-empty");
+
+        // Reconstruct via the sparse dirfds region.
+        let mut actual = Vec::new();
+        reconstruct(stream_bytes.as_slice(), &dir_fds, &mut actual)
+            .expect("reconstruct splitdirfdstream");
+
+        similar_asserts::assert_eq!(
+            actual,
+            expected,
+            "reconstructed layer must equal cat() output"
+        );
+
+        // --- GetLayer: unknown diff-id ---
+        let err = collect_get_layer(&mut client, handle, fake_digest).await;
+        match err {
+            Err(super::oci::OciError::NoSuchLayer { .. }) => {}
+            other => panic!("expected NoSuchLayer, got {other:?}"),
+        }
+    }
+
+    /// Full GetLayer→PutLayer relay: serve repo A via one in-process server,
+    /// call `get_layer` to obtain the stream fds, then relay them to a second
+    /// in-process server hosting repo B via `put_layer`.
+    ///
+    /// Asserts:
+    /// - `put_layer` succeeds with `already_present = false`.
+    /// - repo B has the layer committed and its `cat` output matches repo A.
+    /// - A second `put_layer` with the same data returns `already_present = true`.
+    #[tokio::test(flavor = "multi_thread")]
+    async fn test_put_layer_relay() {
+        // --- set up repo A with a layer containing an external object ---
+        let (repo_a, _td_a) = create_test_repo();
+        let tar_bytes = build_tar_layer(128 * 1024); // 128 KiB — external object
+        let diff_id = composefs_oci::sha256_content_digest(&tar_bytes);
+        let (verity_a, _) =
+            composefs_oci::import_layer(&repo_a, &diff_id, None, tar_bytes.as_slice())
+                .await
+                .expect("import_layer into repo_a");
+
+        // expected cat() output for later comparison
+        let mut expected = Vec::<u8>::new();
+        {
+            let mut reader = repo_a
+                .open_stream("", Some(&verity_a), Some(composefs_oci::LAYER_CONTENT_TYPE))
+                .expect("open_stream for cat");
+            reader.cat(&repo_a, &mut expected).expect("cat");
+        }
+        let repo_a_path = _td_a.path().join("repo").to_str().unwrap().to_string();
+
+        // --- set up repo B (empty) ---
+        let (repo_b, _td_b) = create_test_repo();
+        let repo_b_path = _td_b.path().join("repo").to_str().unwrap().to_string();
+
+        // --- start two in-process services ---
+        let service_a = CfsctlService::insecure_for_test();
+        let (mut client_a, _srv_a) = spawn_in_process(service_a).unwrap();
+
+        let service_b = CfsctlService::insecure_for_test();
+        let (mut client_b, _srv_b) = spawn_in_process(service_b).unwrap();
+
+        // Open repos via each service.
+        let handle_a = client_a
+            .open_repository(Some(&repo_a_path), None, None)
+            .await
+            .unwrap()
+            .expect("open_repository A")
+            .handle;
+        let handle_b = client_b
+            .open_repository(Some(&repo_b_path), None, None)
+            .await
+            .unwrap()
+            .expect("open_repository B")
+            .handle;
+
+        // --- GetLayer from service A ---
+        // Collect all frames; split into pipe+dirfds and lifetime fds.
+        let (get_reply, pipe_and_dirfds, lifetime_fds) =
+            collect_get_layer_split(&mut client_a, handle_a, diff_id.as_ref()).await;
+        let dir_count = get_reply.dir_count as usize;
+
+        // --- PutLayer into service B (first time) ---
+        // PutLayer receives fds[0..=dir_count] (pipe + dirfds region).
+        // We hold lifetime_fds open until put_layer returns.
+        let put_fds = pipe_and_dirfds; // fds[0] = pipe, fds[1..=dir_count] = dirs
+        let put_reply = client_b
+            .put_layer(handle_b, diff_id.as_ref(), false, put_fds)
+            .await
+            .unwrap()
+            .expect("put_layer");
+        // Drop lifetime fds after put_layer completes.
+        drop(lifetime_fds);
+
+        assert!(
+            !put_reply.already_present,
+            "first put_layer must report already_present = false"
+        );
+        assert!(
+            dir_count > 0,
+            "dir_count must be > 0 (dirfds region has at least one slot)"
+        );
+
+        // The layer has one large external object; at least one object must
+        // have been stored via copy (same-host in-process, but tmpfs may not
+        // support reflink). Verify the stats are populated.
+        let total_stored =
+            put_reply.objects_reflinked + put_reply.objects_hardlinked + put_reply.objects_copied;
+        assert!(
+            total_stored + put_reply.objects_already_present > 0,
+            "put_layer must report at least one object stored, got {put_reply:?}"
+        );
+
+        // Verify repo B now has the layer.
+        let content_id = composefs_oci::layer_content_id(&diff_id);
+        assert!(
+            repo_b
+                .has_stream(&content_id)
+                .expect("has_stream B")
+                .is_some(),
+            "repo B must have the layer after put_layer"
+        );
+
+        // Verify the cat() output matches.
+        let verity_b: Sha256HashValue =
+            Sha256HashValue::from_hex(&put_reply.layer_verity).expect("parse layer_verity hex");
+        let mut actual = Vec::<u8>::new();
+        {
+            let mut reader = repo_b
+                .open_stream("", Some(&verity_b), Some(composefs_oci::LAYER_CONTENT_TYPE))
+                .expect("open_stream B for cat");
+            reader.cat(&repo_b, &mut actual).expect("cat B");
+        }
+        similar_asserts::assert_eq!(actual, expected, "repo B cat must equal repo A cat");
+
+        // --- PutLayer a second time (idempotent): already_present = true ---
+        let (get_reply2, pipe_and_dirfds2, lifetime_fds2) =
+            collect_get_layer_split(&mut client_a, handle_a, diff_id.as_ref()).await;
+        let _ = get_reply2;
+
+        let put_reply2 = client_b
+            .put_layer(handle_b, diff_id.as_ref(), false, pipe_and_dirfds2)
+            .await
+            .unwrap()
+            .expect("put_layer 2nd");
+        drop(lifetime_fds2);
+
+        assert!(
+            put_reply2.already_present,
+            "second put_layer must report already_present = true"
+        );
+    }
+
+    /// Negative: `put_layer` with a wrong diff_id must return `DiffIdMismatch`
+    /// and repo B must NOT have the stream committed.
+    #[tokio::test(flavor = "multi_thread")]
+    async fn test_put_layer_wrong_diff_id() {
+        let (repo_a, _td_a) = create_test_repo();
+        let tar_bytes = build_tar_layer(128 * 1024);
+        let correct_diff_id = composefs_oci::sha256_content_digest(&tar_bytes);
+        let (_verity_a, _) =
+            composefs_oci::import_layer(&repo_a, &correct_diff_id, None, tar_bytes.as_slice())
+                .await
+                .expect("import_layer");
+        let repo_a_path = _td_a.path().join("repo").to_str().unwrap().to_string();
+
+        let (_repo_b, _td_b) = create_test_repo();
+        let repo_b_path = _td_b.path().join("repo").to_str().unwrap().to_string();
+
+        let service_a = CfsctlService::insecure_for_test();
+        let (mut client_a, _srv_a) = spawn_in_process(service_a).unwrap();
+        let service_b = CfsctlService::insecure_for_test();
+        let (mut client_b, _srv_b) = spawn_in_process(service_b).unwrap();
+
+        let handle_a = client_a
+            .open_repository(Some(&repo_a_path), None, None)
+            .await
+            .unwrap()
+            .expect("open_repository A")
+            .handle;
+        let handle_b = client_b
+            .open_repository(Some(&repo_b_path), None, None)
+            .await
+            .unwrap()
+            .expect("open_repository B")
+            .handle;
+
+        // Get layer fds from A (collect streaming frames, split off lifetime fds).
+        let (_get_reply, pipe_and_dirfds, lifetime_fds) =
+            collect_get_layer_split(&mut client_a, handle_a, correct_diff_id.as_ref()).await;
+
+        // Deliberately supply the wrong diff_id to service B.
+        let wrong_diff_id =
+            "sha256:0000000000000000000000000000000000000000000000000000000000000000";
+        let put_err = client_b
+            .put_layer(handle_b, wrong_diff_id, false, pipe_and_dirfds)
+            .await
+            .unwrap();
+        drop(lifetime_fds);
+
+        match put_err {
+            Err(super::oci::OciError::DiffIdMismatch { expected, actual }) => {
+                assert_eq!(expected, wrong_diff_id);
+                assert_eq!(actual, correct_diff_id.to_string());
+            }
+            other => panic!("expected DiffIdMismatch, got {other:?}"),
+        }
+
+        // The wrong stream must NOT be committed in repo B.
+        let wrong_content_id = composefs_oci::layer_content_id(
+            &wrong_diff_id.parse::<composefs_oci::OciDigest>().unwrap(),
+        );
+        assert!(
+            _repo_b
+                .has_stream(&wrong_content_id)
+                .expect("has_stream B")
+                .is_none(),
+            "repo B must NOT have a stream for the wrong diff_id"
+        );
+    }
+
+    // -------------------------------------------------------------------------
+    // Helpers shared by the finalize_image test
+    // -------------------------------------------------------------------------
+
+    /// Build a minimal tar layer with a valid OCI directory structure.
+    ///
+    /// Creates `./`, `./usr/`, `./usr/share/`, and one data file of `payload_size`
+    /// bytes at `./usr/share/data_<payload_size>`.
+    fn build_oci_tar_layer(payload_size: usize) -> Vec<u8> {
+        let mut builder = ::tar::Builder::new(vec![]);
+
+        for (path, is_dir) in &[("./", true), ("./usr/", true), ("./usr/share/", true)] {
+            let mut hdr = ::tar::Header::new_ustar();
+            hdr.set_entry_type(::tar::EntryType::Directory);
+            hdr.set_uid(0);
+            hdr.set_gid(0);
+            hdr.set_mode(0o755);
+            hdr.set_size(0);
+            let _ = is_dir; // suppress unused warning
+            builder
+                .append_data(&mut hdr, path, std::io::empty())
+                .unwrap();
+        }
+
+        let content: Vec<u8> = (0..payload_size).map(|i| (i % 251) as u8).collect();
+        let mut file_hdr = ::tar::Header::new_ustar();
+        file_hdr.set_entry_type(::tar::EntryType::Regular);
+        file_hdr.set_uid(0);
+        file_hdr.set_gid(0);
+        file_hdr.set_mode(0o644);
+        file_hdr.set_size(payload_size as u64);
+        builder
+            .append_data(
+                &mut file_hdr,
+                format!("./usr/share/data_{payload_size}"),
+                content.as_slice(),
+            )
+            .unwrap();
+
+        builder.into_inner().unwrap()
+    }
+
+    /// Build a minimal OCI config JSON with the given diff-id strings.
+    ///
+    /// Produces a JSON that `oci_spec::image::ImageConfiguration` would accept,
+    /// without pulling in the `oci_spec` builders (not available in composefs-ctl).
+    fn make_config_json(diff_ids: &[String]) -> String {
+        let ids: Vec<String> = diff_ids.iter().map(|d| format!("\"{d}\"")).collect();
+        format!(
+            r#"{{"architecture":"amd64","os":"linux","rootfs":{{"type":"layers","diff_ids":[{}]}},"config":{{}}}}"#,
+            ids.join(",")
+        )
+    }
+
+    /// Build a minimal OCI manifest JSON referencing `config_digest_str`.
+    fn make_manifest_json(
+        config_json: &str,
+        config_digest_str: &str,
+        diff_ids: &[String],
+    ) -> String {
+        let layer_entries: Vec<String> = diff_ids
+            .iter()
+            .map(|d| {
+                format!(
+                    r#"{{"mediaType":"application/vnd.oci.image.layer.v1.tar+gzip","digest":"{d}","size":1}}"#
+                )
+            })
+            .collect();
+        format!(
+            r#"{{"schemaVersion":2,"mediaType":"application/vnd.oci.image.manifest.v1+json","config":{{"mediaType":"application/vnd.oci.image.config.v1+json","digest":"{config_digest_str}","size":{}}},"layers":[{}]}}"#,
+            config_json.len(),
+            layer_entries.join(",")
+        )
+    }
+
+    /// Round-trip test for the `FinalizeImage` varlink method.
+    ///
+    /// Imports a layer directly into repo B, then calls `finalize_image` via
+    /// the in-process varlink service on B, and asserts:
+    /// - The reply digests are non-empty.
+    /// - The manifest and config splitstreams now exist in repo B.
+    /// - The composefs EROFS image was generated.
+    #[tokio::test(flavor = "multi_thread")]
+    async fn test_finalize_image_roundtrip() {
+        use composefs_oci::OciDigest;
+
+        let (repo_b, _td_b) = create_test_repo();
+        let repo_b_path = _td_b.path().join("repo").to_str().unwrap().to_string();
+
+        // Import a layer directly into repo_b.
+        let tar_bytes = build_oci_tar_layer(128 * 1024);
+        let diff_id = composefs_oci::sha256_content_digest(&tar_bytes);
+        let (layer_verity, _) =
+            composefs_oci::import_layer(&repo_b, &diff_id, None, tar_bytes.as_slice())
+                .await
+                .expect("import_layer into repo_b");
+
+        // Build config + manifest JSON.
+        let diff_ids = vec![diff_id.to_string()];
+        let config_json = make_config_json(&diff_ids);
+        let config_digest = composefs_oci::sha256_content_digest(config_json.as_bytes());
+        let manifest_json = make_manifest_json(&config_json, config_digest.as_ref(), &diff_ids);
+
+        // Start the in-process service on repo B.
+        let service_b = CfsctlService::insecure_for_test();
+        let (mut client_b, _srv_b) = spawn_in_process(service_b).unwrap();
+
+        let handle_b = client_b
+            .open_repository(Some(&repo_b_path), None, None)
+            .await
+            .unwrap()
+            .expect("open_repository B")
+            .handle;
+
+        // Build the LayerRef list.
+        let layers = vec![super::layer_sync::LayerRef {
+            diff_id: diff_id.to_string(),
+            layer_verity: layer_verity.to_hex(),
+        }];
+
+        // Call finalize_image.
+        let reply = client_b
+            .finalize_image(
+                handle_b,
+                &manifest_json,
+                &config_json,
+                layers,
+                Some("finalize-test:v1"),
+            )
+            .await
+            .unwrap()
+            .expect("finalize_image");
+
+        // Digests must be non-empty strings.
+        assert!(
+            !reply.manifest_digest.is_empty(),
+            "manifest_digest must be non-empty"
+        );
+        assert!(
+            !reply.manifest_verity.is_empty(),
+            "manifest_verity must be non-empty"
+        );
+        assert!(
+            !reply.config_digest.is_empty(),
+            "config_digest must be non-empty"
+        );
+        assert!(
+            !reply.config_verity.is_empty(),
+            "config_verity must be non-empty"
+        );
+
+        // Manifest and config splitstreams must now exist in repo_b.
+        let manifest_digest: OciDigest = reply.manifest_digest.parse().unwrap();
+        let config_digest2: OciDigest = reply.config_digest.parse().unwrap();
+
+        let manifest_id = composefs_oci::oci_image::manifest_identifier(&manifest_digest);
+        assert!(
+            repo_b
+                .has_stream(&manifest_id)
+                .expect("has_stream manifest")
+                .is_some(),
+            "manifest splitstream must exist in repo_b"
+        );
+
+        // The config stream key follows the pattern "oci-config-<digest>".
+        let config_id2 = format!("oci-config-{config_digest2}");
+        assert!(
+            repo_b
+                .has_stream(&config_id2)
+                .expect("has_stream config")
+                .is_some(),
+            "config splitstream must exist in repo_b"
+        );
+
+        // EROFS must have been generated.
+        let manifest_verity =
+            Sha256HashValue::from_hex(&reply.manifest_verity).expect("parse manifest_verity");
+        let erofs = composefs_oci::composefs_erofs_for_manifest(
+            &repo_b,
+            &manifest_digest,
+            Some(&manifest_verity),
+            repo_b.erofs_version(),
+        )
+        .expect("composefs_erofs_for_manifest");
+        assert!(
+            erofs.is_some(),
+            "EROFS image must exist after finalize_image"
+        );
+    }
+}
diff --git a/crates/composefs-integration-tests/src/main.rs b/crates/composefs-integration-tests/src/main.rs
index dc6dbef1..be1dc2a2 100644
--- a/crates/composefs-integration-tests/src/main.rs
+++ b/crates/composefs-integration-tests/src/main.rs
@@ -14,7 +14,7 @@
 use std::fs;
 use std::path::{Path, PathBuf};
 
-use anyhow::{Result, bail};
+use anyhow::{Context as _, Result, bail};
 use libtest_mimic::{Arguments, Trial};
 
 pub(crate) use composefs_integration_tests::{INTEGRATION_TESTS, integration_test};
@@ -57,6 +57,45 @@ pub(crate) fn cfsctl() -> Result<PathBuf> {
     )
 }
 
+/// Bind a listening Unix socket at a fresh tempdir path and spawn `cfsctl`
+/// against it via the systemd socket-activation protocol (`LISTEN_FDS=1`, the
+/// listening socket on fd 3, `LISTEN_PID` set in the child). The socket is
+/// already listening before the child starts, so callers may connect
+/// immediately — no polling needed. The path exists on disk, so `varlinkctl`
+/// (which connects by path) works unchanged.
+///
+/// Returns the spawned child, the tempdir (keep alive for the socket's
+/// lifetime), and the socket path.
+pub(crate) fn spawn_activated_cfsctl() -> Result<(std::process::Child, tempfile::TempDir, PathBuf)>
+{
+    use std::os::fd::OwnedFd;
+    use std::os::unix::net::UnixListener;
+    use std::sync::Arc;
+
+    use cap_std_ext::cmdext::{CapStdExtCommandExt, CmdFds, SystemdFdName};
+
+    let cfsctl = cfsctl()?;
+    let socket_dir = tempfile::tempdir()?;
+    let socket = socket_dir.path().join("varlink.sock");
+
+    let listener = UnixListener::bind(&socket)
+        .with_context(|| format!("binding varlink listener at {}", socket.display()))?;
+    let listener_fd: Arc<OwnedFd> = Arc::new(listener.into());
+
+    let mut cmd = std::process::Command::new(&cfsctl);
+    // Bare invocation — no subcommand, no --address.  cfsctl serves on the
+    // inherited listening fd via run_if_socket_activated().
+    //
+    // IMPORTANT: do NOT call cmd.env()/.envs() here — take_fds() sets the
+    // LISTEN_* vars via setenv in pre_exec, and Command::env would clobber
+    // them by building a separate envp array.
+    let fds = CmdFds::new_systemd_fds([(listener_fd, SystemdFdName::new("varlink"))]);
+    cmd.take_fds(fds);
+    let child = cmd.spawn().context("spawning socket-activated cfsctl")?;
+
+    Ok((child, socket_dir, socket))
+}
+
 /// Create a test rootfs fixture inside `parent` and return its path.
 ///
 /// Includes a file large enough (128 KiB) to avoid erofs inlining so that
diff --git a/crates/composefs-integration-tests/src/tests/copy_image.rs b/crates/composefs-integration-tests/src/tests/copy_image.rs
new file mode 100644
index 00000000..b65c7960
--- /dev/null
+++ b/crates/composefs-integration-tests/src/tests/copy_image.rs
@@ -0,0 +1,687 @@
+//! Integration tests for the `oci copy` CLI command and `copy_image` API.
+
+use std::path::Path;
+use std::sync::Arc;
+
+use anyhow::{Context, Result, bail, ensure};
+use xshell::{Shell, cmd};
+
+use composefs_oci::OciDigest;
+use composefs_oci::composefs::fsverity::{Sha256HashValue, Sha512HashValue};
+use composefs_oci::composefs::repository::{Repository, RepositoryConfig};
+use composefs_oci::layer_sync::finalize_oci_image;
+use composefs_oci::test_util::{build_oci_tar_layer, make_config_json, make_manifest_json};
+use composefs_oci::{import_layer, layer_content_id, sha256_content_digest};
+
+use composefs_ctl::varlink::RepositoryError;
+use composefs_ctl::varlink::proxy::RepositoryProxy;
+
+use crate::{cfsctl, integration_test};
+
+// ── Shared helpers ─────────────────────────────────────────────────────────
+
+/// Initialise an insecure SHA-256 repository at `path`.
+fn init_sha256_repo(path: &std::path::Path) -> Result<Arc<Repository<Sha256HashValue>>> {
+    std::fs::create_dir_all(path)?;
+    let fd = rustix::fs::open(
+        path,
+        rustix::fs::OFlags::CLOEXEC | rustix::fs::OFlags::RDONLY,
+        0.into(),
+    )?;
+    let (repo, _) = Repository::<Sha256HashValue>::init_path(
+        &fd,
+        ".",
+        RepositoryConfig::default().set_insecure(),
+    )?;
+    Ok(Arc::new(repo))
+}
+
+/// Build a two-layer synthetic image in `repo` and tag it `name`.
+///
+/// Layer 1: 10 bytes — small enough to be fully inlined.
+/// Layer 2: 256 KiB — large enough to produce an external object that
+///          exercises the reflink / hardlink path.
+///
+/// Returns `(manifest_json_bytes, config_json_bytes, diff_id_layer2_string)`.
+fn build_and_import_test_image(
+    repo: &Arc<Repository<Sha256HashValue>>,
+    name: &str,
+) -> Result<(Vec<u8>, Vec<u8>, String)> {
+    let rt = tokio::runtime::Runtime::new()?;
+
+    let tar1 = build_oci_tar_layer(10);
+    let tar2 = build_oci_tar_layer(256 * 1024);
+
+    let diff_id1 = sha256_content_digest(&tar1);
+    let diff_id2 = sha256_content_digest(&tar2);
+    let diff_id2_str = diff_id2.to_string();
+
+    let (verity1, verity2) = rt.block_on(async {
+        let (v1, _) = import_layer(repo, &diff_id1, None, tar1.as_slice())
+            .await
+            .context("import layer 1")?;
+        let (v2, _) = import_layer(repo, &diff_id2, None, tar2.as_slice())
+            .await
+            .context("import layer 2")?;
+        Ok::<_, anyhow::Error>((v1, v2))
+    })?;
+
+    let diff_ids = vec![diff_id1.to_string(), diff_id2_str.clone()];
+    let config_json = make_config_json(&diff_ids);
+    let config_digest = sha256_content_digest(&config_json);
+    let manifest_json = make_manifest_json(&config_json, config_digest.as_ref(), &diff_ids);
+
+    let layer_refs = vec![(diff_id1, verity1), (diff_id2, verity2)];
+    finalize_oci_image(repo, &manifest_json, &config_json, &layer_refs, Some(name))
+        .context("finalize_oci_image")?;
+
+    Ok((manifest_json, config_json, diff_id2_str))
+}
+
+// ── Test A: correctness (unprivileged) ─────────────────────────────────────
+
+fn test_copy_image_correctness() -> Result<()> {
+    let sh = Shell::new()?;
+    let cfsctl = cfsctl()?;
+
+    // Both repos under the same parent tempdir (same filesystem).
+    let parent = tempfile::tempdir()?;
+    let repo_a_path = parent.path().join("repo-a");
+    let repo_b_path = parent.path().join("repo-b");
+
+    // Initialise repo A and synthesise a two-layer image.
+    let repo_a = init_sha256_repo(&repo_a_path)?;
+    let (manifest_json_a, config_json_a, diff_id2_str) =
+        build_and_import_test_image(&repo_a, "copyme:v1")?;
+
+    // Compute digests for repo A for later comparison.
+    let manifest_digest_a = sha256_content_digest(&manifest_json_a);
+    let config_digest_a = sha256_content_digest(&config_json_a);
+
+    // Initialise repo B (empty).
+    let _repo_b_init = init_sha256_repo(&repo_b_path)?;
+    drop(_repo_b_init); // close — cfsctl will open it
+
+    // Run `cfsctl oci copy` as a subprocess. This is a sanity check of the
+    // CLI: it must succeed and land the image in the destination. Structured
+    // transfer stats are intentionally not exposed by the CLI (use varlink for
+    // that); we verify the outcome by inspecting the destination repo below.
+    let repo_a_str = repo_a_path.to_str().unwrap();
+    let repo_b_str = repo_b_path.to_str().unwrap();
+    let tag = "copyme:v1";
+    cmd!(
+        sh,
+        "{cfsctl} --repo {repo_b_str} --insecure oci copy {tag} --from {repo_a_str} --name {tag}"
+    )
+    .run()
+    .context("cfsctl oci copy failed")?;
+
+    // ── Assert: manifest and config digests are identical in repo B ───────
+    // Open repo B in-process and compare.
+    let repo_b = Repository::<Sha256HashValue>::open_path(rustix::fs::CWD, &repo_b_path)?;
+    let img_b = composefs_oci::oci_image::OciImage::open_ref(&repo_b, "copyme:v1")
+        .context("opening copyme:v1 in repo B")?;
+
+    ensure!(
+        img_b.manifest_digest() == &manifest_digest_a,
+        "manifest digest mismatch: src={manifest_digest_a}, dest={}",
+        img_b.manifest_digest()
+    );
+    ensure!(
+        img_b.config_digest() == &config_digest_a,
+        "config digest mismatch: src={config_digest_a}, dest={}",
+        img_b.config_digest()
+    );
+
+    // ── Assert: the 256 KiB external object exists in B and is byte-identical ──
+    // Find the object path using the diff_id of layer 2.
+    let diff_id2: composefs_oci::OciDigest = diff_id2_str
+        .parse()
+        .context("parsing diff_id2 as OciDigest")?;
+    let content_id = composefs_oci::layer_content_id(&diff_id2);
+
+    let verity_a = repo_a
+        .has_stream(&content_id)?
+        .context("repo_a missing layer 2 stream")?;
+    let verity_b = repo_b
+        .has_stream(&content_id)?
+        .context("repo_b missing layer 2 stream after copy")?;
+
+    // Both repos should have resolved to the same verity hash (same content).
+    ensure!(
+        verity_a == verity_b,
+        "verity hashes differ: src={verity_a:?}, dest={verity_b:?}"
+    );
+
+    // Read the external object from both repos and compare bytes.
+    let obj_bytes_a = repo_a
+        .read_object(&verity_a)
+        .context("read_object from repo_a")?;
+    let obj_bytes_b = repo_b
+        .read_object(&verity_b)
+        .context("read_object from repo_b")?;
+    ensure!(
+        obj_bytes_a == obj_bytes_b,
+        "external object content differs between repos"
+    );
+
+    Ok(())
+}
+integration_test!(test_copy_image_correctness);
+
+// ── Test B: deterministic reflink on XFS (privileged) ──────────────────────
+
+fn privileged_copy_image_reflink() -> Result<()> {
+    use crate::tests::privileged::{LoopTempDir, require_privileged};
+
+    if require_privileged("privileged_copy_image_reflink")?.is_some() {
+        return Ok(());
+    }
+
+    // Skip if mkfs.xfs is not available.
+    if !std::path::Path::new("/usr/sbin/mkfs.xfs").exists()
+        && !std::path::Path::new("/sbin/mkfs.xfs").exists()
+    {
+        println!("SKIP: mkfs.xfs not available");
+        return Ok(());
+    }
+
+    let sh = Shell::new()?;
+    let cfsctl = cfsctl()?;
+
+    // Both repos on the same XFS reflink-capable loop mount.
+    let fs = LoopTempDir::xfs_reflink()?;
+    let repo_a_path = fs.path().join("repo-a");
+    let repo_b_path = fs.path().join("repo-b");
+
+    let repo_a = init_sha256_repo(&repo_a_path)?;
+    build_and_import_test_image(&repo_a, "copyme:v1")?;
+
+    let _repo_b = init_sha256_repo(&repo_b_path)?;
+
+    // Run `cfsctl oci copy --zerocopy` via the CLI.
+    let repo_a_str = repo_a_path.to_str().unwrap();
+    let repo_b_str = repo_b_path.to_str().unwrap();
+    let tag = "copyme:v1";
+    cmd!(
+        sh,
+        "{cfsctl} --repo {repo_b_str} --insecure oci copy {tag} --from {repo_a_str} --name {tag} --zerocopy"
+    )
+    .run()
+    .context("cfsctl oci copy --zerocopy failed")?;
+
+    // Verify correctness: manifest + config digests match.
+    let repo_b = Repository::<Sha256HashValue>::open_path(rustix::fs::CWD, &repo_b_path)?;
+    let img_a = composefs_oci::oci_image::OciImage::open_ref(&repo_a, "copyme:v1")
+        .context("open copyme:v1 in repo_a")?;
+    let img_b = composefs_oci::oci_image::OciImage::open_ref(&repo_b, "copyme:v1")
+        .context("open copyme:v1 in repo_b")?;
+
+    ensure!(
+        img_a.manifest_digest() == img_b.manifest_digest(),
+        "manifest digest mismatch after reflink copy"
+    );
+    ensure!(
+        img_a.config_digest() == img_b.config_digest(),
+        "config digest mismatch after reflink copy"
+    );
+
+    // On XFS with reflinks, the objects should share extents.  Verify by
+    // checking that at least one object in repo B has nlink > 1 or shares
+    // its extents with the corresponding object in repo A (same inode on
+    // XFS reflink means FICLONE succeeded).
+    // A lighter check: the copy succeeded with --zerocopy (which would fail
+    // if reflink/hardlink was not possible), and the content matches.
+
+    Ok(())
+}
+integration_test!(privileged_copy_image_reflink);
+
+// ── Test C: varlink cross-process fd-passing copy ───────────────────────────
+
+/// A running `cfsctl varlink` subprocess bound to a Unix socket.
+///
+/// Kills the child process on drop so a test panic does not leak it.
+struct VarlinkProc {
+    child: std::process::Child,
+    socket: std::path::PathBuf,
+    /// Keep the tempdir holding the socket alive for the process's lifetime.
+    _socket_dir: tempfile::TempDir,
+}
+
+impl Drop for VarlinkProc {
+    fn drop(&mut self) {
+        let _ = self.child.kill();
+        let _ = self.child.wait();
+    }
+}
+
+impl VarlinkProc {
+    /// Spawn `cfsctl` via systemd socket-activation with a pre-bound listening
+    /// socket. The socket is already listening before the child starts, so
+    /// callers may connect immediately — no polling needed.
+    fn spawn() -> Result<Self> {
+        let (child, socket_dir, socket) = crate::spawn_activated_cfsctl()?;
+        Ok(VarlinkProc {
+            child,
+            socket,
+            _socket_dir: socket_dir,
+        })
+    }
+
+    fn socket(&self) -> &Path {
+        &self.socket
+    }
+}
+
+/// Open a repository on a varlink server and return its handle, via the typed
+/// zlink proxy. Connects a fresh client for this single call.
+/// Open `repo_path` over an existing connection and return the handle.
+///
+/// The handle is reused for all subsequent calls on the same connection.
+async fn open_repo(conn: &mut zlink::unix::Connection, repo_path: &Path) -> Result<u64> {
+    let path_str = repo_path.to_str().context("repo path is not valid UTF-8")?;
+    let reply = conn
+        .open_repository(Some(path_str), None, None)
+        .await
+        .context("zlink transport error calling OpenRepository")?;
+    match reply {
+        Ok(r) => Ok(r.handle),
+        Err(RepositoryError::InvalidSpec { message }) => {
+            bail!("OpenRepository: InvalidSpec: {message}")
+        }
+        Err(e) => bail!("OpenRepository failed: {e:?}"),
+    }
+}
+
+/// Copy an image between two varlink subprocess servers using `copy_image`.
+///
+/// This exercises the cross-process SCM_RIGHTS fd passing path that
+/// in-process tests cannot cover.
+fn test_copy_image_via_varlink() -> Result<()> {
+    let parent = tempfile::tempdir()?;
+    let repo_a_path = parent.path().join("repo-a");
+    let repo_b_path = parent.path().join("repo-b");
+
+    let repo_a = init_sha256_repo(&repo_a_path)?;
+    let (manifest_json_a, config_json_a, _diff_id2_str) =
+        build_and_import_test_image(&repo_a, "copyme:v1")?;
+
+    let manifest_digest_a = sha256_content_digest(&manifest_json_a);
+    let config_digest_a = sha256_content_digest(&config_json_a);
+
+    {
+        let _repo_b_init = init_sha256_repo(&repo_b_path)?;
+    }
+    drop(repo_a);
+
+    let svc_a = VarlinkProc::spawn().context("spawning varlink server A")?;
+    let svc_b = VarlinkProc::spawn().context("spawning varlink server B")?;
+
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .context("building tokio runtime")?;
+
+    let image: composefs_ctl::OciReference = "copyme:v1".parse()?;
+
+    let finalize_reply = rt.block_on(async {
+        let mut conn_a = zlink::unix::connect(svc_a.socket())
+            .await
+            .context("connecting to server A")?;
+        let mut conn_b = zlink::unix::connect(svc_b.socket())
+            .await
+            .context("connecting to server B")?;
+
+        let handle_a = open_repo(&mut conn_a, &repo_a_path).await?;
+        let handle_b = open_repo(&mut conn_b, &repo_b_path).await?;
+
+        let finalize = composefs_ctl::copy_image(
+            &mut conn_a,
+            &mut conn_b,
+            handle_a,
+            handle_b,
+            &image,
+            Some("copyme:v1"),
+            false,
+        )
+        .await?;
+
+        // Fsck the destination over the wire.
+        let fsck = conn_b
+            .fsck(handle_b, None)
+            .await
+            .context("Fsck transport error")?
+            .map_err(|e: RepositoryError| anyhow::anyhow!("Fsck failed: {e:?}"))?;
+        ensure!(fsck.ok, "fsck failed after copy: {fsck:?}");
+
+        Ok::<_, anyhow::Error>(finalize)
+    })?;
+
+    ensure!(
+        finalize_reply.manifest_digest == manifest_digest_a.to_string(),
+        "manifest_digest mismatch"
+    );
+    ensure!(
+        finalize_reply.config_digest == config_digest_a.to_string(),
+        "config_digest mismatch"
+    );
+
+    // Open repo B in-process and verify the image landed correctly.
+    let repo_b = Repository::<Sha256HashValue>::open_path(rustix::fs::CWD, &repo_b_path)?;
+    let img_b = composefs_oci::oci_image::OciImage::open_ref(&repo_b, "copyme:v1")
+        .context("opening copyme:v1 in repo B")?;
+
+    ensure!(
+        img_b.manifest_digest() == &manifest_digest_a,
+        "manifest digest mismatch in-process"
+    );
+    ensure!(
+        img_b.config_digest() == &config_digest_a,
+        "config digest mismatch in-process"
+    );
+
+    Ok(())
+}
+integration_test!(test_copy_image_via_varlink);
+
+// ── Test D: cross-algorithm copy via varlink (sha256 → sha512) ────────────
+
+/// Initialise an insecure SHA-512 repository at `path`.
+fn init_sha512_repo(path: &std::path::Path) -> Result<()> {
+    use composefs_oci::composefs::fsverity::Algorithm;
+    std::fs::create_dir_all(path)?;
+    let fd = rustix::fs::open(
+        path,
+        rustix::fs::OFlags::CLOEXEC | rustix::fs::OFlags::RDONLY,
+        0.into(),
+    )?;
+    let config = RepositoryConfig::new(Algorithm::SHA512).set_insecure();
+    Repository::<Sha512HashValue>::init_path(&fd, ".", config)?;
+    Ok(())
+}
+
+/// Copy an image from a sha256 repository to a sha512 repository over varlink.
+///
+/// This proves that cross-algorithm copy works end-to-end: the
+/// splitdirfdstream carries only algorithm-independent data (inline bytes +
+/// transient source-object locators) and the destination re-computes every
+/// object's fs-verity digest under its own (sha512) algorithm on import.
+///
+/// The test also validates the new `OpenRepositoryReply` metadata fields
+/// (`hash_algorithm`, `objects_device_id`) introduced for explicit zerocopy
+/// negotiation between client and servers.
+fn test_copy_image_cross_algorithm() -> Result<()> {
+    let parent = tempfile::tempdir()?;
+    let repo_a_path = parent.path().join("repo-sha256");
+    let repo_b_path = parent.path().join("repo-sha512");
+
+    let repo_a = init_sha256_repo(&repo_a_path)?;
+    let (manifest_json_a, _config_json_a, _) = build_and_import_test_image(&repo_a, "copyme:v1")?;
+    let manifest_digest_a = sha256_content_digest(&manifest_json_a);
+    drop(repo_a);
+
+    init_sha512_repo(&repo_b_path)?;
+
+    let svc_a = VarlinkProc::spawn().context("spawning server A (sha256)")?;
+    let svc_b = VarlinkProc::spawn().context("spawning server B (sha512)")?;
+
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()?;
+
+    let image: composefs_ctl::OciReference = "copyme:v1".parse()?;
+
+    let finalize_reply = rt.block_on(async {
+        let mut conn_a = zlink::unix::connect(svc_a.socket()).await?;
+        let mut conn_b = zlink::unix::connect(svc_b.socket()).await?;
+
+        let handle_a = open_repo(&mut conn_a, &repo_a_path).await?;
+        let handle_b = open_repo(&mut conn_b, &repo_b_path).await?;
+
+        let finalize = composefs_ctl::copy_image(
+            &mut conn_a,
+            &mut conn_b,
+            handle_a,
+            handle_b,
+            &image,
+            Some("copyme:v1"),
+            false,
+        )
+        .await?;
+
+        // Fsck the sha512 destination.
+        let fsck = conn_b
+            .fsck(handle_b, None)
+            .await
+            .context("Fsck transport error")?
+            .map_err(|e: RepositoryError| anyhow::anyhow!("Fsck failed: {e:?}"))?;
+        ensure!(fsck.ok, "fsck failed on sha512 dest: {fsck:?}");
+
+        Ok::<_, anyhow::Error>(finalize)
+    })?;
+
+    // OCI content digests must match regardless of fs-verity algorithm.
+    ensure!(
+        finalize_reply.manifest_digest == manifest_digest_a.to_string(),
+        "manifest_digest mismatch",
+    );
+
+    // The verity strings should be sha512 (128 hex chars).
+    ensure!(
+        finalize_reply.manifest_verity.len() == 128,
+        "sha512 manifest_verity should be 128 hex chars, got {} chars",
+        finalize_reply.manifest_verity.len(),
+    );
+
+    // Verify the image exists in the sha512 repo.
+    let repo_b = Repository::<Sha512HashValue>::open_path(rustix::fs::CWD, &repo_b_path)?;
+    let img_b = composefs_oci::oci_image::OciImage::open_ref(&repo_b, "copyme:v1")
+        .context("opening copyme:v1 in sha512 repo B")?;
+    ensure!(
+        img_b.manifest_digest() == &manifest_digest_a,
+        "manifest digest mismatch in sha512 repo",
+    );
+
+    Ok(())
+}
+integration_test!(test_copy_image_cross_algorithm);
+
+// ── Test E: CLI cross-algorithm copy (sha256 → sha512) ────────────────────
+
+/// Run `cfsctl oci copy` from a sha256 repo to a sha512 repo via the CLI.
+///
+/// Proves that the CLI dispatch correctly opens the destination with its own
+/// algorithm (2×2 monomorphisation) and that non-zerocopy cross-algorithm
+/// copy succeeds end-to-end.
+fn test_copy_image_cross_algorithm_cli() -> Result<()> {
+    let sh = Shell::new()?;
+    let cfsctl = cfsctl()?;
+
+    let parent = tempfile::tempdir()?;
+    let repo_a_path = parent.path().join("repo-sha256");
+    let repo_b_path = parent.path().join("repo-sha512");
+
+    let repo_a = init_sha256_repo(&repo_a_path)?;
+    let (manifest_json_a, _config_json_a, _) = build_and_import_test_image(&repo_a, "copyme:v1")?;
+    let manifest_digest_a = sha256_content_digest(&manifest_json_a);
+    drop(repo_a);
+
+    init_sha512_repo(&repo_b_path)?;
+
+    let repo_a_str = repo_a_path.to_str().unwrap();
+    let repo_b_str = repo_b_path.to_str().unwrap();
+    let tag = "copyme:v1";
+    cmd!(
+        sh,
+        "{cfsctl} --repo {repo_b_str} --insecure oci copy {tag} --from {repo_a_str} --name {tag}"
+    )
+    .run()
+    .context("cfsctl oci copy (sha256→sha512) should succeed without --zerocopy")?;
+
+    // Verify the image landed in the sha512 repo.
+    let repo_b = Repository::<Sha512HashValue>::open_path(rustix::fs::CWD, &repo_b_path)?;
+    let img_b = composefs_oci::oci_image::OciImage::open_ref(&repo_b, "copyme:v1")
+        .context("opening copyme:v1 in sha512 repo B")?;
+    ensure!(
+        img_b.manifest_digest() == &manifest_digest_a,
+        "manifest_digest mismatch: expected={manifest_digest_a}, got={}",
+        img_b.manifest_digest(),
+    );
+
+    Ok(())
+}
+integration_test!(test_copy_image_cross_algorithm_cli);
+
+// ── Test F: zerocopy + algorithm mismatch → error ─────────────────────────
+
+/// `cfsctl oci copy --zerocopy` from sha256 to sha512 must fail with a clear
+/// error rather than silently degrading.
+fn test_copy_image_zerocopy_algo_mismatch() -> Result<()> {
+    let sh = Shell::new()?;
+    let cfsctl = cfsctl()?;
+
+    let parent = tempfile::tempdir()?;
+    let repo_a_path = parent.path().join("repo-sha256");
+    let repo_b_path = parent.path().join("repo-sha512");
+
+    let repo_a = init_sha256_repo(&repo_a_path)?;
+    let _ = build_and_import_test_image(&repo_a, "copyme:v1")?;
+    drop(repo_a);
+
+    init_sha512_repo(&repo_b_path)?;
+
+    let repo_a_str = repo_a_path.to_str().unwrap();
+    let repo_b_str = repo_b_path.to_str().unwrap();
+    let tag = "copyme:v1";
+    let result = cmd!(
+        sh,
+        "{cfsctl} --repo {repo_b_str} --insecure oci copy {tag} --from {repo_a_str} --name {tag} --zerocopy"
+    )
+    .ignore_status()
+    .read_stderr()
+    .context("running cfsctl oci copy --zerocopy")?;
+
+    ensure!(
+        result.contains("zerocopy") && result.contains("hash algorithm"),
+        "expected error about zerocopy + hash algorithm mismatch, got: {result}"
+    );
+
+    Ok(())
+}
+integration_test!(test_copy_image_zerocopy_algo_mismatch);
+
+// ── Test G: OCI artifact copy (no rootfs.diff_ids) ────────────────────────
+
+/// Build a synthetic OCI artifact (not a container image) in repo A, then
+/// copy it to repo B via `cfsctl oci copy`.  Artifacts have no
+/// `rootfs.diff_ids` in their config, so this exercises the manifest-layer
+/// fallback in `copy_image`.
+fn test_copy_artifact() -> Result<()> {
+    use composefs_oci::composefs::repository::Repository;
+    use containers_image_proxy::oci_spec::image::{
+        DescriptorBuilder, ImageManifestBuilder, MediaType,
+    };
+
+    let sh = Shell::new()?;
+    let cfsctl = cfsctl()?;
+
+    let parent = tempfile::tempdir()?;
+    let repo_a_path = parent.path().join("repo-a");
+    let repo_b_path = parent.path().join("repo-b");
+
+    // ── Build an artifact in repo A ───────────────────────────────────────
+    let repo_a = init_sha256_repo(&repo_a_path)?;
+
+    // Artifact payload: a small SBOM-like blob (not a tar).
+    let blob_data = br#"{"spdxVersion":"SPDX-2.3","name":"test-artifact"}"#;
+    let blob_digest = sha256_content_digest(blob_data);
+
+    // Store the blob as an object + layer splitstream.
+    let blob_object_id = repo_a.ensure_object(blob_data)?;
+    let layer_id = composefs_oci::layer_identifier(&blob_digest);
+    // Use OCI_BLOB_CONTENT_TYPE for non-tar artifact layers.
+    let mut layer_stream = repo_a.create_stream(composefs_oci::skopeo::OCI_BLOB_CONTENT_TYPE)?;
+    layer_stream.add_external_size(blob_data.len() as u64);
+    layer_stream.write_reference(blob_object_id)?;
+    let layer_verity = repo_a.write_stream(layer_stream, &layer_id, None)?;
+
+    // Empty config (the OCI 1.1 artifact config pattern).
+    let config_json = b"{}";
+    let config_digest = sha256_content_digest(config_json);
+
+    // Build the manifest with EmptyJSON config media type (not ImageConfig).
+    let config_desc = DescriptorBuilder::default()
+        .media_type(MediaType::EmptyJSON)
+        .digest(config_digest.as_ref().to_string())
+        .size(config_json.len() as i64)
+        .build()
+        .context("building config descriptor")?;
+
+    let layer_desc = DescriptorBuilder::default()
+        .media_type(MediaType::Other("text/spdx+json".to_string()))
+        .digest(blob_digest.as_ref().to_string())
+        .size(blob_data.len() as i64)
+        .build()
+        .context("building layer descriptor")?;
+
+    let manifest = ImageManifestBuilder::default()
+        .schema_version(2u32)
+        .media_type(MediaType::ImageManifest)
+        .config(config_desc)
+        .layers(vec![layer_desc])
+        .build()
+        .context("building manifest")?;
+
+    let manifest_json = serde_json::to_vec(&manifest).context("serializing manifest")?;
+    let manifest_digest = sha256_content_digest(&manifest_json);
+
+    let layer_refs = vec![(blob_digest.clone(), layer_verity)];
+    finalize_oci_image(
+        &repo_a,
+        &manifest_json,
+        config_json,
+        &layer_refs,
+        Some("artifact:v1"),
+    )
+    .context("finalize artifact")?;
+
+    // Verify it's not a container image.
+    let img = composefs_oci::oci_image::OciImage::open_ref(&repo_a, "artifact:v1")?;
+    ensure!(
+        !img.is_container_image(),
+        "expected artifact, got container image"
+    );
+    drop(repo_a);
+
+    // ── Init repo B and copy via CLI ──────────────────────────────────────
+    {
+        let _repo_b = init_sha256_repo(&repo_b_path)?;
+    }
+
+    let repo_a_str = repo_a_path.to_str().unwrap();
+    let repo_b_str = repo_b_path.to_str().unwrap();
+    cmd!(
+        sh,
+        "{cfsctl} --repo {repo_b_str} --insecure oci copy artifact:v1 --from {repo_a_str} --name artifact:v1"
+    )
+    .run()
+    .context("cfsctl oci copy failed for artifact")?;
+
+    // ── Verify the artifact landed in repo B ──────────────────────────────
+    let repo_b = Repository::<Sha256HashValue>::open_path(rustix::fs::CWD, &repo_b_path)?;
+    let img_b = composefs_oci::oci_image::OciImage::open_ref(&repo_b, "artifact:v1")
+        .context("opening artifact:v1 in repo B")?;
+
+    ensure!(
+        !img_b.is_container_image(),
+        "copied artifact became a container image"
+    );
+    ensure!(
+        img_b.manifest_digest() == &manifest_digest,
+        "manifest digest mismatch after artifact copy"
+    );
+
+    Ok(())
+}
+integration_test!(test_copy_artifact);
diff --git a/crates/composefs-integration-tests/src/tests/mod.rs b/crates/composefs-integration-tests/src/tests/mod.rs
index 6a89a406..54242ec3 100644
--- a/crates/composefs-integration-tests/src/tests/mod.rs
+++ b/crates/composefs-integration-tests/src/tests/mod.rs
@@ -1,6 +1,7 @@
 //! Integration test modules, organized by execution environment.
 
 pub mod cli;
+pub mod copy_image;
 pub mod cstor;
 pub mod digest_stability;
 pub mod oci_compat;
diff --git a/crates/composefs-integration-tests/src/tests/privileged.rs b/crates/composefs-integration-tests/src/tests/privileged.rs
index a5c5f922..fd6c673d 100644
--- a/crates/composefs-integration-tests/src/tests/privileged.rs
+++ b/crates/composefs-integration-tests/src/tests/privileged.rs
@@ -639,19 +639,19 @@ integration_test!(privileged_pull_readonly_repo);
 /// Supports ext4 (with verity) and XFS (with reflinks).  The backing sparse
 /// file is 512 MB — large enough for a synthetic OCI image in
 /// containers-storage plus a composefs repo.
-struct LoopTempDir {
+pub(crate) struct LoopTempDir {
     mountpoint: PathBuf,
     _backing: tempfile::TempDir,
 }
 
 impl LoopTempDir {
     /// Create a loop-mounted ext4 filesystem with verity support.
-    fn ext4_verity() -> Result<Self> {
+    pub(crate) fn ext4_verity() -> Result<Self> {
         Self::create("mkfs.ext4", &["-q", "-O", "verity", "-b", "4096"])
     }
 
     /// Create a loop-mounted XFS filesystem with reflink support.
-    fn xfs_reflink() -> Result<Self> {
+    pub(crate) fn xfs_reflink() -> Result<Self> {
         Self::create("mkfs.xfs", &["-q", "-m", "reflink=1"])
     }
 
@@ -672,7 +672,7 @@ impl LoopTempDir {
         })
     }
 
-    fn path(&self) -> &Path {
+    pub(crate) fn path(&self) -> &Path {
         &self.mountpoint
     }
 }
diff --git a/crates/composefs-integration-tests/src/tests/varlink.rs b/crates/composefs-integration-tests/src/tests/varlink.rs
index aaa279b8..b73bcb28 100644
--- a/crates/composefs-integration-tests/src/tests/varlink.rs
+++ b/crates/composefs-integration-tests/src/tests/varlink.rs
@@ -1,8 +1,8 @@
 //! Integration tests for the `cfsctl` varlink RPC API.
 //!
 //! These drive the service the same way an external consumer would: they spawn
-//! `cfsctl varlink --address <socket>` (or `cfsctl oci varlink ...`) as a
-//! separate process and talk to it over the Unix socket.
+//! `cfsctl` as a separate process via systemd socket-activation and talk to it
+//! over a pre-bound Unix socket.
 //!
 //! ## Two clients, on purpose
 //!
@@ -28,9 +28,9 @@
 //! typed error variant. Avoid converting the whole suite to either side.
 //!
 //! A single service answers both the `org.composefs.Repository` and
-//! `org.composefs.Oci` interfaces on one socket, so the `repository()` and
-//! `oci()` spawn helpers below differ only in which CLI subcommand starts the
-//! (identical) combined service.
+//! `org.composefs.Oci` interfaces on one socket. The `repository()` and
+//! `oci()` spawn helpers are retained for historical reasons — both now use
+//! socket activation and serve the identical combined interface set.
 //!
 //! ## Handle-based API
 //!
@@ -43,7 +43,6 @@
 
 use std::path::Path;
 use std::process::Command;
-use std::time::{Duration, Instant};
 
 use anyhow::{Context, Result, bail};
 use composefs_ctl::varlink::oci::{OciError, OciInspectReply, PullProgress};
@@ -79,31 +78,24 @@ impl Drop for VarlinkService {
 }
 
 impl VarlinkService {
-    /// Spawn `cfsctl <service_args...> --address <sock>`, wait for the socket to
-    /// appear, then open `repo` via `OpenRepository` and cache its handle.
+    /// Spawn `cfsctl` via systemd socket-activation with a pre-bound listening
+    /// socket, then open `repo` via `OpenRepository` and cache its handle.
+    ///
+    /// The socket is already listening before the child starts, so no polling
+    /// is needed — the child reaches the socket via the inherited fd 3. The
+    /// socket path also exists on disk, so `varlinkctl` (which connects by
+    /// path) works unchanged.
     ///
     /// The varlink service opens no repository at startup, so the test repo is
     /// opened explicitly here; the returned handle is auto-injected into
     /// subsequent calls. The repository's insecure mode is auto-detected from
     /// its metadata, so no open flags are needed.
-    fn spawn(repo: &Path, service_args: &[&str]) -> Result<Self> {
-        let cfsctl = cfsctl()?;
-        let socket_dir = tempfile::tempdir()?;
-        let socket = socket_dir.path().join("varlink.sock");
-
-        let mut cmd = Command::new(&cfsctl);
-        cmd.args(service_args);
-        cmd.arg("--address").arg(&socket);
-        let child = cmd.spawn().context("spawning cfsctl varlink server")?;
-
-        // Wait (briefly) for the service to bind the socket.
-        let deadline = Instant::now() + Duration::from_secs(10);
-        while !socket.exists() {
-            if Instant::now() > deadline {
-                bail!("timed out waiting for varlink socket {}", socket.display());
-            }
-            std::thread::sleep(Duration::from_millis(20));
-        }
+    ///
+    /// Socket activation serves the full combined interface set
+    /// (`org.composefs.Repository` + `org.composefs.Oci`) regardless of which
+    /// of the historical CLI entry points would have been used.
+    fn spawn(repo: &Path) -> Result<Self> {
+        let (child, socket_dir, socket) = crate::spawn_activated_cfsctl()?;
 
         let handle = Self::open_repository(&socket, repo)?;
 
@@ -154,16 +146,19 @@ impl VarlinkService {
             .context("OpenRepository reply missing numeric 'handle'")
     }
 
-    /// Spawn the combined service via the top-level `varlink` subcommand.
+    /// Spawn the combined service (historically the `varlink` subcommand entry
+    /// point). Both `repository` and `oci` now use socket activation, which
+    /// serves the full combined interface set on one socket.
     fn repository(repo: &Path) -> Result<Self> {
-        Self::spawn(repo, &["varlink"])
+        Self::spawn(repo)
     }
 
-    /// Spawn the combined service via the `oci varlink` subcommand. Serves the
-    /// same interfaces as [`Self::repository`]; kept to exercise both CLI entry
-    /// points.
+    /// Spawn the combined service (historically the `oci varlink` subcommand
+    /// entry point). Kept so tests that call this entry point continue to
+    /// exercise a real service spawn; identical to [`Self::repository`] under
+    /// socket activation.
     fn oci(repo: &Path) -> Result<Self> {
-        Self::spawn(repo, &["oci", "varlink"])
+        Self::spawn(repo)
     }
 
     fn socket_str(&self) -> String {
diff --git a/crates/composefs-oci/Cargo.toml b/crates/composefs-oci/Cargo.toml
index 317ff197..513b5b0d 100644
--- a/crates/composefs-oci/Cargo.toml
+++ b/crates/composefs-oci/Cargo.toml
@@ -14,16 +14,18 @@ version.workspace = true
 default = ["containers-storage"]
 test = ["tar", "rand", "composefs/test"]
 boot = ["composefs-boot"]
-containers-storage = ["dep:cstorage", "dep:base64", "cstorage/userns-helper"]
-varlink = ['dep:zlink-core', 'composefs/varlink']
+containers-storage = ["dep:cstorage", "dep:base64", "varlink", "cstorage/layer-transfer"]
+varlink = ['dep:zlink', 'dep:zlink-core', 'composefs/varlink']
 
 [dependencies]
 anyhow = { version = "1.0.87", default-features = false }
 fn-error-context = "0.2"
 async-compression = { version = "0.4.0", default-features = false, features = ["tokio", "zstd", "gzip"] }
 base64 = { version = "0.22", default-features = false, features = ["std"], optional = true }
+composefs-splitdirfdstream = { path = "../composefs-splitdirfdstream", version = "0.7.0", features = ["tokio"] }
 bytes = { version = "1", default-features = false }
 composefs = { workspace = true }
+zlink = { workspace = true, optional = true }
 zlink-core = { workspace = true, optional = true }
 composefs-boot = { workspace = true, optional = true }
 flate2 = { version = "1.0", default-features = false, features = ["zlib"] }
diff --git a/crates/composefs-oci/src/cstor.rs b/crates/composefs-oci/src/cstor.rs
index 33004f5f..e3ba45f5 100644
--- a/crates/composefs-oci/src/cstor.rs
+++ b/crates/composefs-oci/src/cstor.rs
@@ -14,7 +14,7 @@
 //!
 //! When importing from containers-storage, we:
 //! 1. Open the storage and locate the image
-//! 2. For each layer, iterate through the tar-split metadata
+//! 2. For each layer, stream it via the `org.composefs.Oci` zlink service
 //! 3. For large files (> INLINE_CONTENT_MAX_V0), reflink directly to objects/
 //! 4. For small files, embed inline in the splitstream
 //! 5. Handle overlay whiteouts properly
@@ -38,7 +38,6 @@
 //! println!("Stats: {:?}", stats);
 //! ```
 
-use std::os::unix::fs::FileExt;
 use std::os::unix::io::OwnedFd;
 use std::sync::Arc;
 
@@ -46,30 +45,26 @@ use anyhow::{Context, Result};
 use base64::Engine;
 
 use composefs::{
-    INLINE_CONTENT_MAX_V0,
     fsverity::FsVerityHashValue,
-    repository::{ImportContext, ObjectStoreMethod, Repository},
+    repository::{ImportContext, Repository},
 };
 
 use cstorage::{
-    Image, Layer, ProxiedTarSplitItem, Storage, StorageProxy, TarSplitFdStream, TarSplitItem,
-    can_bypass_file_permissions,
+    CstorLayerService, Image, Layer, Storage, StorageProxy, can_bypass_file_permissions,
+    spawn_cstor_in_process,
 };
 
+use crate::varlink_types::{GetLayerParams, OciProxy as _, StorageLocator};
+
 // Re-export init_if_helper for consumers that need userns helper support
 pub use cstorage::init_if_helper;
 
-use crate::oci_image::manifest_identifier;
 use crate::progress::{ComponentId, ProgressEvent, ProgressUnit, SharedReporter};
-use crate::skopeo::{OCI_CONFIG_CONTENT_TYPE, OCI_MANIFEST_CONTENT_TYPE, TAR_LAYER_CONTENT_TYPE};
-use crate::{ContentAndVerity, ImportStats, OciDigest, config_identifier, layer_identifier};
+use crate::{ContentAndVerity, ImportStats, OciDigest, layer_identifier};
 
 /// Full result of a cstor import: manifest and config digests + verities.
 type CstorImportResult<ObjectID> = (ContentAndVerity<ObjectID>, ContentAndVerity<ObjectID>);
 
-/// Zero padding buffer for tar block alignment (512 bytes max needed).
-const ZERO_PADDING: [u8; 512] = [0u8; 512];
-
 /// Import a container image from containers-storage into the composefs repository.
 ///
 /// This function reads an image from the local containers-storage (podman/buildah)
@@ -102,29 +97,23 @@ pub async fn import_from_containers_storage<ObjectID: FsVerityHashValue>(
 ) -> Result<(CstorImportResult<ObjectID>, ImportStats)> {
     // Check if we can access files directly or need a proxy
     if can_bypass_file_permissions() {
-        // Direct access - use blocking implementation
-        let repo = Arc::clone(repo);
-        let image_id = image_id.to_owned();
-        let reference = reference.map(|s| s.to_owned());
+        // Direct access via in-process CstorLayerService.
         let storage_root = storage_root.map(|p| p.to_path_buf());
         let additional_image_stores: Vec<std::path::PathBuf> = additional_image_stores
             .iter()
             .map(|p| p.to_path_buf())
             .collect();
 
-        tokio::task::spawn_blocking(move || {
-            import_from_containers_storage_direct(
-                &repo,
-                &image_id,
-                reference.as_deref(),
-                zerocopy,
-                storage_root.as_deref(),
-                &additional_image_stores,
-                reporter,
-            )
-        })
+        import_from_containers_storage_direct(
+            repo,
+            image_id,
+            reference,
+            zerocopy,
+            storage_root.as_deref(),
+            &additional_image_stores,
+            reporter,
+        )
         .await
-        .context("spawn_blocking failed")?
     } else {
         // The proxied (rootless) path uses a userns helper process that does
         // its own storage discovery.  Explicit storage paths are not yet
@@ -138,71 +127,73 @@ pub async fn import_from_containers_storage<ObjectID: FsVerityHashValue>(
     }
 }
 
-/// Direct (privileged) implementation of containers-storage import.
+/// Resolved image metadata needed by the async layer-import loop.
+struct ResolvedImageLayers {
+    /// The `Image` handle (for finalize_import).
+    image: Image,
+    /// Per-layer: (storage_root_path_string, storage_layer_id, diff_id).
+    layers: Vec<(String, String, OciDigest)>,
+}
+
+/// Synchronous helper: open stores, find image, resolve layer IDs + diff_ids.
 ///
-/// All file I/O operations in this function are blocking, so it must be called
-/// from a blocking context (e.g., via `spawn_blocking`).
-fn import_from_containers_storage_direct<ObjectID: FsVerityHashValue>(
-    repo: &Arc<Repository<ObjectID>>,
+/// Runs entirely in blocking context.  We open the stores with explicit paths
+/// so that we can pass the path string to the `CstorLayerService` per layer.
+fn resolve_image_layers(
     image_id: &str,
-    reference: Option<&str>,
-    zerocopy: bool,
-    storage_root: Option<&std::path::Path>,
-    additional_image_stores: &[std::path::PathBuf],
-    reporter: SharedReporter,
-) -> Result<(CstorImportResult<ObjectID>, ImportStats)> {
-    let mut stats = ImportStats::default();
-    let mut ctx = ImportContext::default();
+    storage_root: Option<std::path::PathBuf>,
+    additional_image_stores: Vec<std::path::PathBuf>,
+) -> Result<ResolvedImageLayers> {
+    // Build an ordered list of store root paths to search. An explicit
+    // `storage_root` skips auto-discovery; the additional image stores are
+    // appended after the primary store(s) in either case.
+    let mut store_paths: Vec<String> = match &storage_root {
+        Some(root) => vec![root.to_string_lossy().into_owned()],
+        None => storage_search_paths(),
+    };
+    for p in &additional_image_stores {
+        store_paths.push(p.to_string_lossy().into_owned());
+    }
 
-    // Build the list of stores to search.  When an explicit root is given,
-    // skip auto-discovery entirely; otherwise discover from the standard
-    // locations and $STORAGE_OPTS.
-    let mut stores = if let Some(root) = storage_root {
-        vec![
-            Storage::open(root)
-                .with_context(|| format!("Failed to open storage root at {}", root.display()))?,
-        ]
-    } else {
-        // Auto-discover; tolerate failure only if extra stores are provided.
-        match Storage::discover_all() {
-            Ok(s) => s,
-            Err(e) if !additional_image_stores.is_empty() => {
-                tracing::warn!(
-                    "containers-storage auto-discovery failed ({e:#}), \
-                     using additional image stores only"
-                );
-                Vec::new()
-            }
-            Err(e) => return Err(e).context("Failed to discover containers-storage"),
+    // Open all stores.
+    let mut stores: Vec<(String, Storage)> = Vec::with_capacity(store_paths.len());
+    let mut open_errors: Vec<String> = Vec::new();
+    for path in &store_paths {
+        match Storage::open(path) {
+            Ok(s) => stores.push((path.clone(), s)),
+            Err(e) => open_errors.push(format!("{path}: {e:#}")),
         }
-    };
-    for path in additional_image_stores {
-        stores.push(Storage::open(path).with_context(|| {
-            format!(
-                "Failed to open additional image store at {}",
-                path.display()
-            )
-        })?);
+    }
+    if stores.is_empty() {
+        anyhow::bail!(
+            "Could not open any containers-storage root: {}",
+            open_errors.join("; ")
+        );
     }
 
-    // Search all stores for the image (primary first, then additional image stores)
-    let (_image_store, image) = stores
+    // Search all stores for the image.
+    let image = stores
         .iter()
-        .find_map(|s| {
+        .find_map(|(_path, s)| {
             Image::open(s, image_id)
                 .or_else(|_| s.find_image_by_name(image_id))
                 .ok()
-                .map(|img| (s, img))
         })
         .with_context(|| format!("Failed to find image {image_id} in any storage"))?;
 
-    // Get the storage layer IDs — layers may span stores (e.g. base layers
-    // in an additional image store, new layers in the primary)
+    // storage_layer_ids() takes &[Storage]; collect owned Storage values by
+    // re-opening the already-open stores (cheap: just another fd dup via cap-std).
+    // We re-open so that storage_layer_ids can have an owned &[Storage] slice.
+    let storage_vec: Vec<Storage> = stores
+        .iter()
+        .map(|(path, _s)| Storage::open(path))
+        .collect::<std::result::Result<Vec<_>, _>>()
+        .context("Failed to re-open stores for storage_layer_ids")?;
     let storage_layer_ids = image
-        .storage_layer_ids(&stores)
+        .storage_layer_ids(&storage_vec)
         .context("Failed to get storage layer IDs from image")?;
 
-    // Get the config to access diff_ids
+    // Get the config to access diff_ids.
     let config = image.config().context("Failed to read image config")?;
     let diff_ids: Vec<OciDigest> = config
         .rootfs()
@@ -211,7 +202,6 @@ fn import_from_containers_storage_direct<ObjectID: FsVerityHashValue>(
         .map(|s| s.parse::<OciDigest>().context("parsing diff_id"))
         .collect::<Result<_>>()?;
 
-    // Ensure layer count matches
     anyhow::ensure!(
         storage_layer_ids.len() == diff_ids.len(),
         "Layer count mismatch: {} layers in storage, {} diff_ids in config",
@@ -219,10 +209,88 @@ fn import_from_containers_storage_direct<ObjectID: FsVerityHashValue>(
         diff_ids.len()
     );
 
-    stats.layers = storage_layer_ids.len() as u64;
+    // For each layer, find which store path it lives in.
+    let mut layers = Vec::with_capacity(storage_layer_ids.len());
+    for (storage_layer_id, diff_id) in storage_layer_ids.into_iter().zip(diff_ids) {
+        let store_path = stores
+            .iter()
+            .find_map(|(path, s)| Layer::open(s, &storage_layer_id).ok().map(|_| path.clone()))
+            .with_context(|| format!("Could not find store containing layer {storage_layer_id}"))?;
+        layers.push((store_path, storage_layer_id, diff_id));
+    }
+
+    Ok(ResolvedImageLayers { image, layers })
+}
+
+/// Return the ordered list of storage root path strings to search, mirroring
+/// `Storage::discover_all()` internals so we can pair each store with a path.
+fn storage_search_paths() -> Vec<String> {
+    // TODO: Read graphroot and additional image dirs from storage.conf,
+    // and respect the CONTAINERS_STORAGE_CONF environment variable.
+    let mut paths = Vec::new();
+
+    if let Ok(root) = std::env::var("CONTAINERS_STORAGE_ROOT") {
+        paths.push(root);
+    }
+
+    if let Ok(home) = std::env::var("HOME") {
+        if let Ok(xdg) = std::env::var("XDG_DATA_HOME") {
+            paths.push(format!("{xdg}/containers/storage"));
+        }
+        paths.push(format!("{home}/.local/share/containers/storage"));
+    }
+
+    paths.push("/var/lib/containers/storage".to_string());
+
+    if let Ok(opts) = std::env::var("STORAGE_OPTS") {
+        for item in opts.split(',') {
+            let item = item.trim();
+            if let Some(p) = item.strip_prefix("additionalimagestore=") {
+                paths.push(p.to_string());
+            }
+        }
+    }
 
-    let mut layer_refs = Vec::with_capacity(storage_layer_ids.len());
-    for (storage_layer_id, diff_id) in storage_layer_ids.iter().zip(diff_ids.iter()) {
+    paths
+}
+
+/// Direct (privileged) async implementation of containers-storage import.
+///
+/// Layer discovery is done synchronously via `spawn_blocking`, then each
+/// layer is imported asynchronously through the in-process `CstorLayerService`
+/// (`org.composefs.Oci` zlink interface).
+async fn import_from_containers_storage_direct<ObjectID: FsVerityHashValue>(
+    repo: &Arc<Repository<ObjectID>>,
+    image_id: &str,
+    reference: Option<&str>,
+    zerocopy: bool,
+    storage_root: Option<&std::path::Path>,
+    additional_image_stores: &[std::path::PathBuf],
+    reporter: SharedReporter,
+) -> Result<(CstorImportResult<ObjectID>, ImportStats)> {
+    let mut stats = ImportStats::default();
+    let mut ctx = ImportContext::default();
+
+    // Resolve image layers synchronously (filesystem + JSON work).
+    let image_id_owned = image_id.to_owned();
+    let storage_root_owned = storage_root.map(|p| p.to_path_buf());
+    let additional_owned: Vec<std::path::PathBuf> = additional_image_stores.to_vec();
+    let resolved = tokio::task::spawn_blocking(move || {
+        resolve_image_layers(&image_id_owned, storage_root_owned, additional_owned)
+    })
+    .await
+    .context("spawn_blocking(resolve_image_layers) failed")??;
+
+    stats.layers = resolved.layers.len() as u64;
+
+    // Spawn the in-process CstorLayerService server (synchronous call, no .await).
+    // `server` keeps the server's dedicated thread alive for the whole layer loop;
+    // it is shut down explicitly after the layer loop.
+    let (mut client, server) =
+        spawn_cstor_in_process(CstorLayerService).context("Failed to spawn CstorLayerService")?;
+
+    let mut layer_refs = Vec::with_capacity(resolved.layers.len());
+    for (store_path, storage_layer_id, diff_id) in &resolved.layers {
         let content_id = layer_identifier(diff_id);
         let id = ComponentId::from(diff_id.to_string());
 
@@ -236,12 +304,16 @@ fn import_from_containers_storage_direct<ObjectID: FsVerityHashValue>(
                 total: None,
                 unit: ProgressUnit::Bytes,
             });
-            let (layer_store, layer) = stores
-                .iter()
-                .find_map(|s| Layer::open(s, storage_layer_id).ok().map(|l| (s, l)))
-                .with_context(|| format!("Failed to open layer {}", storage_layer_id))?;
-            let (verity, layer_stats) =
-                import_layer_direct(repo, layer_store, &layer, diff_id, zerocopy, &mut ctx)?;
+            let (verity, layer_stats) = import_layer_via_transfer(
+                repo,
+                &mut client,
+                store_path,
+                storage_layer_id,
+                diff_id,
+                zerocopy,
+                &mut ctx,
+            )
+            .await?;
             let bytes = layer_stats.new_bytes();
             stats.merge(&layer_stats);
             reporter.report(ProgressEvent::Done {
@@ -254,14 +326,138 @@ fn import_from_containers_storage_direct<ObjectID: FsVerityHashValue>(
         layer_refs.push((diff_id.clone(), layer_verity));
     }
 
+    // Drop the client so the server connection closes, then shut the server
+    // down.  The server's `run()` never returns on its own (the in-process
+    // listener pends forever after the first connection), so `shutdown` sends an
+    // explicit stop signal and reaps the thread on a blocking task — the latter
+    // is important so we don't park the caller's reactor, which must stay live
+    // to deliver that very stop signal to the server thread.
+    drop(client);
+    server.shutdown().await;
+
     reporter.report(ProgressEvent::Message("Layers imported".to_string()));
-    finalize_import(repo, &image, &layer_refs, reference, &reporter, stats)
+
+    // finalize_import does blocking repo work; run it on spawn_blocking.
+    let repo2 = Arc::clone(repo);
+    let image = resolved.image;
+    let reference_owned = reference.map(|s| s.to_owned());
+    let reporter2 = reporter.clone();
+    tokio::task::spawn_blocking(move || {
+        finalize_import(
+            &repo2,
+            &image,
+            &layer_refs,
+            reference_owned.as_deref(),
+            &reporter2,
+            stats,
+        )
+    })
+    .await
+    .context("spawn_blocking(finalize_import) failed")?
+}
+
+/// Import a single layer via the in-process `org.composefs.Oci` zlink service.
+///
+/// Calls `get_layer(0, GetLayerParams{storage:Some(StorageLocator{...})})`,
+/// collects all frames, then drains the `splitdirfdstream` pipe in a
+/// `spawn_blocking` closure while the server-side producer fills the pipe
+/// concurrently on its own thread.
+async fn import_layer_via_transfer<ObjectID: FsVerityHashValue>(
+    repo: &Arc<Repository<ObjectID>>,
+    client: &mut zlink::unix::Connection,
+    storage_path: &str,
+    storage_layer_id: &str,
+    diff_id: &OciDigest,
+    zerocopy: bool,
+    ctx: &mut ImportContext,
+) -> Result<(ObjectID, ImportStats)> {
+    // Call get_layer with the storage locator (handle=0; cstor service ignores it).
+    let params = GetLayerParams {
+        diff_id: None,
+        storage: Some(StorageLocator {
+            storage_path: storage_path.to_owned(),
+            layer_id: storage_layer_id.to_owned(),
+        }),
+    };
+    let stream = client
+        .get_layer(0, params)
+        .await
+        .with_context(|| format!("Oci.GetLayer RPC failed for {storage_layer_id}"))?;
+
+    // Collect all frames.  Each frame carries a batch of FDs; concatenate them
+    // in arrival order to reconstruct the full logical FD array:
+    //   index 0             : pipe read end
+    //   index 1..=dir_count : dirfds region (sparse — may include dummy slots)
+    //   index dir_count+1.. : opaque lifetime FDs (keepalive + optional extras)
+    let mut fds: Vec<OwnedFd> = Vec::new();
+    let mut reply_opt: Option<crate::varlink_types::GetLayerReply> = None;
+    {
+        use zlink::futures_util::StreamExt as _;
+        let mut stream = std::pin::pin!(stream);
+        while let Some(item) = stream.next().await {
+            let (result, frame_fds) =
+                item.with_context(|| format!("Oci.GetLayer stream error for {storage_layer_id}"))?;
+            let frame_reply = result.map_err(|e| anyhow::anyhow!("Oci.GetLayer error: {e:?}"))?;
+            reply_opt = Some(frame_reply);
+            fds.extend(frame_fds);
+        }
+    }
+    let reply = reply_opt.ok_or_else(|| anyhow::anyhow!("Oci.GetLayer yielded no frames"))?;
+
+    // At minimum: 1 pipe fd + dir_count slots + 1 keepalive fd.
+    anyhow::ensure!(
+        fds.len() >= reply.dir_count as usize + 2,
+        "Oci.GetLayer: expected at least {} fds (1 pipe + {} dirfd slots + 1 keepalive), got {}",
+        2 + reply.dir_count,
+        reply.dir_count,
+        fds.len()
+    );
+
+    // Split into: pipe_read | dir_fds[dir_count] | lifetime_fds...
+    let mut it = fds.into_iter();
+    let pipe_read: OwnedFd = it.next().expect("checked above: fds.len() >= 1");
+    let dir_fds: Vec<OwnedFd> = it.by_ref().take(reply.dir_count as usize).collect();
+    // Opaque lifetime tokens — hold open until drain completes, then drop to
+    // signal the server's producer that we are done consuming the layer.
+    let lifetime_fds: Vec<OwnedFd> = it.collect();
+
+    // Drain the pipe in a blocking task (producer runs concurrently on the
+    // server's spawn_blocking thread).  ImportContext is threaded through.
+    let repo_clone = Arc::clone(repo);
+    let diff_id_owned = diff_id.clone();
+    let ctx_moved = std::mem::take(ctx);
+
+    let (verity, layer_stats, ctx_returned) = tokio::task::spawn_blocking(move || {
+        // Hold all lifetime FDs for the full duration of the drain.
+        let _lifetime_fds = lifetime_fds;
+        crate::layer_sync::drain_splitdirfdstream(
+            repo_clone,
+            pipe_read,
+            dir_fds,
+            &diff_id_owned,
+            zerocopy,
+            ctx_moved,
+        )
+    })
+    .await
+    .context("spawn_blocking(drain_splitdirfdstream) failed")??;
+
+    *ctx = ctx_returned;
+
+    Ok((verity, layer_stats))
 }
 
 /// Proxied (rootless) implementation of containers-storage import.
 ///
-/// This spawns a helper process via `podman unshare` that can read all files
-/// in containers-storage, and communicates with it via Unix socket + fd passing.
+/// Mirrors `import_from_containers_storage_direct` exactly, but acquires the
+/// `org.composefs.Oci` zlink client from the userns helper subprocess instead
+/// of an in-process server.  Metadata resolution (layer IDs, diff_ids, config)
+/// runs in `spawn_blocking` using the same `resolve_image_layers` path as the
+/// direct path, so the two paths remain behaviorally identical.
+///
+/// Note: explicit `storage_root` / `additional_image_stores` are not supported
+/// here; the caller (`import_from_containers_storage`) already guards against
+/// that and bails before reaching this function.
 async fn import_from_containers_storage_proxied<ObjectID: FsVerityHashValue>(
     repo: &Arc<Repository<ObjectID>>,
     image_id: &str,
@@ -272,49 +468,27 @@ async fn import_from_containers_storage_proxied<ObjectID: FsVerityHashValue>(
     let mut stats = ImportStats::default();
     let mut ctx = ImportContext::default();
 
-    // Spawn the proxy helper
-    let mut proxy = StorageProxy::spawn()
-        .await
-        .context("Failed to spawn userns helper")?
-        .context("Expected proxy but got None")?;
-
-    // Discover storage paths (primary + additional from $STORAGE_OPTS)
-    let storage_paths = discover_storage_paths()?;
-
-    // Search all storage paths for the image
-    let mut image_info = None;
-    let mut found_storage_path = String::new();
-    for path in &storage_paths {
-        match proxy.get_image(path, image_id).await {
-            Ok(info) => {
-                found_storage_path = path.clone();
-                image_info = Some(info);
-                break;
-            }
-            Err(_) => continue,
-        }
-    }
-    let image_info =
-        image_info.with_context(|| format!("Failed to find image {} in any storage", image_id))?;
-    let storage_path = found_storage_path;
-
-    // Ensure layer count matches
-    anyhow::ensure!(
-        image_info.storage_layer_ids.len() == image_info.layer_diff_ids.len(),
-        "Layer count mismatch: {} layers in storage, {} diff_ids in config",
-        image_info.storage_layer_ids.len(),
-        image_info.layer_diff_ids.len()
-    );
+    // Resolve image layers synchronously (filesystem + JSON work).
+    // Pass None / empty for storage_root / additional: the caller already
+    // rejected those for rootless mode, so this is always auto-discovery.
+    let image_id_owned = image_id.to_owned();
+    let resolved =
+        tokio::task::spawn_blocking(move || resolve_image_layers(&image_id_owned, None, vec![]))
+            .await
+            .context("spawn_blocking(resolve_image_layers) failed")??;
 
-    stats.layers = image_info.storage_layer_ids.len() as u64;
+    stats.layers = resolved.layers.len() as u64;
 
-    let mut layer_refs = Vec::with_capacity(image_info.storage_layer_ids.len());
+    // Spawn the userns helper; it serves the org.composefs.Oci zlink service
+    // (CstorLayerService) over a Unix socket.  We drive it through
+    // `proxy.connection()` exactly as the direct path drives the in-process client.
+    let mut proxy = StorageProxy::spawn()
+        .await
+        .context("spawn userns helper")?
+        .context("expected helper but got None (can_bypass_file_permissions returned true?)")?;
 
-    for (storage_layer_id, diff_id) in image_info
-        .storage_layer_ids
-        .iter()
-        .zip(image_info.layer_diff_ids.iter())
-    {
+    let mut layer_refs = Vec::with_capacity(resolved.layers.len());
+    for (store_path, storage_layer_id, diff_id) in &resolved.layers {
         let content_id = layer_identifier(diff_id);
         let id = ComponentId::from(diff_id.to_string());
 
@@ -328,10 +502,10 @@ async fn import_from_containers_storage_proxied<ObjectID: FsVerityHashValue>(
                 total: None,
                 unit: ProgressUnit::Bytes,
             });
-            let (verity, layer_stats) = import_layer_proxied(
+            let (verity, layer_stats) = import_layer_via_transfer(
                 repo,
-                &mut proxy,
-                &storage_path,
+                proxy.connection(),
+                store_path,
                 storage_layer_id,
                 diff_id,
                 zerocopy,
@@ -350,30 +524,40 @@ async fn import_from_containers_storage_proxied<ObjectID: FsVerityHashValue>(
         layer_refs.push((diff_id.clone(), layer_verity));
     }
 
-    reporter.report(ProgressEvent::Message("Layers imported".to_string()));
-
-    // Config and manifest metadata don't have restrictive file permissions,
-    // so we can read them directly without the proxy.
-    let stores = Storage::discover_all().context("Failed to discover containers-storage")?;
-    let (_, image) = stores
-        .iter()
-        .find_map(|s| Image::open(s, &image_info.id).ok().map(|img| (s, img)))
-        .with_context(|| format!("Failed to open image {}", image_info.id))?;
+    // Shut down the helper before finalize_import: finalize reads config and
+    // manifest JSON directly (no restrictive permissions), so the helper is
+    // no longer needed.
+    proxy.shutdown().await.context("shutdown userns helper")?;
 
-    // Shutdown the proxy before the blocking finalization
-    proxy.shutdown().await.context("Failed to shutdown proxy")?;
+    reporter.report(ProgressEvent::Message("Layers imported".to_string()));
 
-    finalize_import(repo, &image, &layer_refs, reference, &reporter, stats)
+    // finalize_import does blocking repo work; run it on spawn_blocking,
+    // mirroring _direct exactly.
+    let repo2 = Arc::clone(repo);
+    let image = resolved.image;
+    let reference_owned = reference.map(|s| s.to_owned());
+    let reporter2 = reporter.clone();
+    tokio::task::spawn_blocking(move || {
+        finalize_import(
+            &repo2,
+            &image,
+            &layer_refs,
+            reference_owned.as_deref(),
+            &reporter2,
+            stats,
+        )
+    })
+    .await
+    .context("spawn_blocking(finalize_import) failed")?
 }
 
 /// Create config + manifest splitstreams, generate the EROFS image, and tag.
 ///
 /// This is the shared finalization step for both direct and proxied import
 /// paths. By this point all layers are already imported; this function:
-/// 1. Creates the config splitstream with layer references
-/// 2. Creates the manifest splitstream
-/// 3. Generates the composefs EROFS image and links it to the config
-/// 4. Tags the manifest if a reference was provided
+/// 1. Reads config JSON and manifest JSON from the containers-storage `Image`
+/// 2. Delegates to [`crate::layer_sync::finalize_oci_image`] for the repo work
+/// 3. Re-attaches the `ImportStats` for the caller
 fn finalize_import<ObjectID: FsVerityHashValue>(
     repo: &Arc<Repository<ObjectID>>,
     image: &Image,
@@ -388,325 +572,32 @@ fn finalize_import<ObjectID: FsVerityHashValue>(
     let config_json = image
         .read_metadata(&encoded_key)
         .context("Failed to read config bytes")?;
-    let config_digest = crate::sha256_content_digest(&config_json);
-    let content_id = config_identifier(&config_digest);
-
-    let config_verity = if let Some(existing) = repo.has_stream(&content_id)? {
-        reporter.report(ProgressEvent::Message(format!(
-            "Already have config {config_digest}"
-        )));
-        existing
-    } else {
-        reporter.report(ProgressEvent::Message(format!(
-            "Creating config splitstream {config_digest}"
-        )));
-        let mut writer = repo.create_stream(OCI_CONFIG_CONTENT_TYPE)?;
-
-        for (diff_id, verity) in layer_refs {
-            let key: &str = diff_id.as_ref();
-            writer.add_named_stream_ref(key, verity);
-        }
 
-        writer.write_external(&config_json)?;
-        repo.write_stream(writer, &content_id, None)?
-    };
+    reporter.report(ProgressEvent::Message(format!(
+        "Read config ({} bytes)",
+        config_json.len()
+    )));
 
-    // Create the manifest splitstream (matching the skopeo path)
+    // Read the raw manifest JSON bytes
     let manifest_json = image
         .read_manifest_raw()
         .context("Failed to read manifest bytes")?;
-    let manifest_digest = crate::sha256_content_digest(&manifest_json);
-
-    let manifest_content_id = manifest_identifier(&manifest_digest);
-    let manifest_verity = if let Some(existing) = repo.has_stream(&manifest_content_id)? {
-        reporter.report(ProgressEvent::Message(format!(
-            "Already have manifest {manifest_digest}"
-        )));
-        existing
-    } else {
-        reporter.report(ProgressEvent::Message(format!(
-            "Creating manifest splitstream {manifest_digest}"
-        )));
-        let mut writer = repo.create_stream(OCI_MANIFEST_CONTENT_TYPE)?;
-
-        let config_ref_key = format!("config:{config_digest}");
-        writer.add_named_stream_ref(&config_ref_key, &config_verity);
-
-        for (diff_id, verity) in layer_refs {
-            let key: &str = diff_id.as_ref();
-            writer.add_named_stream_ref(key, verity);
-        }
 
-        writer.write_external(&manifest_json)?;
-        repo.write_stream(writer, &manifest_content_id, None)?
-    };
+    reporter.report(ProgressEvent::Message(format!(
+        "Read manifest ({} bytes)",
+        manifest_json.len()
+    )));
 
-    // Generate the composefs EROFS image and tag the manifest.
-    // Skip if the image already has an EROFS ref (idempotent re-import).
-    let existing_erofs = crate::composefs_erofs_for_manifest(
+    let result = crate::layer_sync::finalize_oci_image(
         repo,
-        &manifest_digest,
-        Some(&manifest_verity),
-        repo.erofs_version(),
-    )?;
-    if existing_erofs.is_none() {
-        let erofs = crate::ensure_oci_composefs_erofs(
-            repo,
-            &manifest_digest,
-            Some(&manifest_verity),
-            reference,
-        )?;
-        if erofs.is_none() {
-            // Not a container image (unlikely for cstor, but handle consistently)
-            if let Some(name) = reference {
-                crate::oci_image::tag_image(repo, &manifest_digest, name)?;
-            }
-        }
-    } else if let Some(name) = reference {
-        crate::oci_image::tag_image(repo, &manifest_digest, name)?;
-    }
-
-    // Re-read verities: ensure_oci_composefs_erofs rewrites config and
-    // manifest splitstreams (adding the EROFS ref), so the verities captured
-    // above may be stale.
-    let config_verity = repo
-        .has_stream(&content_id)?
-        .context("config splitstream missing after finalization")?;
-    let manifest_verity = repo
-        .has_stream(&manifest_content_id)?
-        .context("manifest splitstream missing after finalization")?;
-
-    Ok((
-        (
-            (manifest_digest, manifest_verity),
-            (config_digest, config_verity),
-        ),
-        stats,
-    ))
-}
-
-/// Import a single layer directly (privileged mode).
-fn import_layer_direct<ObjectID: FsVerityHashValue>(
-    repo: &Arc<Repository<ObjectID>>,
-    storage: &Storage,
-    layer: &Layer,
-    diff_id: &OciDigest,
-    zerocopy: bool,
-    ctx: &mut ImportContext,
-) -> Result<(ObjectID, ImportStats)> {
-    let mut stats = ImportStats::default();
-    let mut inline_buf = Vec::new();
-
-    let mut stream = TarSplitFdStream::new(storage, layer)
-        .with_context(|| format!("Failed to create tar-split stream for layer {}", layer.id()))?;
-
-    let mut writer = repo.create_stream(TAR_LAYER_CONTENT_TYPE)?;
-    let content_id = layer_identifier(diff_id);
-
-    // Track padding from previous file - tar-split bundles padding with the NEXT
-    // file's header in Segment entries, but we need to write padding immediately
-    // after file content (like tar.rs does) for consistent splitstream output.
-    let mut prev_file_padding: usize = 0;
-
-    while let Some(item) = stream.next()? {
-        match item {
-            TarSplitItem::Segment(bytes) => {
-                // Skip the leading padding bytes (we already wrote them after prev file)
-                let header_bytes = &bytes[prev_file_padding..];
-                stats.bytes_inlined += header_bytes.len() as u64;
-                writer.write_inline(header_bytes);
-                prev_file_padding = 0;
-            }
-            TarSplitItem::FileContent { fd, size, name } => {
-                process_file_content(
-                    repo,
-                    &mut writer,
-                    &mut stats,
-                    ctx,
-                    fd,
-                    size,
-                    &name,
-                    zerocopy,
-                    &mut inline_buf,
-                )?;
-
-                // Write padding inline immediately after file content
-                let padding_size = (size as usize).next_multiple_of(512) - size as usize;
-                if padding_size > 0 {
-                    stats.bytes_inlined += padding_size as u64;
-                    writer.write_inline(&ZERO_PADDING[..padding_size]);
-                }
-                prev_file_padding = padding_size;
-            }
-        }
-    }
-
-    // Write the stream with the content identifier
-    let verity = repo.write_stream(writer, &content_id, None)?;
-    Ok((verity, stats))
-}
-
-/// Import a single layer via the proxy (rootless mode).
-async fn import_layer_proxied<ObjectID: FsVerityHashValue>(
-    repo: &Arc<Repository<ObjectID>>,
-    proxy: &mut StorageProxy,
-    storage_path: &str,
-    layer_id: &str,
-    diff_id: &OciDigest,
-    zerocopy: bool,
-    ctx: &mut ImportContext,
-) -> Result<(ObjectID, ImportStats)> {
-    let mut stats = ImportStats::default();
-    let mut inline_buf = Vec::new();
-
-    let mut writer = repo.create_stream(TAR_LAYER_CONTENT_TYPE)?;
-    let content_id = layer_identifier(diff_id);
-
-    // Track padding from previous file - tar-split bundles padding with the NEXT
-    // file's header in Segment entries, but we need to write padding immediately
-    // after file content (like tar.rs does) for consistent splitstream output.
-    let mut prev_file_padding: usize = 0;
-
-    // Stream the layer via the proxy
-    let mut stream = proxy
-        .stream_layer(storage_path, layer_id)
-        .await
-        .with_context(|| format!("Failed to start streaming layer {}", layer_id))?;
-
-    while let Some(item) = stream
-        .next()
-        .await
-        .with_context(|| format!("Failed to receive stream item for layer {}", layer_id))?
-    {
-        match item {
-            ProxiedTarSplitItem::Segment(bytes) => {
-                // Skip the leading padding bytes (we already wrote them after prev file)
-                let header_bytes = &bytes[prev_file_padding..];
-                stats.bytes_inlined += header_bytes.len() as u64;
-                writer.write_inline(header_bytes);
-                prev_file_padding = 0;
-            }
-            ProxiedTarSplitItem::FileContent { fd, size, name } => {
-                process_file_content(
-                    repo,
-                    &mut writer,
-                    &mut stats,
-                    ctx,
-                    fd,
-                    size,
-                    &name,
-                    zerocopy,
-                    &mut inline_buf,
-                )?;
-
-                // Write padding inline immediately after file content
-                let padding_size = (size as usize).next_multiple_of(512) - size as usize;
-                if padding_size > 0 {
-                    stats.bytes_inlined += padding_size as u64;
-                    writer.write_inline(&ZERO_PADDING[..padding_size]);
-                }
-                prev_file_padding = padding_size;
-            }
-        }
-    }
-
-    // Write the stream with the content identifier
-    let verity = repo.write_stream(writer, &content_id, None)?;
-    Ok((verity, stats))
-}
-
-/// Process file content (shared between direct and proxied modes).
-#[allow(clippy::too_many_arguments)]
-fn process_file_content<ObjectID: FsVerityHashValue>(
-    repo: &Arc<Repository<ObjectID>>,
-    writer: &mut composefs::splitstream::SplitStreamWriter<ObjectID>,
-    stats: &mut ImportStats,
-    ctx: &mut ImportContext,
-    fd: OwnedFd,
-    size: u64,
-    name: &str,
-    zerocopy: bool,
-    inline_buf: &mut Vec<u8>,
-) -> Result<()> {
-    // Convert fd to File for operations
-    let file = std::fs::File::from(fd);
-
-    if size as usize > INLINE_CONTENT_MAX_V0 {
-        // Large file: store as external object
-        let (object_id, method) = if zerocopy {
-            repo.ensure_object_from_file_zerocopy(&file, size, ctx)
-        } else {
-            repo.ensure_object_from_file(&file, size, ctx)
-        }
-        .with_context(|| format!("Failed to store object for {}", name))?;
-
-        match method {
-            ObjectStoreMethod::Reflinked => {
-                stats.objects_reflinked += 1;
-                stats.bytes_reflinked += size;
-            }
-            ObjectStoreMethod::Hardlinked => {
-                stats.objects_hardlinked += 1;
-                stats.bytes_hardlinked += size;
-            }
-            ObjectStoreMethod::Copied => {
-                stats.objects_copied += 1;
-                stats.bytes_copied += size;
-            }
-            ObjectStoreMethod::AlreadyPresent => {
-                stats.objects_already_present += 1;
-            }
-        }
-
-        writer.add_external_size(size);
-        writer.write_reference(object_id)?;
-    } else {
-        // Small file: read and embed inline (reuse buffer across calls)
-        inline_buf.resize(size as usize, 0);
-        file.read_exact_at(inline_buf, 0)?;
-        stats.bytes_inlined += size;
-        writer.write_inline(inline_buf);
-    }
-
-    Ok(())
-}
-
-/// Discover storage paths: the primary store plus any additional image stores
-/// from `$STORAGE_OPTS`.
-fn discover_storage_paths() -> Result<Vec<String>> {
-    let mut paths = Vec::new();
-
-    // Try user storage first (rootless podman)
-    if let Ok(home) = std::env::var("HOME") {
-        let user_path = format!("{}/.local/share/containers/storage", home);
-        if std::path::Path::new(&user_path).exists() {
-            paths.push(user_path);
-        }
-    }
-
-    // Fall back to system storage
-    let system_path = "/var/lib/containers/storage";
-    if std::path::Path::new(system_path).exists() {
-        paths.push(system_path.to_string());
-    }
-
-    // Also check $STORAGE_OPTS for additional image stores
-    if let Ok(opts) = std::env::var("STORAGE_OPTS") {
-        for item in opts.split(',') {
-            let item = item.trim();
-            if let Some(path) = item.strip_prefix("additionalimagestore=")
-                && std::path::Path::new(path).exists()
-            {
-                paths.push(path.to_string());
-            }
-        }
-    }
-
-    anyhow::ensure!(
-        !paths.is_empty(),
-        "Could not find containers-storage at standard locations"
-    );
-    Ok(paths)
+        &manifest_json,
+        &config_json,
+        layer_refs,
+        reference,
+    )
+    .context("finalize_oci_image")?;
+
+    Ok((result, stats))
 }
 
 /// Check if an image reference uses the containers-storage transport.
diff --git a/crates/composefs-oci/src/layer_sync.rs b/crates/composefs-oci/src/layer_sync.rs
new file mode 100644
index 00000000..09cc8e9e
--- /dev/null
+++ b/crates/composefs-oci/src/layer_sync.rs
@@ -0,0 +1,1206 @@
+//! Shared, content-agnostic splitdirfdstream producer and consumer for layer
+//! synchronisation.
+//!
+//! This module holds the blocking logic for producing a `splitdirfdstream` from a
+//! stored layer and for draining one back into a repository.  It depends only on
+//! the (tiny) `composefs-splitdirfdstream` crate, not the containers-storage /
+//! podman stack, so it is always compiled — including in builds that do not
+//! enable the `containers-storage` feature.
+//!
+//! Callers:
+//! * `crates/composefs-oci/src/cstor.rs` — the containers-storage import path
+//!   (only built with the `containers-storage` feature).
+//! * `composefs-ctl` varlink server — repo-to-repo layer synchronisation via
+//!   `GetLayer`/`PutLayer`, and the `oci copy` command.
+
+use std::os::unix::fs::FileExt;
+use std::os::unix::io::OwnedFd;
+use std::sync::Arc;
+
+use anyhow::{Context, Result};
+use cap_std_ext::cap_std;
+use composefs_splitdirfdstream::{Chunk, SplitdirfdstreamReader, SplitdirfdstreamWriter};
+use rustix::fs::{MemfdFlags, fstat, memfd_create};
+use sha2::{Digest as _, Sha256};
+
+use composefs::{
+    INLINE_CONTENT_MAX_V0,
+    fsverity::FsVerityHashValue,
+    repository::{ImportContext, ObjectStoreMethod, Repository},
+    splitstream::{SplitStreamData, SplitStreamWriter},
+};
+
+use crate::skopeo::TAR_LAYER_CONTENT_TYPE;
+use crate::{ImportStats, OciDigest, layer_identifier, sha256_output_to_digest};
+
+/// Errors returned by the diff-id-verifying drain path.
+///
+/// Separates the integrity-check failure (wrong content) from I/O or
+/// repository errors so that callers can surface a clear diagnostic.
+#[derive(Debug, thiserror::Error)]
+pub enum VerifiedDrainError {
+    /// The reconstructed layer content does not match the declared `diff_id`.
+    #[error("layer content does not match declared diff_id: expected {expected}, got {actual}")]
+    DiffIdMismatch {
+        /// The diff_id that was declared by the sender.
+        expected: String,
+        /// The sha256 of the data that was actually received.
+        actual: String,
+    },
+    /// Any other I/O or repository error.
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+/// Verify that `fd` is a directory before using it as a base for `openat`.
+///
+/// If it is not a directory, returns a detailed error showing the fd index,
+/// file type bits, and size — making it easy to distinguish real diff-dirs
+/// from dummy `/dev/null` or `memfd` fds that ended up at the wrong slot.
+fn assert_is_dir(fd: &impl rustix::fd::AsFd, slot: u32, name: &str) -> anyhow::Result<()> {
+    use rustix::fs::FileType;
+    let st = fstat(fd).with_context(|| format!("fstat overlay_dir[{slot}]"))?;
+    let ft = FileType::from_raw_mode(st.st_mode);
+    if ft != FileType::Directory {
+        anyhow::bail!(
+            "overlay_dir[{slot}] is not a directory (file type: {ft:?}, \
+             mode={:#o}, size={}) — cannot open {name:?}; \
+             this is likely a bug where a dummy fd (e.g. /dev/null or memfd) \
+             was stored at a slot that should hold a real diff-dir fd",
+            st.st_mode,
+            st.st_size
+        );
+    }
+    Ok(())
+}
+
+/// Drain the `splitdirfdstream` pipe and write the resulting layer splitstream.
+///
+/// This is the blocking half of a layer-import operation — it should be run on a
+/// `spawn_blocking` thread so the pipe can fill concurrently with the producer.
+///
+/// # Arguments
+/// * `repo` — Repository to import into.
+/// * `pipe_read` — Read end of the splitdirfdstream pipe.
+/// * `dir_fds` — Sparse dirfds region passed wholesale by the transport layer.
+///   `dir_fds[dirfd_index]` resolves the real diff-directory fd for each
+///   [`Chunk::FileBackedData`] chunk.  Dummy fds at gap slots are never opened.
+/// * `diff_id` — OCI diff-id of the layer (used as the stream content identifier).
+/// * `zerocopy` — If `true`, use reflink/zerocopy when storing large objects.
+/// * `ctx` — Import context (passed back to the caller on success).
+///
+/// # Returns
+/// A tuple of `(verity_hash, import_stats, import_context)` on success.
+pub fn drain_splitdirfdstream<ObjectID: FsVerityHashValue>(
+    repo: Arc<Repository<ObjectID>>,
+    pipe_read: OwnedFd,
+    dir_fds: Vec<OwnedFd>,
+    diff_id: &OciDigest,
+    zerocopy: bool,
+    mut ctx: ImportContext,
+) -> Result<(ObjectID, ImportStats, ImportContext)> {
+    // Wrap each fd as a cap_std Dir. Dummy fds (at sparse gap slots) are never
+    // opened by the logic below; only the slot named by dirfd_index is accessed.
+    let overlay_dirs: Vec<cap_std::fs::Dir> =
+        dir_fds.into_iter().map(cap_std::fs::Dir::from).collect();
+
+    let mut writer = repo.create_stream(TAR_LAYER_CONTENT_TYPE)?;
+    let content_id = layer_identifier(diff_id);
+    let mut reader = SplitdirfdstreamReader::new(std::fs::File::from(pipe_read));
+    let mut inline_buf = Vec::new();
+    let mut stats = ImportStats::default();
+
+    drain_splitdirfdstream_inner(
+        &repo,
+        &mut writer,
+        &mut reader,
+        &overlay_dirs,
+        zerocopy,
+        &mut stats,
+        &mut ctx,
+        &mut inline_buf,
+        None,
+    )?;
+
+    let verity = repo.write_stream(writer, &content_id, None)?;
+    Ok((verity, stats, ctx))
+}
+
+#[allow(clippy::too_many_arguments)]
+fn drain_splitdirfdstream_inner<ObjectID: FsVerityHashValue>(
+    repo: &Arc<Repository<ObjectID>>,
+    writer: &mut SplitStreamWriter<ObjectID>,
+    reader: &mut SplitdirfdstreamReader<std::fs::File>,
+    overlay_dirs: &[cap_std::fs::Dir],
+    zerocopy: bool,
+    stats: &mut ImportStats,
+    ctx: &mut ImportContext,
+    inline_buf: &mut Vec<u8>,
+    mut hasher: Option<&mut Sha256>,
+) -> Result<()> {
+    while let Some(chunk) = reader.next_chunk().context("splitdirfdstream read error")? {
+        match chunk {
+            Chunk::Metadata(data) => {
+                if let Some(ref mut h) = hasher {
+                    h.update(data);
+                }
+                stats.bytes_inlined += data.len() as u64;
+                writer.write_inline(data);
+            }
+            Chunk::InlineData(data) => {
+                // Non-world-readable file transported inline by the producer.
+                // We already have the bytes — no fd needed for the small case.
+                if let Some(ref mut h) = hasher {
+                    h.update(data);
+                }
+                let length = data.len() as u64;
+                if should_inline(length) {
+                    stats.bytes_inlined += length;
+                    writer.write_inline(data);
+                } else {
+                    // Large file: store as an external object. A memfd is the
+                    // lightest way to hand an fd to process_file_content.
+                    process_file_content(
+                        repo,
+                        writer,
+                        stats,
+                        ctx,
+                        file_content_to_memfd(data)?,
+                        length,
+                        "<file-content>",
+                        zerocopy,
+                        inline_buf,
+                    )?;
+                }
+            }
+            Chunk::FileBackedData {
+                dirfd_index,
+                length,
+                filename,
+            } => {
+                let name = std::str::from_utf8(filename).with_context(|| {
+                    format!("non-utf8 filename in splitdirfdstream: {filename:?}")
+                })?;
+                let dir = overlay_dirs.get(dirfd_index as usize).with_context(|| {
+                    format!(
+                        "dirfd_index {dirfd_index} out of range (dir_fds.len={})",
+                        overlay_dirs.len()
+                    )
+                })?;
+                assert_is_dir(dir, dirfd_index, name)?;
+                let fd = dir
+                    .open(name)
+                    .map(OwnedFd::from)
+                    .with_context(|| format!("open {name:?} in overlay dir[{dirfd_index}]"))?;
+
+                use rustix::fs::FileType;
+                let st = fstat(&fd).with_context(|| format!("fstat {name:?}"))?;
+                let ft = FileType::from_raw_mode(st.st_mode);
+                if ft != FileType::RegularFile {
+                    anyhow::bail!(
+                        "object {name:?} in overlay dir[{dirfd_index}] is not a regular file (file type: {ft:?}, mode={:#o})",
+                        st.st_mode
+                    );
+                }
+
+                let fd_to_process = if let Some(ref mut h) = hasher {
+                    // Defend against a producer that declares a `length` that
+                    // disagrees with the actual object size.  The reflink store
+                    // path already rejects a mismatch, but the copy fallback would
+                    // store the whole file while we only hashed `length` bytes,
+                    // so the committed object could differ from the verified
+                    // bytes.  Pin the two together by requiring length == size.
+                    let obj_file = std::fs::File::from(fd);
+                    let actual_size = st.st_size as u64;
+                    if actual_size != length {
+                        anyhow::bail!(
+                            "object {name}: declared length {length} != actual size {actual_size}"
+                        );
+                    }
+
+                    // Hash the object file content using positioned reads.  This
+                    // does not disturb the fd cursor, so process_file_content can
+                    // independently read/reflink the same fd afterwards.
+                    hash_fd_contents(&obj_file, length, h)
+                        .with_context(|| format!("hashing object {name}"))?;
+
+                    obj_file.into()
+                } else {
+                    fd
+                };
+
+                process_file_content(
+                    repo,
+                    writer,
+                    stats,
+                    ctx,
+                    fd_to_process,
+                    length,
+                    name,
+                    zerocopy,
+                    inline_buf,
+                )?;
+                // NOTE: padding after the file content is already emitted by the
+                // producer as a following Inline chunk; do NOT add extra padding here.
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Materialise `data` into an anonymous `memfd` for use with
+/// [`process_file_content`].
+///
+/// The memfd is created `CLOEXEC` and the data written at offset 0.
+/// `process_file_content` uses positioned reads (`pread`/`read_at`) so the
+/// write cursor position does not matter.
+fn file_content_to_memfd(data: &[u8]) -> Result<OwnedFd> {
+    let memfd = memfd_create(c"composefs-filecontent", MemfdFlags::CLOEXEC)
+        .context("memfd_create for FileContent chunk")?;
+    rustix::io::write(&memfd, data).context("writing FileContent to memfd")?;
+    Ok(memfd)
+}
+
+/// Hash `len` bytes of `fd` starting at offset 0 into `hasher`, using
+/// positioned reads so the fd cursor is not disturbed.
+///
+/// Uses [`FileExt::read_at`] in a loop with a 64 KiB stack buffer to
+/// avoid allocating for large objects.
+fn hash_fd_contents(fd: &std::fs::File, len: u64, hasher: &mut Sha256) -> Result<()> {
+    const BUF_SIZE: usize = 65536;
+    let mut buf = [0u8; BUF_SIZE];
+    let mut remaining = len;
+    let mut offset = 0u64;
+
+    while remaining > 0 {
+        let to_read = remaining.min(BUF_SIZE as u64) as usize;
+        let n = fd
+            .read_at(&mut buf[..to_read], offset)
+            .context("read_at while hashing fd contents")?;
+        if n == 0 {
+            anyhow::bail!(
+                "unexpected EOF at offset {offset} hashing fd (expected {len} bytes total)"
+            );
+        }
+        hasher.update(&buf[..n]);
+        offset += n as u64;
+        remaining -= n as u64;
+    }
+    Ok(())
+}
+
+/// Like [`drain_splitdirfdstream`], but additionally verifies that the
+/// reconstructed (uncompressed tar) content hashes to `diff_id`, and only
+/// commits the layer splitstream to the repository if it matches.
+///
+/// On mismatch the partially-written stream is discarded and
+/// [`VerifiedDrainError::DiffIdMismatch`] is returned.
+///
+/// # Integrity model
+///
+/// The diff_id is the sha256 of the uncompressed tar bytes — the same byte
+/// stream that `cat` on the splitstream produces.  As chunks are processed,
+/// every logical byte is fed into a running SHA-256 hasher:
+/// * **Inline** chunks: hash the raw bytes verbatim.
+/// * **External** chunks: hash the full object file (all `length` bytes) via
+///   positioned reads, then pass the same fd to [`process_file_content`].
+///   Using positioned reads (`read_at`) means the fd cursor is not consumed,
+///   so `process_file_content` can read the file independently.
+///
+/// # Note on partial objects
+///
+/// If verification fails, large external objects that were already written to
+/// the `objects/` directory of the destination repo are left in place.  They
+/// are orphaned (not referenced by any committed splitstream) and will be
+/// reclaimed by the next GC run.  We do NOT attempt to clean them up here
+/// because doing so correctly (without racing with concurrent imports) would
+/// be complex and the GC already handles this case.
+pub fn drain_splitdirfdstream_verified<ObjectID: FsVerityHashValue>(
+    repo: Arc<Repository<ObjectID>>,
+    pipe_read: OwnedFd,
+    dir_fds: Vec<OwnedFd>,
+    diff_id: &OciDigest,
+    zerocopy: bool,
+    mut ctx: ImportContext,
+) -> Result<(ObjectID, ImportStats, ImportContext), VerifiedDrainError> {
+    // We hash with SHA-256, which is the only algorithm OCI diff-ids use in
+    // practice (and the only one the rest of this crate assumes). Reject other
+    // algorithms up front with a clear error rather than producing a confusing
+    // "mismatch" between a sha256 hash and e.g. a sha512 diff-id string.
+    let algorithm = diff_id.algorithm().as_ref();
+    if algorithm != "sha256" {
+        return Err(VerifiedDrainError::Other(anyhow::anyhow!(
+            "unsupported diff_id algorithm {algorithm:?}: only sha256 is supported"
+        )));
+    }
+
+    // Wrap each fd as a cap_std Dir. Dummy fds at sparse gap slots are never
+    // opened by the logic below; only the slot named by dirfd_index is accessed.
+    let overlay_dirs: Vec<cap_std::fs::Dir> =
+        dir_fds.into_iter().map(cap_std::fs::Dir::from).collect();
+
+    let mut writer = repo
+        .create_stream(TAR_LAYER_CONTENT_TYPE)
+        .context("create_stream")?;
+    let content_id = layer_identifier(diff_id);
+    let mut reader = SplitdirfdstreamReader::new(std::fs::File::from(pipe_read));
+    let mut inline_buf = Vec::new();
+    let mut stats = ImportStats::default();
+    let mut hasher = Sha256::new();
+
+    drain_splitdirfdstream_inner(
+        &repo,
+        &mut writer,
+        &mut reader,
+        &overlay_dirs,
+        zerocopy,
+        &mut stats,
+        &mut ctx,
+        &mut inline_buf,
+        Some(&mut hasher),
+    )?;
+
+    // Verify the accumulated hash against the declared diff_id.
+    let actual_digest = sha256_output_to_digest(hasher.finalize());
+    let actual_str = actual_digest.to_string();
+    let expected_str = diff_id.to_string();
+    if actual_str != expected_str {
+        // Drop `writer` without calling write_stream: the stream is not
+        // committed.  Large objects already written to objects/ are orphaned
+        // but GC will reclaim them.
+        return Err(VerifiedDrainError::DiffIdMismatch {
+            expected: expected_str,
+            actual: actual_str,
+        });
+    }
+
+    let verity = repo
+        .write_stream(writer, &content_id, None)
+        .context("write_stream")?;
+    Ok((verity, stats, ctx))
+}
+
+/// Decide whether a file of the given `size` should be stored inline in the
+/// splitstream rather than as a separate object in the object store.
+///
+/// Files with `size <= INLINE_CONTENT_MAX_V0` are embedded directly in the
+/// splitstream; larger files become external objects.
+pub(crate) fn should_inline(size: u64) -> bool {
+    (size as usize) <= INLINE_CONTENT_MAX_V0
+}
+
+/// Store the content of one file fd into the splitstream, choosing inline vs
+/// external storage based on the file size.
+///
+/// For files at or below [`composefs::INLINE_CONTENT_MAX_V0`] bytes the
+/// content is read into `inline_buf` and embedded directly in `writer`.
+/// Larger files are stored as external objects in `repo` and referenced
+/// by hash.
+#[allow(clippy::too_many_arguments)]
+pub fn process_file_content<ObjectID: FsVerityHashValue>(
+    repo: &Arc<Repository<ObjectID>>,
+    writer: &mut SplitStreamWriter<ObjectID>,
+    stats: &mut ImportStats,
+    ctx: &mut ImportContext,
+    fd: OwnedFd,
+    size: u64,
+    name: &str,
+    zerocopy: bool,
+    inline_buf: &mut Vec<u8>,
+) -> Result<()> {
+    // Convert fd to File for operations
+    let file = std::fs::File::from(fd);
+
+    if !should_inline(size) {
+        // Large file: store as external object
+        let (object_id, method) = if zerocopy {
+            repo.ensure_object_from_file_zerocopy(&file, size, ctx)
+        } else {
+            repo.ensure_object_from_file(&file, size, ctx)
+        }
+        .with_context(|| format!("Failed to store object for {}", name))?;
+
+        match method {
+            ObjectStoreMethod::Reflinked => {
+                stats.objects_reflinked += 1;
+                stats.bytes_reflinked += size;
+            }
+            ObjectStoreMethod::Hardlinked => {
+                stats.objects_hardlinked += 1;
+                stats.bytes_hardlinked += size;
+            }
+            ObjectStoreMethod::Copied => {
+                stats.objects_copied += 1;
+                stats.bytes_copied += size;
+            }
+            ObjectStoreMethod::AlreadyPresent => {
+                stats.objects_already_present += 1;
+            }
+        }
+
+        writer.add_external_size(size);
+        writer.write_reference(object_id)?;
+    } else {
+        // Small file: read and embed inline (reuse buffer across calls)
+        inline_buf.resize(size as usize, 0);
+        file.read_exact_at(inline_buf, 0)?;
+        stats.bytes_inlined += size;
+        writer.write_inline(inline_buf);
+    }
+
+    Ok(())
+}
+
+/// Return the on-disk size of the object identified by `id` in `repo`.
+///
+/// Opens the object file and `fstat`s it to obtain the size without reading
+/// the full content.  This is used by [`produce_layer_splitdirfdstream`] to
+/// populate the `length` field of external chunks in the output stream.
+fn object_size<ObjectID: FsVerityHashValue>(
+    repo: &Repository<ObjectID>,
+    id: &ObjectID,
+) -> Result<u64> {
+    let fd = repo
+        .open_object(id)
+        .context("Opening object for size query")?;
+    let stat = fstat(&fd).context("fstat on object fd")?;
+    Ok(stat.st_size as u64)
+}
+
+/// Produce a `splitdirfdstream` for the layer splitstream identified by
+/// `layer_verity`, writing it to `out`.
+///
+/// External objects are emitted as references of the form `"xx/yyyy"` beneath
+/// the repository's `objects/` directory.  The consumer must supply the objects
+/// directory at `dirfd_index` within the sparse dirfds region (obtained from
+/// [`composefs_splitdirfdstream::build_layer_fd_layout`]'s `real_indices[0]`).
+///
+/// This is the read/serve side that mirrors [`drain_splitdirfdstream`]: given
+/// the same repo, a stream produced here and then passed through
+/// [`composefs_splitdirfdstream::reconstruct`] with the repo's objects dir will
+/// yield byte-identical output to calling
+/// [`composefs::splitstream::SplitStreamReader::cat`] on the same layer.
+///
+/// # Arguments
+/// * `repo`              — Repository that holds the layer and its objects.
+/// * `layer_verity`      — fs-verity hash of the layer splitstream to serve.
+/// * `objects_dirfd_index` — Slot index within the sparse dirfds region where
+///   the repository objects directory fd is placed (from `real_indices[0]`).
+/// * `out`               — Destination writer for the produced `splitdirfdstream`.
+pub fn produce_layer_splitdirfdstream<ObjectID: FsVerityHashValue, W: std::io::Write>(
+    repo: &Repository<ObjectID>,
+    layer_verity: &ObjectID,
+    objects_dirfd_index: u32,
+    out: W,
+) -> Result<()> {
+    let mut reader = repo
+        .open_stream("", Some(layer_verity), Some(TAR_LAYER_CONTENT_TYPE))
+        .context("Opening layer splitstream")?;
+    let mut writer = SplitdirfdstreamWriter::new(out);
+
+    reader
+        .for_each_chunk(|chunk| {
+            match chunk {
+                SplitStreamData::Inline(data) => {
+                    writer.write_metadata(&data).map_err(anyhow::Error::from)?;
+                }
+                SplitStreamData::External(id) => {
+                    let size = object_size(repo, &id)?;
+                    let pathname = id.to_object_pathname();
+                    writer
+                        .write_file_backed_data(objects_dirfd_index, size, pathname.as_bytes())
+                        .map_err(anyhow::Error::from)?;
+                }
+            }
+            Ok(())
+        })
+        .context("Walking layer splitstream chunks")?;
+
+    writer.finish().map_err(anyhow::Error::from)?;
+    Ok(())
+}
+
+/// A type alias for the (manifest, config) digest-and-verity pair returned by
+/// [`finalize_oci_image`].
+///
+/// The first element is `(manifest_digest, manifest_verity)`;
+/// the second is `(config_digest, config_verity)`.
+pub type FinalizeResult<ObjectID> = (
+    crate::ContentAndVerity<ObjectID>,
+    crate::ContentAndVerity<ObjectID>,
+);
+
+/// Finalize an OCI image whose layers are already imported into `repo`.
+///
+/// Given the raw manifest and config JSON (exact bytes, so their sha256
+/// digests are preserved) and the ordered (diff_id, layer_verity) pairs,
+/// this writes the config and manifest splitstreams, generates the composefs
+/// EROFS image, and optionally tags the manifest under `name`. Idempotent.
+///
+/// Returns `((manifest_digest, manifest_verity), (config_digest, config_verity))`.
+pub fn finalize_oci_image<ObjectID: FsVerityHashValue>(
+    repo: &Arc<Repository<ObjectID>>,
+    manifest_json: &[u8],
+    config_json: &[u8],
+    layer_refs: &[(OciDigest, ObjectID)],
+    name: Option<&str>,
+) -> anyhow::Result<FinalizeResult<ObjectID>> {
+    use crate::oci_image::manifest_identifier;
+    use crate::skopeo::{OCI_CONFIG_CONTENT_TYPE, OCI_MANIFEST_CONTENT_TYPE};
+    use crate::{config_identifier, sha256_content_digest};
+
+    let config_digest = sha256_content_digest(config_json);
+    let content_id = config_identifier(&config_digest);
+
+    let config_verity = if let Some(existing) = repo.has_stream(&content_id)? {
+        existing
+    } else {
+        let mut writer = repo.create_stream(OCI_CONFIG_CONTENT_TYPE)?;
+
+        for (diff_id, verity) in layer_refs {
+            let key: &str = diff_id.as_ref();
+            writer.add_named_stream_ref(key, verity);
+        }
+
+        writer.write_external(config_json)?;
+        repo.write_stream(writer, &content_id, None)?
+    };
+
+    let manifest_digest = sha256_content_digest(manifest_json);
+
+    let manifest_content_id = manifest_identifier(&manifest_digest);
+    let manifest_verity = if let Some(existing) = repo.has_stream(&manifest_content_id)? {
+        existing
+    } else {
+        let mut writer = repo.create_stream(OCI_MANIFEST_CONTENT_TYPE)?;
+
+        let config_ref_key = format!("config:{config_digest}");
+        writer.add_named_stream_ref(&config_ref_key, &config_verity);
+
+        for (diff_id, verity) in layer_refs {
+            let key: &str = diff_id.as_ref();
+            writer.add_named_stream_ref(key, verity);
+        }
+
+        writer.write_external(manifest_json)?;
+        repo.write_stream(writer, &manifest_content_id, None)?
+    };
+
+    // Generate the composefs EROFS image and tag the manifest.
+    // Skip if the image already has an EROFS ref (idempotent re-finalize).
+    let existing_erofs = crate::composefs_erofs_for_manifest(
+        repo,
+        &manifest_digest,
+        Some(&manifest_verity),
+        repo.erofs_version(),
+    )?;
+    if existing_erofs.is_none() {
+        let erofs = crate::ensure_oci_composefs_erofs(
+            repo,
+            &manifest_digest,
+            Some(&manifest_verity),
+            name,
+        )?;
+        if erofs.is_none() {
+            // Not a container image (e.g. an artifact) — tag directly.
+            if let Some(n) = name {
+                crate::oci_image::tag_image(repo, &manifest_digest, n)?;
+            }
+        }
+    } else if let Some(n) = name {
+        crate::oci_image::tag_image(repo, &manifest_digest, n)?;
+    }
+
+    // Re-read verities: ensure_oci_composefs_erofs rewrites config and
+    // manifest splitstreams (adding the EROFS ref), so the verities captured
+    // above may be stale.
+    let config_verity = repo
+        .has_stream(&content_id)?
+        .context("config splitstream missing after finalization")?;
+    let manifest_verity = repo
+        .has_stream(&manifest_content_id)?
+        .context("manifest splitstream missing after finalization")?;
+
+    Ok((
+        (manifest_digest, manifest_verity),
+        (config_digest, config_verity),
+    ))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::io::Write as _;
+
+    use composefs::fsverity::Sha256HashValue;
+    use composefs::repository::RepositoryConfig;
+    use composefs_splitdirfdstream::reconstruct;
+
+    /// The pure inline-vs-external predicate, tested exhaustively around the
+    /// boundary.  This is the exact expression branched on inside
+    /// [`process_file_content`], so locking it down here guards the decision
+    /// independently of the (heavier) end-to-end test below.
+    #[test]
+    fn test_should_inline_boundary() {
+        assert!(should_inline(0), "size 0 should be inlined");
+        assert!(should_inline(1), "size 1 should be inlined");
+        assert!(
+            should_inline(INLINE_CONTENT_MAX_V0 as u64),
+            "size == INLINE_CONTENT_MAX_V0 ({INLINE_CONTENT_MAX_V0}) should be inlined"
+        );
+        assert!(
+            !should_inline(INLINE_CONTENT_MAX_V0 as u64 + 1),
+            "size INLINE_CONTENT_MAX_V0+1 ({}) should NOT be inlined",
+            INLINE_CONTENT_MAX_V0 + 1
+        );
+        for size in [128u64, 4096, 65536, 1024 * 1024] {
+            assert!(
+                !should_inline(size),
+                "size {size} should NOT be inlined (well above threshold)"
+            );
+        }
+    }
+
+    /// Create an insecure (no fs-verity) repository in a fresh tempdir.
+    ///
+    /// Returns the repo alongside the `TempDir`, which the caller must keep
+    /// alive for the duration of the test.
+    fn create_test_repo() -> (Arc<Repository<Sha256HashValue>>, tempfile::TempDir) {
+        let tempdir = tempfile::TempDir::new().unwrap();
+        let (repo, _) = Repository::init_path(
+            rustix::fs::CWD,
+            &tempdir.path().join("repo"),
+            RepositoryConfig::default().set_insecure(),
+        )
+        .unwrap();
+        (Arc::new(repo), tempdir)
+    }
+
+    /// A file fd backed by `len` bytes of deterministic content.
+    fn tmpfile_of(len: usize) -> OwnedFd {
+        let mut f = tempfile::tempfile().unwrap();
+        let data: Vec<u8> = (0..len).map(|i| (i % 251) as u8).collect();
+        f.write_all(&data).unwrap();
+        f.into()
+    }
+
+    /// Data-driven end-to-end check of `process_file_content`'s storage
+    /// decision: small files land inline (no object written, `bytes_inlined`
+    /// grows), large files become external objects (`objects_*` grows,
+    /// `bytes_inlined` unchanged).
+    #[test]
+    fn test_process_file_content_inline_vs_external() {
+        let (repo, _tempdir) = create_test_repo();
+
+        let cases = [
+            (0usize, false),
+            (1, false),
+            (INLINE_CONTENT_MAX_V0, false),
+            (INLINE_CONTENT_MAX_V0 + 1, true),
+            (4096, true),
+            (256 * 1024, true),
+        ];
+
+        for (size, expect_external) in cases {
+            let mut writer = repo.create_stream(TAR_LAYER_CONTENT_TYPE).unwrap();
+            let mut stats = ImportStats::default();
+            let mut ctx = ImportContext::default();
+            let mut inline_buf = Vec::new();
+
+            let before_inlined = stats.bytes_inlined;
+            process_file_content(
+                &repo,
+                &mut writer,
+                &mut stats,
+                &mut ctx,
+                tmpfile_of(size),
+                size as u64,
+                "test-file",
+                false,
+                &mut inline_buf,
+            )
+            .unwrap();
+
+            let objects_written = stats.objects_reflinked
+                + stats.objects_hardlinked
+                + stats.objects_copied
+                + stats.objects_already_present;
+
+            if expect_external {
+                assert_eq!(
+                    stats.bytes_inlined, before_inlined,
+                    "size {size}: external file must not change bytes_inlined"
+                );
+                assert_eq!(
+                    objects_written, 1,
+                    "size {size}: exactly one external object expected"
+                );
+            } else {
+                assert_eq!(
+                    stats.bytes_inlined,
+                    before_inlined + size as u64,
+                    "size {size}: inline file must add its bytes to bytes_inlined"
+                );
+                assert_eq!(
+                    objects_written, 0,
+                    "size {size}: inline file must not write an object"
+                );
+            }
+        }
+    }
+
+    // -------------------------------------------------------------------------
+    // Helpers for the produce->reconstruct==cat round-trip tests
+    // -------------------------------------------------------------------------
+
+    /// Build a tar layer where each file has the specified size (bytes).
+    ///
+    /// File names are derived from the size so each case is identifiable
+    /// in assertion output.  Content is deterministic: repeating bytes
+    /// `i % 251`.
+    fn build_tar_layer(file_sizes: &[usize]) -> Vec<u8> {
+        let mut builder = ::tar::Builder::new(vec![]);
+        for &size in file_sizes {
+            let content: Vec<u8> = (0..size).map(|i| (i % 251) as u8).collect();
+            let mut header = ::tar::Header::new_ustar();
+            header.set_uid(0);
+            header.set_gid(0);
+            header.set_mode(0o644);
+            header.set_entry_type(::tar::EntryType::Regular);
+            header.set_size(size as u64);
+            builder
+                .append_data(
+                    &mut header,
+                    format!("file_{size}_{:08x}", size),
+                    &content[..],
+                )
+                .unwrap();
+        }
+        builder.into_inner().unwrap()
+    }
+
+    /// Assert that produce -> reconstruct yields the same bytes as `cat`.
+    ///
+    /// This is the determinism contract: given a layer already imported into
+    /// `repo` with verity hash `verity`, the two paths must be identical.
+    async fn assert_produce_eq_cat(
+        repo: &Arc<Repository<Sha256HashValue>>,
+        verity: &Sha256HashValue,
+    ) {
+        use std::os::fd::AsFd as _;
+        // --- expected: what cat() produces ---
+        let mut expected = Vec::<u8>::new();
+        let mut reader = repo
+            .open_stream("", Some(verity), Some(TAR_LAYER_CONTENT_TYPE))
+            .expect("open_stream for cat");
+        reader
+            .cat(repo, &mut expected)
+            .expect("cat on layer splitstream");
+
+        // --- actual: produce -> reconstruct ---
+        // For the single-repo test path the objects dir is always at index 0
+        // (one real dir, seed-determined slot may vary, but for this helper we
+        // just use dirfd_index=0 directly to keep the test simple).
+        let mut stream_buf = Vec::<u8>::new();
+        produce_layer_splitdirfdstream(repo, verity, 0, &mut stream_buf)
+            .expect("produce_layer_splitdirfdstream");
+
+        let objects_dir_fd = repo.objects_dir().expect("objects_dir");
+        let dirfds = [objects_dir_fd.as_fd()];
+        let mut actual = Vec::<u8>::new();
+        reconstruct(stream_buf.as_slice(), &dirfds, &mut actual)
+            .expect("reconstruct splitdirfdstream");
+
+        similar_asserts::assert_eq!(
+            actual,
+            expected,
+            "produce->reconstruct must equal cat for verity={verity:?}"
+        );
+    }
+
+    /// Data-driven round-trip test: produce -> reconstruct == cat for each
+    /// layer shape.
+    ///
+    /// The test cases cover:
+    /// - all-inline (only files <= INLINE_CONTENT_MAX_V0)
+    /// - only-external (one file larger than INLINE_CONTENT_MAX_V0)
+    /// - mixed (various sizes including crossing the threshold)
+    #[tokio::test]
+    async fn test_produce_reconstruct_eq_cat() {
+        let (repo, _tempdir) = create_test_repo();
+
+        // (label, file sizes in bytes)
+        let cases: &[(&str, &[usize])] = &[
+            // All inline: empty layer (no files)
+            ("empty", &[]),
+            // All inline: only small files (all <= INLINE_CONTENT_MAX_V0 = 64)
+            ("all_inline", &[0, 1, 10, 64]),
+            // Single external object (> 64 bytes)
+            ("single_external", &[65]),
+            // Larger external objects
+            ("large_external", &[4096, 200_000]),
+            // Mixed: small + large + various sizes
+            ("mixed", &[0, 10, 64, 65, 4096, 200_000]),
+        ];
+
+        for (label, sizes) in cases {
+            let tar_bytes = build_tar_layer(sizes);
+            let diff_id = crate::sha256_content_digest(&tar_bytes);
+            let (verity, _stats) = crate::import_layer(&repo, &diff_id, None, tar_bytes.as_slice())
+                .await
+                .unwrap_or_else(|e| panic!("import_layer failed for {label}: {e}"));
+
+            // Run the determinism check for this layer shape.
+            assert_produce_eq_cat(&repo, &verity).await;
+        }
+    }
+
+    // -------------------------------------------------------------------------
+    // Tests for drain_splitdirfdstream_verified
+    // -------------------------------------------------------------------------
+
+    /// Produce a splitdirfdstream from `repo_a` for `verity` into a pipe,
+    /// returning the pipe read end, repo_a's objects dir fd, and a join handle
+    /// for the producer task.
+    ///
+    /// The producer runs on a `spawn_blocking` task so the pipe can fill
+    /// concurrently with the consumer. The caller MUST await the returned
+    /// handle (after draining the pipe, to avoid a full-pipe deadlock) and
+    /// assert the producer succeeded — the task is tracked, not detached, so
+    /// producer errors are surfaced rather than swallowed.
+    fn produce_to_pipe(
+        repo_a: Arc<Repository<Sha256HashValue>>,
+        verity: Sha256HashValue,
+    ) -> (
+        OwnedFd,
+        Vec<OwnedFd>,
+        tokio::task::JoinHandle<anyhow::Result<()>>,
+    ) {
+        use std::os::fd::AsFd as _;
+
+        let (pipe_read, pipe_write) =
+            rustix::pipe::pipe_with(rustix::pipe::PipeFlags::CLOEXEC).expect("pipe");
+
+        let objects_dir = repo_a.objects_dir().expect("objects_dir");
+        let objects_owned = rustix::io::dup(objects_dir.as_fd()).expect("dup objects_dir");
+
+        let handle = tokio::task::spawn_blocking(move || {
+            let wf = std::fs::File::from(pipe_write);
+            // dirfd_index=0 for single-repo tests (objects dir at slot 0).
+            produce_layer_splitdirfdstream(&repo_a, &verity, 0, wf)
+        });
+
+        (pipe_read, vec![objects_owned], handle)
+    }
+
+    /// Await a producer handle from [`produce_to_pipe`] and assert success.
+    async fn join_producer(handle: tokio::task::JoinHandle<anyhow::Result<()>>) {
+        handle
+            .await
+            .expect("producer task panicked")
+            .expect("producer must succeed");
+    }
+
+    /// Await a producer handle without asserting its outcome.
+    ///
+    /// Used when the consumer rejects the stream early (before reading it):
+    /// the producer then sees a broken pipe and errors, which is expected. We
+    /// still join it so no task is left untracked.
+    async fn drain_producer(handle: tokio::task::JoinHandle<anyhow::Result<()>>) {
+        let _ = handle.await.expect("producer task panicked");
+    }
+
+    /// Positive test: produce → verified drain with the CORRECT diff_id.
+    ///
+    /// repo_a imports a layer (with a large external object), then produces
+    /// it as a splitdirfdstream into repo_b via `drain_splitdirfdstream_verified`.
+    /// Asserts:
+    /// - the call succeeds,
+    /// - repo_b now has the layer stream committed.
+    #[tokio::test]
+    async fn test_verified_drain_correct_diff_id() {
+        let (repo_a, _td_a) = create_test_repo();
+        let (repo_b, _td_b) = create_test_repo();
+
+        // Build a layer with both inline and external content.
+        let tar_bytes = build_tar_layer(&[10, 128 * 1024]); // 10 B inline + 128 KiB external
+        let diff_id = crate::sha256_content_digest(&tar_bytes);
+
+        let (verity_a, _) = crate::import_layer(&repo_a, &diff_id, None, tar_bytes.as_slice())
+            .await
+            .expect("import_layer into repo_a");
+
+        let (pipe_read, dir_fds, producer) = produce_to_pipe(repo_a, verity_a.clone());
+
+        let repo_b_clone = repo_b.clone();
+        let diff_id_clone = diff_id.clone();
+        let result = tokio::task::spawn_blocking(move || {
+            drain_splitdirfdstream_verified(
+                repo_b_clone,
+                pipe_read,
+                dir_fds,
+                &diff_id_clone,
+                false,
+                composefs::repository::ImportContext::default(),
+            )
+        })
+        .await
+        .expect("spawn_blocking");
+
+        let (verity_b, _stats, _ctx) = result.expect("verified drain must succeed");
+
+        // Producer must have completed cleanly (drain succeeded, so it did).
+        join_producer(producer).await;
+
+        // The layer must now be committed in repo_b.
+        let content_id = crate::layer_content_id(&diff_id);
+        assert!(
+            repo_b
+                .has_stream(&content_id)
+                .expect("has_stream")
+                .is_some(),
+            "repo_b must have the layer stream after verified drain"
+        );
+
+        // Both repos resolved to the same verity hash.
+        assert_eq!(
+            verity_a, verity_b,
+            "verity hash must be identical across repos"
+        );
+    }
+
+    /// Negative test: produce → verified drain with a WRONG diff_id.
+    ///
+    /// The drain must return `DiffIdMismatch` and repo_b must NOT have the
+    /// stream committed.
+    #[tokio::test]
+    async fn test_verified_drain_wrong_diff_id() {
+        let (repo_a, _td_a) = create_test_repo();
+        let (repo_b, _td_b) = create_test_repo();
+
+        let tar_bytes = build_tar_layer(&[10, 128 * 1024]);
+        let correct_diff_id = crate::sha256_content_digest(&tar_bytes);
+
+        let (verity_a, _) =
+            crate::import_layer(&repo_a, &correct_diff_id, None, tar_bytes.as_slice())
+                .await
+                .expect("import_layer into repo_a");
+
+        // Construct a diff_id that is a valid sha256 digest but definitely wrong.
+        let wrong_diff_id: crate::OciDigest =
+            "sha256:0000000000000000000000000000000000000000000000000000000000000000"
+                .parse()
+                .unwrap();
+
+        let (pipe_read, dir_fds, producer) = produce_to_pipe(repo_a, verity_a);
+
+        let repo_b_clone = repo_b.clone();
+        let wrong_diff_id_clone = wrong_diff_id.clone();
+        let result = tokio::task::spawn_blocking(move || {
+            drain_splitdirfdstream_verified(
+                repo_b_clone,
+                pipe_read,
+                dir_fds,
+                &wrong_diff_id_clone,
+                false,
+                composefs::repository::ImportContext::default(),
+            )
+        })
+        .await
+        .expect("spawn_blocking");
+
+        // The drain hashes the whole stream before rejecting, so it consumes
+        // the entire pipe and the producer completes cleanly.
+        join_producer(producer).await;
+
+        // Must fail with DiffIdMismatch.
+        match result {
+            Err(VerifiedDrainError::DiffIdMismatch { expected, actual }) => {
+                assert_eq!(
+                    expected,
+                    wrong_diff_id.to_string(),
+                    "expected field must be the wrong diff_id"
+                );
+                assert_eq!(
+                    actual,
+                    correct_diff_id.to_string(),
+                    "actual field must be the real content hash"
+                );
+            }
+            other => panic!("expected DiffIdMismatch, got {other:?}"),
+        }
+
+        // The stream must NOT be committed in repo_b.
+        let wrong_content_id = crate::layer_content_id(&wrong_diff_id);
+        assert!(
+            repo_b
+                .has_stream(&wrong_content_id)
+                .expect("has_stream")
+                .is_none(),
+            "repo_b must NOT have a committed stream for the wrong diff_id"
+        );
+    }
+
+    /// A non-sha256 diff_id must be rejected up front with a clear error
+    /// rather than producing a confusing hash "mismatch".
+    #[tokio::test]
+    async fn test_verified_drain_rejects_non_sha256() {
+        let (repo_a, _td_a) = create_test_repo();
+        let (repo_b, _td_b) = create_test_repo();
+
+        let tar_bytes = build_tar_layer(&[10, 128 * 1024]);
+        let diff_id = crate::sha256_content_digest(&tar_bytes);
+        let (verity_a, _) = crate::import_layer(&repo_a, &diff_id, None, tar_bytes.as_slice())
+            .await
+            .expect("import_layer into repo_a");
+
+        // A valid sha512 digest (64 bytes hex) — well-formed but unsupported.
+        let sha512_diff_id: crate::OciDigest = format!("sha512:{}", "0".repeat(128))
+            .parse()
+            .expect("valid sha512 digest");
+
+        let (pipe_read, dir_fds, producer) = produce_to_pipe(repo_a, verity_a);
+
+        let repo_b_clone = repo_b.clone();
+        let result = tokio::task::spawn_blocking(move || {
+            drain_splitdirfdstream_verified(
+                repo_b_clone,
+                pipe_read,
+                dir_fds,
+                &sha512_diff_id,
+                false,
+                composefs::repository::ImportContext::default(),
+            )
+        })
+        .await
+        .expect("spawn_blocking");
+
+        // The drain rejects before reading the pipe, so the producer may error
+        // with a broken pipe; join it regardless so it isn't left untracked.
+        drain_producer(producer).await;
+
+        match result {
+            Err(VerifiedDrainError::Other(e)) => {
+                let msg = format!("{e:#}");
+                assert!(
+                    msg.contains("sha256") && msg.contains("sha512"),
+                    "error should explain the algorithm restriction, got: {msg}"
+                );
+            }
+            other => panic!("expected Other(unsupported algorithm) error, got {other:?}"),
+        }
+    }
+
+    // -------------------------------------------------------------------------
+    // Tests for finalize_oci_image
+    // -------------------------------------------------------------------------
+
+    /// End-to-end test for `finalize_oci_image`:
+    ///
+    /// 1. Import 2 synthetic tar layers with valid OCI structure.
+    /// 2. Build matching config + manifest JSON.
+    /// 3. Call `finalize_oci_image`.
+    /// 4. Assert config/manifest splitstreams exist and EROFS was produced.
+    #[tokio::test]
+    async fn test_finalize_oci_image() {
+        let (repo, _tempdir) = create_test_repo();
+
+        // Import two layers: one all-inline (10 B), one external (128 KiB).
+        let tar1 = crate::test_util::build_oci_tar_layer(10);
+        let tar2 = crate::test_util::build_oci_tar_layer(128 * 1024);
+
+        let diff_id1 = crate::sha256_content_digest(&tar1);
+        let diff_id2 = crate::sha256_content_digest(&tar2);
+
+        let (verity1, _) = crate::import_layer(&repo, &diff_id1, None, tar1.as_slice())
+            .await
+            .expect("import layer 1");
+        let (verity2, _) = crate::import_layer(&repo, &diff_id2, None, tar2.as_slice())
+            .await
+            .expect("import layer 2");
+
+        let diff_ids = vec![diff_id1.to_string(), diff_id2.to_string()];
+        let config_json = crate::test_util::make_config_json(&diff_ids);
+        let config_digest = crate::sha256_content_digest(&config_json);
+        let manifest_json =
+            crate::test_util::make_manifest_json(&config_json, config_digest.as_ref(), &diff_ids);
+
+        let layer_refs = vec![(diff_id1.clone(), verity1), (diff_id2.clone(), verity2)];
+
+        let ((manifest_digest, manifest_verity), (out_config_digest, config_verity)) =
+            finalize_oci_image(
+                &repo,
+                &manifest_json,
+                &config_json,
+                &layer_refs,
+                Some("test:v1"),
+            )
+            .expect("finalize_oci_image");
+
+        // Digests must be non-empty.
+        assert!(!manifest_digest.to_string().is_empty());
+        assert!(!out_config_digest.to_string().is_empty());
+
+        // Splitstreams must exist.
+        use crate::oci_image::manifest_identifier;
+        let manifest_id = manifest_identifier(&manifest_digest);
+        let config_id = crate::config_identifier(&out_config_digest);
+
+        assert!(
+            repo.has_stream(&manifest_id)
+                .expect("has_stream manifest")
+                .is_some(),
+            "manifest splitstream must exist"
+        );
+        assert!(
+            repo.has_stream(&config_id)
+                .expect("has_stream config")
+                .is_some(),
+            "config splitstream must exist"
+        );
+
+        // Verify the returned verities match what's stored.
+        let stored_manifest_verity = repo
+            .has_stream(&manifest_id)
+            .unwrap()
+            .expect("manifest verity must be stored");
+        assert_eq!(
+            manifest_verity, stored_manifest_verity,
+            "returned manifest_verity must match stored"
+        );
+        let stored_config_verity = repo
+            .has_stream(&config_id)
+            .unwrap()
+            .expect("config verity must be stored");
+        assert_eq!(
+            config_verity, stored_config_verity,
+            "returned config_verity must match stored"
+        );
+
+        // EROFS must have been generated for this container image.
+        let erofs = crate::composefs_erofs_for_manifest(
+            &repo,
+            &manifest_digest,
+            Some(&manifest_verity),
+            repo.erofs_version(),
+        )
+        .expect("composefs_erofs_for_manifest");
+        assert!(
+            erofs.is_some(),
+            "EROFS image must exist after finalize_oci_image for a container image"
+        );
+
+        // Idempotency: calling again must succeed and return the same digests.
+        let ((md2, _mv2), (cd2, _cv2)) = finalize_oci_image(
+            &repo,
+            &manifest_json,
+            &config_json,
+            &layer_refs,
+            Some("test:v1"),
+        )
+        .expect("finalize_oci_image idempotent");
+        assert_eq!(manifest_digest, md2, "idempotent call: manifest_digest");
+        assert_eq!(out_config_digest, cd2, "idempotent call: config_digest");
+    }
+}
diff --git a/crates/composefs-oci/src/layer_transport.rs b/crates/composefs-oci/src/layer_transport.rs
new file mode 100644
index 00000000..51f744ea
--- /dev/null
+++ b/crates/composefs-oci/src/layer_transport.rs
@@ -0,0 +1,218 @@
+//! Abstraction layer for the OCI `GetLayer` producer side.
+//!
+//! Defines the [`LayerSource`] trait that decouples the fd-assembly and
+//! framing mechanics from the specific backing store (composefs repo today,
+//! containers-storage in a later step).  Both the in-process `copy` path and
+//! the varlink `GetLayer` handler use this trait.
+//!
+//! The shared entry point is [`serve_get_layer`], which:
+//! 1. Calls [`LayerSource::open`] to get real dirfds and a lifetime guard.
+//! 2. Calls [`composefs_splitdirfdstream::build_layer_fd_layout`] to place them
+//!    in the sparse layout, add dummies, keepalive pipe and extra lifetime fds.
+//! 3. Spawns the 3-phase self-reaping producer via
+//!    [`composefs_splitdirfdstream::spawn_self_reaping_producer`].
+//! 4. Splits the fd array into transport frames via
+//!    [`composefs_splitdirfdstream::split_fds_into_frames`].
+//! 5. Returns the ready frame batches + `dir_count` for the caller to wrap in
+//!    zlink `Reply` frames (interface-specific error types stay in the callers).
+//!
+//! # Dependency note
+//!
+//! This module lives in `composefs-oci` rather than `composefs-splitdirfdstream`
+//! because the trait references `anyhow::Result` and repository types, while
+//! `composefs-splitdirfdstream` must stay dependency-free of those crates.
+//! The cstor service (`composefs-storage`) CANNOT call `serve_get_layer`
+//! (it would create a dep cycle), so it calls `build_layer_fd_layout` +
+//! `spawn_self_reaping_producer` directly.
+
+use std::io::Write;
+use std::os::fd::{AsFd as _, OwnedFd};
+use std::sync::Arc;
+
+use anyhow::{Context as _, Result};
+
+use composefs::fsverity::FsVerityHashValue;
+use composefs::repository::Repository;
+use composefs_splitdirfdstream::{
+    FdLimitError, build_layer_fd_layout, spawn_self_reaping_producer, split_fds_into_frames,
+};
+
+use crate::layer_sync::produce_layer_splitdirfdstream;
+
+// ── LayerSource trait ─────────────────────────────────────────────────────────
+
+/// A producer that can open the real diff-directory file descriptors for one
+/// layer and stream the layer content as a `splitdirfdstream`.
+///
+/// Implementors supply two things:
+/// 1. The set of real diff-directory fds (in chain order) **plus an opaque
+///    lifetime guard** that must be held open until the consumer signals
+///    completion (keepalive-pipe EOF).  For the repo case the guard is `()`;
+///    for containers-storage the guard is the shared `LayerStoreLock`.
+/// 2. A function that, given the slot-index assignments from the sparse layout,
+///    produces the `splitdirfdstream` bytes into a writer.
+///
+/// The trait is object-safe so it can be stored in `Box<dyn LayerSource>`.
+pub trait LayerSource: Send + 'static {
+    /// Open the real diff-directory file descriptors for this layer, in chain order,
+    /// AND acquire any lifetime-bound resources (e.g. a shared layer-store lock).
+    ///
+    /// Returns `(dirfds, guard)` where:
+    /// - `dirfds` — the real diff-dir fds, one per chain layer, in chain order.
+    /// - `guard` — an opaque object that **must be held open** until the
+    ///   consumer finishes processing (keepalive-pipe EOF). Dropping the guard
+    ///   signals that the layer's storage may be released.  For the repo source
+    ///   this is `Box::new(())` (no-op).
+    ///
+    /// The lock is acquired BEFORE the fds are opened so that the lock →
+    /// open sequence is atomic (avoids a TOCTOU race where a concurrent
+    /// `podman rmi` could unlink a diff directory between lock and open).
+    ///
+    /// Called once per `GetLayer` request, before the sparse layout is built.
+    fn open(&self) -> Result<(Vec<OwnedFd>, Box<dyn Send + 'static>)>;
+
+    /// Write the layer content as a `splitdirfdstream` to `out`.
+    ///
+    /// `dirfd_index_map[i]` is the slot index within the sparse dirfds region
+    /// where the `i`-th real diff-dir fd was placed.  The producer must use
+    /// `write_file_backed_data(dirfd_index_map[i], ...)` for every file that
+    /// belongs to chain layer `i`.
+    ///
+    /// This method is called from a `spawn_blocking` context so it MAY block.
+    fn produce(&self, dirfd_index_map: &[u32], out: Box<dyn Write + Send>) -> Result<()>;
+}
+
+// ── RepoLayerSource ───────────────────────────────────────────────────────────
+
+/// [`LayerSource`] implementation backed by a composefs [`Repository`].
+///
+/// The objects directory is the single real diff-dir (chain index 0).
+pub struct RepoLayerSource<ObjectID: FsVerityHashValue> {
+    /// The repository from which to serve the layer.
+    pub repo: Arc<Repository<ObjectID>>,
+    /// fs-verity hash of the layer splitstream to serve.
+    pub layer_verity: ObjectID,
+}
+
+impl<ObjectID> std::fmt::Debug for RepoLayerSource<ObjectID>
+where
+    ObjectID: FsVerityHashValue + std::fmt::Debug,
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("RepoLayerSource")
+            .field("layer_verity", &self.layer_verity)
+            .finish_non_exhaustive()
+    }
+}
+
+impl<ObjectID> LayerSource for RepoLayerSource<ObjectID>
+where
+    ObjectID: FsVerityHashValue,
+{
+    fn open(&self) -> Result<(Vec<OwnedFd>, Box<dyn Send + 'static>)> {
+        let objects_dir = self
+            .repo
+            .objects_dir()
+            .context("opening repository objects dir")?;
+        let dup = rustix::io::dup(objects_dir.as_fd())
+            .map_err(std::io::Error::from)
+            .context("dup objects_dir fd")?;
+        // No external lock needed for the repo source; the guard is a no-op.
+        Ok((vec![dup], Box::new(())))
+    }
+
+    fn produce(&self, dirfd_index_map: &[u32], out: Box<dyn Write + Send>) -> Result<()> {
+        // The objects dir is at chain layer index 0 → dirfd_index_map[0].
+        let objects_dirfd_index = dirfd_index_map
+            .first()
+            .copied()
+            .context("dirfd_index_map is empty: expected at least one real dir")?;
+        produce_layer_splitdirfdstream(&self.repo, &self.layer_verity, objects_dirfd_index, out)
+    }
+}
+
+// ── serve_get_layer ───────────────────────────────────────────────────────────
+
+/// Ready frame batches returned by [`serve_get_layer`].
+///
+/// The caller wraps these in interface-specific zlink `Reply` frames and
+/// yields them from the streaming `GetLayer` method.
+#[derive(Debug)]
+pub struct GetLayerFrames {
+    /// Number of dirfd slots in the sparse layout (`fds[1..=dir_count]`).
+    pub dir_count: u32,
+    /// FD batches in frame order; one inner `Vec<OwnedFd>` per transport frame.
+    pub batches: Vec<Vec<OwnedFd>>,
+}
+
+/// Error from [`serve_get_layer`].
+#[derive(Debug)]
+pub enum ServeGetLayerError {
+    /// `more=false` but total fd count exceeds `MAX_FDS_PER_FRAME`.
+    FdLimitExceeded(FdLimitError),
+    /// Any other I/O or source error.
+    Other(anyhow::Error),
+}
+
+impl From<anyhow::Error> for ServeGetLayerError {
+    fn from(e: anyhow::Error) -> Self {
+        ServeGetLayerError::Other(e)
+    }
+}
+
+/// Drive the full `GetLayer` flow for a [`LayerSource`] and return ready frame
+/// batches.
+///
+/// This is the single canonical implementation of the fd-layout + producer +
+/// framing logic for the repo `GetLayer` path.  The cstor `GetLayer` path in
+/// `composefs-storage` cannot call this function (dep cycle), so it calls
+/// `build_layer_fd_layout` and `spawn_self_reaping_producer` directly.
+/// Both paths share the same primitives from `composefs-splitdirfdstream`.
+///
+/// # Arguments
+/// * `source` — The layer source.  `source.open()` is called once; the returned
+///   guard is moved into the self-reaping producer task.
+/// * `seed` — Deterministic seed for sparse layout and frame count.
+/// * `more` — Whether the client requested multi-frame streaming (`more=true`)
+///   or a single-frame reply (`more=false`).
+///
+/// # Returns
+/// `Ok(GetLayerFrames)` — ready to wrap in zlink replies.
+/// `Err(ServeGetLayerError::FdLimitExceeded)` — total fd count exceeds
+///   `MAX_FDS_PER_FRAME`; retry with `more=true`.
+/// `Err(ServeGetLayerError::Other)` — source or I/O error.
+pub fn serve_get_layer(
+    source: impl LayerSource,
+    seed: u64,
+    more: bool,
+) -> std::result::Result<GetLayerFrames, ServeGetLayerError> {
+    // Open real dirfds + acquire lifetime guard.
+    let (real_fds, guard) = source.open()?;
+
+    // Create the data pipe.
+    let (pipe_read, pipe_write) = rustix::pipe::pipe_with(rustix::pipe::PipeFlags::CLOEXEC)
+        .map_err(|e| anyhow::anyhow!("pipe: {e}"))?;
+
+    // Build the sparse fd layout (dirfds region + keepalive + extra lifetime fds).
+    let layout = build_layer_fd_layout(pipe_read, real_fds, seed)
+        .map_err(|e| anyhow::anyhow!("build_layer_fd_layout: {e}"))?;
+
+    let dir_count = layout.dir_count;
+    let real_indices = layout.real_indices.clone();
+    let keepalive_read = layout.keepalive_read;
+
+    // Split into transport frames (enforces more=false single-frame cap).
+    let batches = split_fds_into_frames(layout.fds_all, seed, more)
+        .map_err(|(_fds, e)| ServeGetLayerError::FdLimitExceeded(e))?;
+
+    // Spawn the 3-phase self-reaping producer.
+    // `source` and `guard` are both moved into the closure.
+    // `guard` drops last (Phase 3), releasing any held lock.
+    spawn_self_reaping_producer(pipe_write, keepalive_read, guard, move |wf| {
+        if let Err(e) = source.produce(&real_indices, Box::new(wf)) {
+            tracing::warn!("GetLayer producer error: {e:#}");
+        }
+    });
+
+    Ok(GetLayerFrames { dir_count, batches })
+}
diff --git a/crates/composefs-oci/src/lib.rs b/crates/composefs-oci/src/lib.rs
index 0aefd575..f2de39a7 100644
--- a/crates/composefs-oci/src/lib.rs
+++ b/crates/composefs-oci/src/lib.rs
@@ -22,12 +22,19 @@ pub mod cstor;
 pub(crate) mod delta;
 pub mod image;
 pub mod layer;
+pub mod layer_sync;
+pub mod layer_transport;
 pub mod oci_image;
 pub mod oci_layout;
 /// Re-exported from [`composefs::progress`]; use that path directly in new code.
 pub mod progress;
 pub mod skopeo;
 pub mod tar;
+/// Shared wire types and client proxy for the `org.composefs.Oci` interface.
+///
+/// Available when the `varlink` feature is enabled.
+#[cfg(feature = "varlink")]
+pub mod varlink_types;
 
 /// Test utilities for building OCI images from dumpfile strings.
 #[cfg(any(test, feature = "test"))]
@@ -52,7 +59,7 @@ pub use containers_image_proxy::oci_spec::image::Digest as OciDigest;
 
 use containers_image_proxy::ImageProxyConfig;
 use containers_image_proxy::oci_spec::image::ImageConfiguration;
-use containers_image_proxy::oci_spec::image::{Descriptor, MediaType};
+use containers_image_proxy::oci_spec::image::{Descriptor, ImageManifest, MediaType};
 use sha2::{Digest, Sha256};
 
 use composefs::{
@@ -64,6 +71,14 @@ use composefs::{
 
 use crate::skopeo::{OCI_CONFIG_CONTENT_TYPE, TAR_LAYER_CONTENT_TYPE};
 
+/// The content-type tag used to identify OCI layer (tar) splitstreams in the
+/// repository.
+///
+/// Pass this to [`composefs::repository::Repository::open_stream`] when
+/// reading back a stored layer splitstream by its fs-verity hash.  The value
+/// is the 8-byte ASCII string `"ocilayer"` encoded as a little-endian `u64`.
+pub const LAYER_CONTENT_TYPE: u64 = TAR_LAYER_CONTENT_TYPE;
+
 /// Named ref key for the V2 EROFS image derived from this OCI config.
 pub const IMAGE_REF_KEY: &str = "composefs.image";
 
@@ -343,6 +358,17 @@ pub(crate) fn layer_identifier(diff_id: &OciDigest) -> String {
     format!("oci-layer-{diff_id}")
 }
 
+/// Return the content-identifier string used to register a layer splitstream.
+///
+/// This is the key passed to [`composefs::repository::Repository::has_stream`]
+/// and [`composefs::repository::Repository::write_stream`] for a given OCI
+/// diff-id.  Callers outside this crate (e.g. the varlink layer-sync service)
+/// use this to look up whether a layer has been imported and to obtain its
+/// fs-verity hash.
+pub fn layer_content_id(diff_id: &OciDigest) -> String {
+    layer_identifier(diff_id)
+}
+
 pub(crate) fn config_identifier(config: &OciDigest) -> String {
     format!("oci-config-{config}")
 }
@@ -445,7 +471,11 @@ pub(crate) fn sha256_output_to_digest(output: sha2::digest::Output<Sha256>) -> O
 
 /// Compute the SHA-256 content digest of `bytes`, returning an OCI digest
 /// (e.g. `sha256:abcd...`).
-pub(crate) fn sha256_content_digest(bytes: &[u8]) -> OciDigest {
+///
+/// This is primarily used to derive an OCI diff-id from a tar layer's raw
+/// bytes (before any compression).  It is also used by tests that construct
+/// synthetic layers without going through the full OCI pull path.
+pub fn sha256_content_digest(bytes: &[u8]) -> OciDigest {
     let mut context = Sha256::new();
     context.update(bytes);
     sha256_output_to_digest(context.finalize())
@@ -455,6 +485,25 @@ fn hash_sha256(bytes: &[u8]) -> OciDigest {
     sha256_content_digest(bytes)
 }
 
+/// Extract ordered layer identifiers from raw manifest and config JSON.
+///
+/// For standard container images (`ImageConfig` media type), parses the
+/// config and returns `rootfs.diff_ids`.  For OCI artifacts with
+/// non-standard config types, falls back to the manifest's layer digests.
+///
+/// This is the high-level entry point for callers that have raw JSON
+/// strings (e.g. from a varlink `Inspect` reply).
+pub fn extract_layer_ids(manifest_json: &str, config_json: &str) -> Result<Vec<String>> {
+    let manifest: ImageManifest =
+        serde_json::from_str(manifest_json).context("parsing manifest JSON")?;
+    extract_diff_ids(
+        manifest.config().media_type(),
+        config_json.as_bytes(),
+        manifest.layers(),
+    )
+    .map(|ids| ids.iter().map(|d| d.to_string()).collect())
+}
+
 /// Extract ordered diff_ids from a config descriptor.
 ///
 /// For standard container images (ImageConfig media type), parses the
@@ -464,7 +513,7 @@ fn hash_sha256(bytes: &[u8]) -> OciDigest {
 /// Note: oci-spec models diff_ids as `Vec<String>` but they are actually
 /// OCI content digests.  We parse them here so the rest of the codebase
 /// can work with the strongly-typed `Digest`.
-pub(crate) fn extract_diff_ids(
+pub fn extract_diff_ids(
     media_type: &MediaType,
     config_reader: impl Read,
     manifest_layers: &[Descriptor],
@@ -754,7 +803,7 @@ pub fn write_config_raw<ObjectID: FsVerityHashValue>(
 /// and are collected by the next GC.
 ///
 /// Returns the EROFS image's ObjectID (fs-verity digest).
-fn ensure_oci_composefs_erofs<ObjectID: FsVerityHashValue>(
+pub(crate) fn ensure_oci_composefs_erofs<ObjectID: FsVerityHashValue>(
     repo: &Arc<Repository<ObjectID>>,
     manifest_digest: &OciDigest,
     manifest_verity: Option<&ObjectID>,
diff --git a/crates/composefs-oci/src/test_util.rs b/crates/composefs-oci/src/test_util.rs
index b7558a7a..73afaaef 100644
--- a/crates/composefs-oci/src/test_util.rs
+++ b/crates/composefs-oci/src/test_util.rs
@@ -194,6 +194,146 @@ pub fn dumpfile_to_tar(dumpfile: &str) -> Vec<u8> {
     builder.into_inner().unwrap()
 }
 
+// ============================================================================
+// Low-level tar/config/manifest builders (used by layer_sync tests and
+// integration tests that need to synthesize OCI images from scratch)
+// ============================================================================
+
+/// Build a minimal OCI-compatible tar layer with standard directory entries
+/// and a single regular file whose `payload_size` bytes of content are
+/// deterministic (byte `i` is `(i % 251) as u8`).
+///
+/// The archive contains:
+/// - `./`  — root directory
+/// - `./usr/`  — usr directory
+/// - `./usr/share/`  — usr/share directory
+/// - `./usr/share/data_<payload_size>` — regular file with `payload_size` bytes
+///
+/// Using `payload_size > 64` (the inline threshold) will produce an external
+/// object when the layer is imported into a repository.
+pub fn build_oci_tar_layer(payload_size: usize) -> Vec<u8> {
+    let mut builder = ::tar::Builder::new(vec![]);
+
+    // Root directory.
+    let mut root_hdr = ::tar::Header::new_ustar();
+    root_hdr.set_entry_type(::tar::EntryType::Directory);
+    root_hdr.set_uid(0);
+    root_hdr.set_gid(0);
+    root_hdr.set_mode(0o755);
+    root_hdr.set_size(0);
+    builder
+        .append_data(&mut root_hdr, "./", std::io::empty())
+        .unwrap();
+
+    // usr/ directory.
+    let mut usr_hdr = ::tar::Header::new_ustar();
+    usr_hdr.set_entry_type(::tar::EntryType::Directory);
+    usr_hdr.set_uid(0);
+    usr_hdr.set_gid(0);
+    usr_hdr.set_mode(0o755);
+    usr_hdr.set_size(0);
+    builder
+        .append_data(&mut usr_hdr, "./usr/", std::io::empty())
+        .unwrap();
+
+    // usr/share/ directory.
+    let mut share_hdr = ::tar::Header::new_ustar();
+    share_hdr.set_entry_type(::tar::EntryType::Directory);
+    share_hdr.set_uid(0);
+    share_hdr.set_gid(0);
+    share_hdr.set_mode(0o755);
+    share_hdr.set_size(0);
+    builder
+        .append_data(&mut share_hdr, "./usr/share/", std::io::empty())
+        .unwrap();
+
+    // A data file with deterministic content.
+    let content: Vec<u8> = (0..payload_size).map(|i| (i % 251) as u8).collect();
+    let mut file_hdr = ::tar::Header::new_ustar();
+    file_hdr.set_entry_type(::tar::EntryType::Regular);
+    file_hdr.set_uid(0);
+    file_hdr.set_gid(0);
+    file_hdr.set_mode(0o644);
+    file_hdr.set_size(payload_size as u64);
+    builder
+        .append_data(
+            &mut file_hdr,
+            format!("./usr/share/data_{payload_size}"),
+            content.as_slice(),
+        )
+        .unwrap();
+
+    builder.into_inner().unwrap()
+}
+
+/// Build a minimal valid OCI `ImageConfiguration` JSON whose
+/// `rootfs.diff_ids` list matches `diff_ids`.
+///
+/// The produced JSON is recognized as a container image
+/// (`MediaType::ImageConfig`) so that `finalize_oci_image` / `OciImage::open`
+/// will generate a composefs EROFS from it.
+pub fn make_config_json(diff_ids: &[String]) -> Vec<u8> {
+    let rootfs = RootFsBuilder::default()
+        .typ("layers")
+        .diff_ids(diff_ids.to_vec())
+        .build()
+        .unwrap();
+
+    let cfg = ConfigBuilder::default().build().unwrap();
+
+    let config = ImageConfigurationBuilder::default()
+        .architecture("amd64")
+        .os("linux")
+        .rootfs(rootfs)
+        .config(cfg)
+        .build()
+        .unwrap();
+
+    config.to_string().unwrap().into_bytes()
+}
+
+/// Build a minimal valid OCI `ImageManifest` JSON that references
+/// `config_digest` as the config and has one layer descriptor per entry in
+/// `diff_ids`.
+///
+/// The layer descriptors use the diff_id as the compressed digest (valid for
+/// splitstream tests where no real compressed transport is used).
+pub fn make_manifest_json(
+    config_json: &[u8],
+    config_digest_str: &str,
+    diff_ids: &[String],
+) -> Vec<u8> {
+    let config_digest: OciDigest = config_digest_str.parse().unwrap();
+    let config_desc = DescriptorBuilder::default()
+        .media_type(MediaType::ImageConfig)
+        .digest(config_digest)
+        .size(config_json.len() as u64)
+        .build()
+        .unwrap();
+
+    let layer_descs: Vec<_> = diff_ids
+        .iter()
+        .map(|d| {
+            DescriptorBuilder::default()
+                .media_type(MediaType::ImageLayerGzip)
+                .digest(d.parse::<OciDigest>().unwrap())
+                .size(1u64)
+                .build()
+                .unwrap()
+        })
+        .collect();
+
+    let manifest = ImageManifestBuilder::default()
+        .schema_version(2u32)
+        .media_type(MediaType::ImageManifest)
+        .config(config_desc)
+        .layers(layer_descs)
+        .build()
+        .unwrap();
+
+    manifest.to_string().unwrap().into_bytes()
+}
+
 /// Return value from image creation helpers.
 #[allow(dead_code)]
 pub struct TestImage {
diff --git a/crates/composefs-oci/src/varlink_types.rs b/crates/composefs-oci/src/varlink_types.rs
new file mode 100644
index 00000000..d52ab2f2
--- /dev/null
+++ b/crates/composefs-oci/src/varlink_types.rs
@@ -0,0 +1,253 @@
+//! Shared wire types and the `OciProxy` client trait for the
+//! `org.composefs.Oci` varlink interface.
+//!
+//! These types are defined here (in `composefs-oci`) rather than
+//! `composefs-ctl` so that both the repo-side service (`CfsctlService` in
+//! composefs-ctl) and the containers-storage service (`CstorLayerService` in
+//! composefs-storage) can share the same proxy trait without creating a
+//! dependency cycle.
+//!
+//! `composefs-ctl` re-exports everything from this module; callers should
+//! prefer `composefs_oci::varlink_types` or the re-exports in
+//! `composefs_ctl::varlink::{layer_sync,oci::OciError,proxy::OciProxy}`.
+//!
+//! # Feature gate
+//!
+//! This module is compiled only when the `varlink` feature is enabled on
+//! `composefs-oci` (which pulls in `zlink`).
+
+#![allow(missing_docs)]
+
+use serde::{Deserialize, Serialize};
+
+// ── Locator for a layer inside containers-storage ────────────────────────────
+
+/// Locator for a layer that lives inside a containers-storage store.
+///
+/// Both fields are required when routing a `GetLayer` call to the
+/// `CstorLayerService`; the repo-side service ignores this field entirely.
+#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
+pub struct StorageLocator {
+    /// Absolute path to the containers-storage root directory
+    /// (e.g. `/var/lib/containers/storage`).
+    pub storage_path: String,
+    /// The layer ID within that storage root, as returned by
+    /// `storage_layer_ids()`.
+    pub layer_id: String,
+}
+
+/// Parameters for the `GetLayer` method of the `org.composefs.Oci` interface.
+///
+/// Exactly one of `diff_id` or `storage` must be set:
+/// - **Repo service** (`CfsctlService`): reads `diff_id`, errors if `None`.
+/// - **Cstor service** (`CstorLayerService`): reads `storage`, errors if `None`.
+///
+/// Both fields are `Option` for forward-compatibility: new locator kinds can
+/// be added in future without breaking old clients.
+#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
+pub struct GetLayerParams {
+    /// OCI diff-id (`sha256:…`) identifying the layer in a composefs repo.
+    pub diff_id: Option<String>,
+    /// Location of a specific layer inside a containers-storage store.
+    pub storage: Option<StorageLocator>,
+}
+
+// ── Reply types ───────────────────────────────────────────────────────────────
+
+/// Reply from `GetInfo`: capability tokens supported by this service.
+#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
+pub struct GetInfoReply {
+    /// Capability tokens advertised by this service instance.
+    ///
+    /// Currently only `"splitdirfdstream-v0"` is defined.  The cstor service
+    /// additionally reports `"source-containers-storage"` and `"read-only"`.
+    pub features: Vec<String>,
+}
+
+/// Reply from `HasLayer`: whether the layer is present in the repository.
+#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
+pub struct HasLayerReply {
+    /// Whether the layer splitstream for the given diff-id is present.
+    pub present: bool,
+    /// Hex-encoded fs-verity hash of the layer splitstream, if present.
+    pub layer_verity: Option<String>,
+}
+
+/// Reply from `GetLayer`: the number of diff-directory slots in the logical FD
+/// array.
+///
+/// `GetLayer` is a **streaming** method (`more`): it yields multiple frames,
+/// each carrying a batch of FDs.  The client MUST concatenate the FD batches
+/// from all frames (in arrival order) to reconstruct the full logical FD array:
+///
+/// - `fds[0]` — data pipe read end (carries the `splitdirfdstream` bytes).
+/// - `fds[1..=dir_count]` — the dirfds region (`dir_count` slots total).  The
+///   real objects-directory fd sits at a sparse, hash-determined index within
+///   this region; the remaining (gap) slots hold inert dummy fds that
+///   `reconstruct` never dereferences.
+/// - `fds[dir_count+1..]` — opaque lifetime FDs.  The client MUST hold every
+///   one of these open until it has finished reading and processing all dir
+///   fds, then close them all to signal completion to the server.
+#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
+pub struct GetLayerReply {
+    /// Number of diff-directory file descriptors in the full logical FD array
+    /// (i.e. `fds[1..=dir_count]` after concatenating all frames' batches).
+    pub dir_count: u32,
+}
+
+/// Reply from `PutLayer`: the verity hash of the imported layer, whether
+/// it was already present, and per-object transfer statistics.
+#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
+pub struct PutLayerReply {
+    /// Hex-encoded fs-verity hash of the committed layer splitstream.
+    pub layer_verity: String,
+    /// `true` if the layer was already present before this call.
+    pub already_present: bool,
+
+    /// Number of objects that were reflinked (FICLONE) into the destination.
+    #[serde(default)]
+    pub objects_reflinked: u64,
+    /// Number of objects hardlinked into the destination.
+    #[serde(default)]
+    pub objects_hardlinked: u64,
+    /// Number of objects byte-copied into the destination.
+    #[serde(default)]
+    pub objects_copied: u64,
+    /// Number of objects already present in the destination (skipped).
+    #[serde(default)]
+    pub objects_already_present: u64,
+}
+
+/// A single (diff_id, layer_verity) pair passed to `FinalizeImage`.
+#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
+pub struct LayerRef {
+    /// OCI diff-id of the layer (e.g. `"sha256:abcd..."`).
+    pub diff_id: String,
+    /// Hex-encoded fs-verity hash of the layer splitstream in the destination
+    /// repository, as returned by `PutLayer`.
+    pub layer_verity: String,
+}
+
+/// Reply from `FinalizeImage`.
+#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
+pub struct FinalizeImageReply {
+    /// OCI digest of the manifest.
+    pub manifest_digest: String,
+    /// Hex-encoded fs-verity hash of the manifest splitstream.
+    pub manifest_verity: String,
+    /// OCI digest of the config.
+    pub config_digest: String,
+    /// Hex-encoded fs-verity hash of the config splitstream.
+    pub config_verity: String,
+}
+
+// ── OciError ──────────────────────────────────────────────────────────────────
+
+/// Errors returned by the `org.composefs.Oci` interface.
+#[derive(Debug, zlink::ReplyError, zlink::introspect::ReplyError)]
+#[zlink(interface = "org.composefs.Oci")]
+pub enum OciError {
+    /// The repository could not be found or opened at the configured path.
+    RepoNotFound {
+        /// Description of the failure.
+        message: String,
+    },
+    /// The given handle does not refer to an open repository.
+    InvalidHandle {
+        /// The handle that was not found.
+        handle: u64,
+    },
+    /// The named OCI image/reference does not exist.
+    NoSuchImage {
+        /// The image reference that was not found.
+        image: String,
+    },
+    /// An unexpected internal error occurred while servicing the request.
+    InternalError {
+        /// Description of the failure.
+        message: String,
+    },
+    /// The requested layer (by diff-id) is not present in the repository.
+    NoSuchLayer {
+        /// The diff-id that was not found.
+        diff_id: String,
+    },
+    /// A supplied digest/diff-id string was malformed.
+    InvalidDigest {
+        /// Human-readable description of the parse failure.
+        message: String,
+    },
+    /// Received layer content did not hash to the declared diff-id.
+    DiffIdMismatch {
+        /// The diff_id that was declared by the client.
+        expected: String,
+        /// The sha256 digest of the data that was actually received.
+        actual: String,
+    },
+    /// The request was malformed (e.g. wrong fd count).
+    InvalidRequest {
+        /// Human-readable description of what was wrong.
+        message: String,
+    },
+    /// The total fd count exceeds the per-frame cap for a `more=false` call.
+    FdLimitExceeded {
+        /// Total number of fds that would be sent.
+        fd_count: u64,
+        /// The per-frame cap that was exceeded.
+        max_per_frame: u64,
+    },
+}
+
+// ── OciProxy trait ────────────────────────────────────────────────────────────
+
+/// Typed client proxy for the `org.composefs.Oci` varlink interface.
+///
+/// Both the composefs repo service and the containers-storage service expose
+/// this interface; this proxy trait can be used against either.
+#[zlink::proxy(interface = "org.composefs.Oci")]
+pub trait OciProxy {
+    /// Query capability tokens supported by the service.
+    async fn get_info(&mut self) -> zlink::Result<Result<GetInfoReply, OciError>>;
+
+    /// Check whether a layer is present in the repository.
+    async fn has_layer(
+        &mut self,
+        handle: u64,
+        diff_id: &str,
+    ) -> zlink::Result<Result<HasLayerReply, OciError>>;
+
+    /// Stream the layer as a `splitdirfdstream` with full hardened fd-transport
+    /// contract (sparse dirfds, keepalive, lifetime fds, multi-frame).
+    ///
+    /// `params.diff_id` is used by the repo service; `params.storage` is used
+    /// by the cstor service.
+    #[zlink(more, return_fds)]
+    async fn get_layer(
+        &mut self,
+        handle: u64,
+        params: GetLayerParams,
+    ) -> zlink::Result<
+        impl zlink::futures_util::Stream<
+            Item = zlink::Result<(Result<GetLayerReply, OciError>, Vec<std::os::fd::OwnedFd>)>,
+        >,
+    >;
+
+    /// Receive a layer as a `splitdirfdstream` from the client and import it.
+    async fn put_layer(
+        &mut self,
+        handle: u64,
+        diff_id: &str,
+        zerocopy: bool,
+        #[zlink(fds)] fds: Vec<std::os::fd::OwnedFd>,
+    ) -> zlink::Result<Result<PutLayerReply, OciError>>;
+
+    /// Finalize an OCI image after all layers have been imported.
+    async fn finalize_image(
+        &mut self,
+        handle: u64,
+        manifest_json: &str,
+        config_json: &str,
+        layers: Vec<LayerRef>,
+        name: Option<&str>,
+    ) -> zlink::Result<Result<FinalizeImageReply, OciError>>;
+}
diff --git a/crates/composefs-splitdirfdstream/Cargo.toml b/crates/composefs-splitdirfdstream/Cargo.toml
new file mode 100644
index 00000000..9f46172c
--- /dev/null
+++ b/crates/composefs-splitdirfdstream/Cargo.toml
@@ -0,0 +1,32 @@
+[package]
+name = "composefs-splitdirfdstream"
+description = "Binary format for serializing data with external dirfd-relative file references"
+keywords = ["splitdirfdstream", "fd", "serialization"]
+
+edition.workspace = true
+license.workspace = true
+# Crate-local README (the inherited workspace value resolves to the workspace
+# root README, not this crate's).
+readme = "README.md"
+repository.workspace = true
+rust-version.workspace = true
+version.workspace = true
+
+[features]
+## Enable `spawn_self_reaping_producer` (requires tokio).
+tokio = ["dep:tokio"]
+
+[dependencies]
+rand = { version = "0.10.0", default-features = false, features = ["alloc"] }
+rand_pcg = { version = "0.10.0", default-features = false }
+rustix = { version = "1.0.0", default-features = false, features = ["fs", "pipe", "std"] }
+sha2 = { version = "0.10", default-features = false, features = ["std"] }
+thiserror = { version = "2.0.0", default-features = false }
+tokio = { version = "1", default-features = false, features = ["rt"], optional = true }
+
+[dev-dependencies]
+proptest = "1"
+tempfile = { version = "3.8.0", default-features = false }
+
+[lints]
+workspace = true
diff --git a/crates/composefs-splitdirfdstream/README.md b/crates/composefs-splitdirfdstream/README.md
new file mode 100644
index 00000000..6fc48887
--- /dev/null
+++ b/crates/composefs-splitdirfdstream/README.md
@@ -0,0 +1,15 @@
+# composefs-splitdirfdstream
+
+A data format and IPC protocol for sending a binary stream across local
+processes via file descriptor passing (DBus, varlink, etc.). Designed for
+sending tar archives of container image layers that are unpacked into a storage
+system such as composefs or docker/podman `overlay` storage.
+
+See the [crate documentation](https://docs.rs/composefs-splitdirfdstream) (or
+run `cargo doc --open`) for the wire format, chunk layouts, limits, safety
+model, and API surface.
+
+## License
+
+Licensed under the same terms as the
+[composefs-rs](https://github.com/containers/composefs-rs) workspace.
diff --git a/crates/composefs-splitdirfdstream/src/lib.rs b/crates/composefs-splitdirfdstream/src/lib.rs
new file mode 100644
index 00000000..51b501d1
--- /dev/null
+++ b/crates/composefs-splitdirfdstream/src/lib.rs
@@ -0,0 +1,1763 @@
+//! A data format and IPC protocol for sending a binary stream across local
+//! processes via file descriptor passing (DBus, varlink, etc.).
+//!
+//! Designed for sending tar archives of container image layers that are unpacked
+//! into a storage system such as composefs or docker/podman `overlay` storage.
+//! More generally it is a mechanism for reassembling any byte stream from a mix
+//! of inline bytes and whole-file references, useful whenever the bulk of a
+//! stream already lives as files in a content store and copying that bulk inline
+//! would be wasteful.
+//!
+//! External content is identified by `(dirfd_index, filename)` pairs so the
+//! receiving side can `openat2` the files itself given a small out-of-band array
+//! of directory file descriptors. This avoids passing one open fd per external
+//! chunk (which would not scale to streams referencing thousands of files) and
+//! suits reconstructing a stream from a content store laid out on disk.
+//!
+//! # Format
+//!
+//! A splitdirfdstream is a sequence of chunks with no header or footer. Combined
+//! with an out-of-band, ordered array of directory file descriptors `dirfds[0..D]`,
+//! it reconstructs a byte stream by concatenating each chunk's contribution:
+//!
+//! - **Metadata chunk** — raw stream metadata (tar header/padding) carried verbatim.
+//! - **InlineData chunk** — file content transported inline (for non-world-readable
+//!   files the producer read through a privileged fd).
+//! - **FileBackedData chunk** — `content_length` bytes of a file, resolved via
+//!   `openat2(dirfds[dirfd_index], filename, RESOLVE_BENEATH)` and read from offset 0.
+//!
+//! All integers are little-endian. Each chunk begins with a single type byte:
+//!
+//! | Type byte | Chunk          | Remaining header                                                  | Body                         |
+//! |-----------|----------------|-------------------------------------------------------------------|------------------------------|
+//! | `0x00`    | Metadata       | `u32 LE` — body length                                            | `length` bytes               |
+//! | `0x01`    | InlineData     | `u32 LE` — body length                                            | `length` bytes               |
+//! | `0x02`    | FileBackedData | `u64 LE` content_length, `u32 LE` dirfd_index, `u32 LE` name_len | `name_len` bytes of filename |
+//!
+//! Any other type byte is a hard error ([`Error::UnknownChunkType`]).
+//!
+//! There is no in-band end-of-stream sentinel: the stream ends at clean EOF at
+//! the start of a type byte. A partial read anywhere inside a chunk is a
+//! truncation error ([`Error::Truncated`]). Because the format carries no length
+//! or checksum of the whole stream, callers that need end-to-end integrity should
+//! verify the reconstructed bytes against an expected size and digest out of band.
+//!
+//! # Chunk layouts
+//!
+//! ### Metadata
+//!
+//! ```text
+//! +--------+---------------+----------------------------+
+//! | 0x00   | length: u32LE | data: `length` raw bytes   |
+//! +--------+---------------+----------------------------+
+//! ```
+//!
+//! `data` (`length` bytes) is written verbatim to output (tar headers, padding,
+//! etc.). Empty writes are silently dropped by the writer; a zero-length Metadata
+//! chunk is never encoded. `length` is bounded by [`MAX_INLINE_CHUNK_SIZE`] (256 MiB).
+//!
+//! ### InlineData
+//!
+//! ```text
+//! +--------+---------------+----------------------------+
+//! | 0x01   | length: u32LE | data: `length` raw bytes   |
+//! +--------+---------------+----------------------------+
+//! ```
+//!
+//! `length` is both the byte count of the data that follows and the logical file
+//! size the consumer uses for its inline-vs-object storage decision. Unlike
+//! FileBackedData, no directory fd is involved; the producer has already read the
+//! content through its own privileged fd.
+//!
+//! A zero-length InlineData **is** written and round-trips correctly — a zero-byte
+//! non-world-readable file must still be transported. `length` is bounded by
+//! [`MAX_INLINE_CHUNK_SIZE`] (256 MiB), since the data is fully buffered in memory
+//! during transport.
+//!
+//! ### FileBackedData
+//!
+//! ```text
+//! +--------+---------------------+--------------------+-----------------+------------------------------+
+//! | 0x02   | content_len: u64LE  | dirfd_index: u32LE | name_len: u32LE | filename: name_len raw bytes |
+//! +--------+---------------------+--------------------+-----------------+------------------------------+
+//! ```
+//!
+//! The consumer reads exactly `content_len` bytes starting at offset 0 of the
+//! opened file. The explicit `content_len` makes the stream self-framing — entry
+//! boundaries never depend on trusting the backing file's size, which closes a
+//! TOCTOU window if the underlying store is mutated mid-read.
+//!
+//! A FileBackedData chunk always starts at offset 0: it references a whole file,
+//! not a byte range. This is deliberate — it lets every external reference reflink
+//! cleanly (`FICLONE`) when materialized on a CoW filesystem. There is no range
+//! variant.
+//!
+//! `filename` is a path relative to `dirfds[dirfd_index]`. It may contain `/` to
+//! traverse subdirectories within that root; it is not NUL-terminated and must not
+//! contain NUL.
+//!
+//! # Limits
+//!
+//! | Constant | Value | Purpose |
+//! |----------|-------|---------|
+//! | [`MAX_INLINE_CHUNK_SIZE`] | 256 MiB | Bounds memory for a single Metadata or InlineData chunk body. |
+//! | [`MAX_FILENAME_LEN`] | 4096 bytes | Bounds the FileBackedData filename length. |
+//!
+//! The reader rejects an out-of-range `dirfd_index`, and Metadata or InlineData
+//! chunks whose `length` exceeds `MAX_INLINE_CHUNK_SIZE`.
+//!
+//! # Safety
+//!
+//! The consuming side should always use `openat2(RESOLVE_BENEATH)` or equivalent.
+//! This crate uses `rustix::fs::openat2` with
+//! `RESOLVE_BENEATH | RESOLVE_NO_SYMLINKS | RESOLVE_NO_MAGICLINKS`, falling back
+//! to `openat(O_NOFOLLOW)` on kernels older than 5.6. The [`validate_filename`]
+//! function rejects `..` components, absolute paths, and embedded NUL bytes before
+//! any syscall is made.
+//!
+//! The reader performs only *structural* validation (framing, limits); it does
+//! **not** validate filename *content*. Callers that consume
+//! [`Chunk::FileBackedData`] `.filename` directly must call [`validate_filename`]
+//! (or use [`open_beneath`] / [`reconstruct`], which do).
+//!
+//! # Examples
+//!
+//! Write a stream, then inspect its chunk structure:
+//!
+//! ```
+//! use composefs_splitdirfdstream::{SplitdirfdstreamWriter, SplitdirfdstreamReader, Chunk};
+//!
+//! // Write a stream: inline glue bytes plus a reference to dirfds[0]/data/blob.
+//! let mut buffer = Vec::new();
+//! let mut writer = SplitdirfdstreamWriter::new(&mut buffer);
+//! writer.write_metadata(b"tar header bytes").unwrap();
+//! writer.write_file_backed_data(0, 5, b"data/blob").unwrap(); // 5 bytes from dirfds[0]/data/blob
+//! writer.write_metadata(b"tar padding").unwrap();
+//! writer.finish().unwrap();
+//!
+//! // Inspect the chunk structure.
+//! let mut reader = SplitdirfdstreamReader::new(buffer.as_slice());
+//! while let Some(chunk) = reader.next_chunk().unwrap() {
+//!     match chunk {
+//!         Chunk::Metadata(data) => { /* tar header/padding bytes */ }
+//!         Chunk::InlineData(data) => { /* inline file content (non-world-readable files) */ }
+//!         Chunk::FileBackedData { dirfd_index, length, filename } => { /* (dirfds[i], name, len) */ }
+//!     }
+//! }
+//! ```
+//!
+//! To reconstruct the full byte stream, supply the directory fds to
+//! [`reconstruct`], which resolves and splices each external chunk for you:
+//!
+//! ```no_run
+//! use std::os::fd::BorrowedFd;
+//! # fn demo(stream: &[u8], dir: BorrowedFd<'_>, out: &mut Vec<u8>) {
+//! let dirfds = [dir];
+//! let total = composefs_splitdirfdstream::reconstruct(stream, &dirfds, out).unwrap();
+//! # let _ = total;
+//! # }
+//! ```
+//!
+//! # API surface
+//!
+//! | Item | Role |
+//! |------|------|
+//! | [`SplitdirfdstreamWriter`] | Encode inline + external chunks into the wire format. |
+//! | [`SplitdirfdstreamReader`] | Iterate a stream as borrowed [`Chunk`]s. |
+//! | [`Chunk`] | `Metadata(&[u8])`, `InlineData(&[u8])`, or `FileBackedData { dirfd_index, length, filename }`. |
+//! | [`reconstruct`] | Reconstruct the full byte stream given the directory fds. |
+//! | [`open_beneath`] | Safely open one external file beneath a directory fd. |
+//! | [`validate_filename`] | The path-safety predicate (reused by the writer/consumer). |
+//!
+//! # See also
+//!
+//! This crate is only the stream format. A higher-level control channel —
+//! opening a source, negotiating capabilities, and handing over the stream fd
+//! plus directory fds over a socket via `SCM_RIGHTS` — is layered on top
+//! elsewhere (e.g. the `composefs-storage` layer-transfer service); it carries
+//! structured metadata only, with all binary content flowing through this format
+//! and the directory fds.
+
+// This is a library: emit diagnostics via the `log` crate (or return them),
+// never by writing to the process's stdout/stderr. Genuinely-intentional
+// exceptions carry a local `#[allow]` with justification. Test code is exempt.
+#![cfg_attr(not(test), deny(clippy::print_stdout, clippy::print_stderr))]
+
+use std::ffi::CString;
+use std::io::{Read, Write};
+use std::os::fd::{BorrowedFd, OwnedFd};
+
+use rustix::fs::{Mode, OFlags, ResolveFlags};
+use rustix::io::Errno;
+
+pub mod transport;
+#[cfg(feature = "tokio")]
+pub use transport::spawn_self_reaping_producer;
+pub use transport::{
+    FdLimitError, LayerFdLayout, MAX_FDS_PER_FRAME, build_layer_fd_layout, open_devnull,
+    seed_from_id, split_fds_into_frames,
+};
+
+/// Maximum size for an inline chunk (256 MiB).
+///
+/// This limit prevents denial-of-service attacks where a malicious stream
+/// could specify an extremely large inline chunk size, causing unbounded
+/// memory allocation.
+pub const MAX_INLINE_CHUNK_SIZE: usize = 256 * 1024 * 1024;
+
+/// Maximum length of an external filename in bytes.
+///
+/// Filenames longer than this are rejected by [`validate_filename`] and
+/// by the reader before any buffer is resized.
+pub const MAX_FILENAME_LEN: usize = 4096;
+
+/// Errors that can occur while reading or writing a splitdirfdstream.
+#[derive(Debug, thiserror::Error)]
+#[non_exhaustive]
+pub enum Error {
+    /// An underlying I/O error.
+    #[error(transparent)]
+    Io(#[from] std::io::Error),
+
+    /// The stream ended in the middle of a chunk.
+    #[error("truncated stream: expected {expected} more bytes in chunk")]
+    Truncated {
+        /// How many bytes were still expected when EOF was encountered.
+        ///
+        /// Semantics depend on *where* truncation occurred:
+        ///
+        /// - **Type byte**: `1` if EOF arrived after 0 bytes (but that is
+        ///   returned as `Ok(None)`; this only fires for 0 < n < expected).
+        /// - **Inline body or FileBackedData header/name**: the *full* declared
+        ///   size (i.e. the number passed to `read_exact`), because `read_exact`
+        ///   does not report how many bytes it successfully consumed before EOF.
+        expected: u64,
+    },
+
+    /// An inline chunk's size field exceeds [`MAX_INLINE_CHUNK_SIZE`].
+    #[error("inline chunk size {size} exceeds maximum {max}")]
+    InlineTooLarge {
+        /// The size read from the stream.
+        size: usize,
+        /// The maximum allowed size.
+        max: usize,
+    },
+
+    /// An unrecognised chunk type byte was encountered.
+    #[error("unknown chunk type byte 0x{0:02x}")]
+    UnknownChunkType(u8),
+
+    /// A filename exceeds [`MAX_FILENAME_LEN`] bytes.
+    #[error("filename length {len} exceeds maximum {max}")]
+    FilenameTooLong {
+        /// The length of the filename.
+        len: usize,
+        /// The maximum allowed length.
+        max: usize,
+    },
+
+    /// A filename failed validation.
+    #[error("invalid filename: {reason}")]
+    InvalidFilename {
+        /// Human-readable description of why the filename is invalid.
+        reason: &'static str,
+    },
+
+    /// A dirfd index in an external chunk was out of range.
+    #[error("dirfd index {index} out of range (have {count} dirfds)")]
+    DirfdIndexOutOfRange {
+        /// The index that was out of range.
+        index: u32,
+        /// The number of dirfds available.
+        count: usize,
+    },
+
+    /// An external file was shorter than declared in the stream.
+    #[error("external file shorter than declared length {declared} (got {actual})")]
+    ExternalTooShort {
+        /// The length declared in the stream.
+        declared: u64,
+        /// The number of bytes actually read before EOF.
+        actual: u64,
+    },
+}
+
+impl From<Errno> for Error {
+    fn from(e: Errno) -> Self {
+        Error::Io(e.into())
+    }
+}
+
+/// Convenience alias for `Result<T, Error>`.
+pub type Result<T> = std::result::Result<T, Error>;
+
+/// A chunk decoded from a splitdirfdstream.
+///
+/// Chunks carry either stream metadata (raw tar header/padding bytes),
+/// inline-transported file content (non-world-readable files the producer
+/// read through a privileged fd), or references to external files identified
+/// by a directory fd index and a relative filename.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum Chunk<'a> {
+    /// Stream metadata: raw tar header or padding bytes embedded directly in
+    /// the stream. Consumers write these verbatim into the output.
+    Metadata(&'a [u8]),
+
+    /// Inline-transported file content. The producer read these bytes through
+    /// a privileged fd; the consumer decides whether to store them inline
+    /// (≤ splitstream threshold) or as an external object based on `data.len()`.
+    InlineData(&'a [u8]),
+
+    /// Reference to an external file relative to one of the caller-supplied
+    /// directory file descriptors.
+    ///
+    /// The reader does **not** validate `filename` content — callers that
+    /// consume `filename` directly **must** call [`validate_filename`] or use
+    /// [`open_beneath`] / [`reconstruct`], which call it internally.
+    FileBackedData {
+        /// Index into the `dirfds` array supplied to [`reconstruct`].
+        dirfd_index: u32,
+        /// Number of bytes to read from the file, starting at offset 0.
+        length: u64,
+        /// Relative filename within `dirfds[dirfd_index]`.
+        filename: &'a [u8],
+    },
+}
+
+/// Writer for building a splitdirfdstream.
+///
+/// Encodes inline data and dirfd-relative file references into the
+/// splitdirfdstream binary format.
+///
+/// # Example
+///
+/// ```
+/// use composefs_splitdirfdstream::SplitdirfdstreamWriter;
+///
+/// let mut buffer = Vec::new();
+/// let mut writer = SplitdirfdstreamWriter::new(&mut buffer);
+/// writer.write_metadata(b"hello").unwrap();
+/// writer.write_file_backed_data(0, 42, b"objects/abc123").unwrap();
+/// let _buf = writer.finish().unwrap();
+/// ```
+#[derive(Debug)]
+pub struct SplitdirfdstreamWriter<W> {
+    writer: W,
+}
+
+impl<W: Write> SplitdirfdstreamWriter<W> {
+    /// Create a new writer wrapping `writer`.
+    pub fn new(writer: W) -> Self {
+        Self { writer }
+    }
+
+    /// Write an inline chunk containing `data`.
+    ///
+    /// Empty slices are silently ignored (no bytes are written).
+    ///
+    /// # Errors
+    ///
+    /// Returns [`Error::InlineTooLarge`] if `data.len()` exceeds
+    /// [`MAX_INLINE_CHUNK_SIZE`], or propagates I/O errors from the
+    /// underlying writer.
+    pub fn write_metadata(&mut self, data: &[u8]) -> Result<()> {
+        if data.is_empty() {
+            return Ok(());
+        }
+        if data.len() > MAX_INLINE_CHUNK_SIZE {
+            return Err(Error::InlineTooLarge {
+                size: data.len(),
+                max: MAX_INLINE_CHUNK_SIZE,
+            });
+        }
+        self.writer.write_all(&[0x00u8])?;
+        self.writer.write_all(&(data.len() as u32).to_le_bytes())?;
+        self.writer.write_all(data)?;
+        Ok(())
+    }
+
+    /// Write an [`InlineData`](Chunk::InlineData) chunk carrying `data` bytes
+    /// of file content transported inline.
+    ///
+    /// Unlike [`write_metadata`](Self::write_metadata), a zero-length slice IS
+    /// written (a zero-byte non-world-readable file must still round-trip).
+    ///
+    /// # Errors
+    ///
+    /// Returns [`Error::InlineTooLarge`] if `data.len()` exceeds
+    /// [`MAX_INLINE_CHUNK_SIZE`], or propagates I/O errors from the underlying
+    /// writer.
+    pub fn write_inline_data(&mut self, data: &[u8]) -> Result<()> {
+        if data.len() > MAX_INLINE_CHUNK_SIZE {
+            return Err(Error::InlineTooLarge {
+                size: data.len(),
+                max: MAX_INLINE_CHUNK_SIZE,
+            });
+        }
+        self.writer.write_all(&[0x01u8])?;
+        self.writer.write_all(&(data.len() as u32).to_le_bytes())?;
+        self.writer.write_all(data)?;
+        Ok(())
+    }
+
+    /// Write a [`FileBackedData`](Chunk::FileBackedData) chunk referencing a
+    /// file by dirfd index and relative filename.
+    ///
+    /// The consumer will open `filename` beneath `dirfds[dirfd_index]` and
+    /// read exactly `length` bytes from offset 0.
+    ///
+    /// `filename` is validated by [`validate_filename`] before writing.
+    ///
+    /// # Errors
+    ///
+    /// Returns any [`Error`] produced by [`validate_filename`], or propagates
+    /// I/O errors from the underlying writer.
+    pub fn write_file_backed_data(
+        &mut self,
+        dirfd_index: u32,
+        length: u64,
+        filename: &[u8],
+    ) -> Result<()> {
+        validate_filename(filename)?;
+        self.writer.write_all(&[0x02u8])?;
+        self.writer.write_all(&length.to_le_bytes())?;
+        self.writer.write_all(&dirfd_index.to_le_bytes())?;
+        self.writer
+            .write_all(&(filename.len() as u32).to_le_bytes())?;
+        self.writer.write_all(filename)?;
+        Ok(())
+    }
+
+    /// Consume the writer and return the underlying `Write` impl.
+    pub fn finish(self) -> Result<W> {
+        Ok(self.writer)
+    }
+}
+
+/// Reader for parsing a splitdirfdstream.
+///
+/// Yields [`Chunk`] values by parsing the binary format. The internal buffer
+/// is reused across calls so that only one chunk's data is live at a time.
+///
+/// # Example
+///
+/// ```
+/// use composefs_splitdirfdstream::{SplitdirfdstreamReader, Chunk};
+///
+/// // Manually constructed stream: Metadata "hello"
+/// // Format: [0x00][5u32 LE][b"hello"]
+/// let mut data = Vec::new();
+/// data.push(0x00u8);
+/// data.extend_from_slice(&5u32.to_le_bytes());
+/// data.extend_from_slice(b"hello");
+///
+/// let mut reader = SplitdirfdstreamReader::new(data.as_slice());
+/// assert_eq!(reader.next_chunk().unwrap(), Some(Chunk::Metadata(b"hello")));
+/// assert_eq!(reader.next_chunk().unwrap(), None);
+/// ```
+#[derive(Debug)]
+pub struct SplitdirfdstreamReader<R> {
+    reader: R,
+    /// Internal buffer reused across [`next_chunk`](SplitdirfdstreamReader::next_chunk) calls.
+    buffer: Vec<u8>,
+}
+
+impl<R: Read> SplitdirfdstreamReader<R> {
+    /// Create a new reader wrapping `reader`.
+    pub fn new(reader: R) -> Self {
+        Self {
+            reader,
+            buffer: Vec::new(),
+        }
+    }
+
+    /// Consume this reader, returning the underlying `Read` impl.
+    pub fn into_inner(self) -> R {
+        self.reader
+    }
+
+    /// Return the next chunk from the stream, or `None` at a clean EOF.
+    ///
+    /// A clean EOF is one where zero bytes have been consumed from the current
+    /// type byte. Any partial read (0 < n < expected) is [`Error::Truncated`].
+    ///
+    /// The returned [`Chunk::FileBackedData`] contains the raw `filename` bytes
+    /// without any validation — callers that use `filename` directly **must**
+    /// call [`validate_filename`] themselves, or use [`reconstruct`] which does
+    /// it internally via [`open_beneath`].
+    ///
+    /// # Errors
+    ///
+    /// - [`Error::Truncated`] — stream ended mid-chunk
+    /// - [`Error::InlineTooLarge`] — Metadata or InlineData body size exceeds [`MAX_INLINE_CHUNK_SIZE`]
+    /// - [`Error::FilenameTooLong`] — filename length exceeds [`MAX_FILENAME_LEN`]
+    /// - [`Error::UnknownChunkType`] — unrecognised type byte
+    /// - [`Error::Io`] — underlying I/O error
+    pub fn next_chunk(&mut self) -> Result<Option<Chunk<'_>>> {
+        // Step 1: read the 1-byte type, distinguishing clean EOF from truncation.
+        let mut type_byte = [0u8; 1];
+        let mut got = 0usize;
+        loop {
+            match self.reader.read(&mut type_byte[got..]) {
+                Ok(0) => {
+                    if got == 0 {
+                        return Ok(None); // clean EOF at chunk boundary
+                    }
+                    return Err(Error::Truncated { expected: 1 });
+                }
+                Ok(n) => {
+                    got += n;
+                    if got == 1 {
+                        break;
+                    }
+                }
+                Err(e) => return Err(Error::Io(e)),
+            }
+        }
+
+        match type_byte[0] {
+            0x00 | 0x01 => {
+                // Metadata (0x00) or InlineData (0x01): read 4-byte u32 LE body length.
+                let mut len_bytes = [0u8; 4];
+                self.reader.read_exact(&mut len_bytes).map_err(|e| {
+                    if e.kind() == std::io::ErrorKind::UnexpectedEof {
+                        Error::Truncated { expected: 4 }
+                    } else {
+                        Error::Io(e)
+                    }
+                })?;
+                let length = u32::from_le_bytes(len_bytes) as usize;
+                if length > MAX_INLINE_CHUNK_SIZE {
+                    return Err(Error::InlineTooLarge {
+                        size: length,
+                        max: MAX_INLINE_CHUNK_SIZE,
+                    });
+                }
+                self.buffer.resize(length, 0);
+                self.reader.read_exact(&mut self.buffer).map_err(|e| {
+                    if e.kind() == std::io::ErrorKind::UnexpectedEof {
+                        Error::Truncated {
+                            expected: length as u64,
+                        }
+                    } else {
+                        Error::Io(e)
+                    }
+                })?;
+                if type_byte[0] == 0x00 {
+                    Ok(Some(Chunk::Metadata(&self.buffer)))
+                } else {
+                    Ok(Some(Chunk::InlineData(&self.buffer)))
+                }
+            }
+            0x02 => {
+                // FileBackedData: [u64 LE content_length][u32 LE dirfd_index][u32 LE name_len][name bytes]
+                let mut header = [0u8; 16];
+                self.reader.read_exact(&mut header).map_err(|e| {
+                    if e.kind() == std::io::ErrorKind::UnexpectedEof {
+                        Error::Truncated { expected: 16 }
+                    } else {
+                        Error::Io(e)
+                    }
+                })?;
+                let length = u64::from_le_bytes(header[0..8].try_into().unwrap());
+                let dirfd_index = u32::from_le_bytes(header[8..12].try_into().unwrap());
+                let name_len = u32::from_le_bytes(header[12..16].try_into().unwrap()) as usize;
+                if name_len > MAX_FILENAME_LEN {
+                    return Err(Error::FilenameTooLong {
+                        len: name_len,
+                        max: MAX_FILENAME_LEN,
+                    });
+                }
+                self.buffer.resize(name_len, 0);
+                self.reader.read_exact(&mut self.buffer).map_err(|e| {
+                    if e.kind() == std::io::ErrorKind::UnexpectedEof {
+                        Error::Truncated {
+                            expected: name_len as u64,
+                        }
+                    } else {
+                        Error::Io(e)
+                    }
+                })?;
+                Ok(Some(Chunk::FileBackedData {
+                    dirfd_index,
+                    length,
+                    filename: &self.buffer,
+                }))
+            }
+            other => Err(Error::UnknownChunkType(other)),
+        }
+    }
+}
+
+/// Validate that `filename` is an acceptable external filename.
+///
+/// Checks, in order:
+/// 1. Not empty.
+/// 2. Not longer than [`MAX_FILENAME_LEN`].
+/// 3. No embedded NUL bytes.
+/// 4. Not an absolute path (does not start with `/`).
+/// 5. No `..` path components.
+///
+/// Note that `.` components and empty components (from `//`) are permitted,
+/// as the kernel will handle them safely.
+///
+/// # Errors
+///
+/// Returns [`Error::InvalidFilename`] or [`Error::FilenameTooLong`] on failure.
+pub fn validate_filename(filename: &[u8]) -> Result<()> {
+    if filename.is_empty() {
+        return Err(Error::InvalidFilename {
+            reason: "empty filename",
+        });
+    }
+    if filename.len() > MAX_FILENAME_LEN {
+        return Err(Error::FilenameTooLong {
+            len: filename.len(),
+            max: MAX_FILENAME_LEN,
+        });
+    }
+    if filename.contains(&0u8) {
+        return Err(Error::InvalidFilename {
+            reason: "embedded NUL",
+        });
+    }
+    if filename[0] == b'/' {
+        return Err(Error::InvalidFilename {
+            reason: "absolute path",
+        });
+    }
+    for component in filename.split(|&b| b == b'/') {
+        if component == b".." {
+            return Err(Error::InvalidFilename {
+                reason: "`..` component",
+            });
+        }
+    }
+    Ok(())
+}
+
+/// Open a file at `filename` relative to `dirfd` using safe kernel primitives.
+///
+/// Internally calls `openat2(2)` with `RESOLVE_BENEATH | RESOLVE_NO_MAGICLINKS |
+/// RESOLVE_NO_XDEV` when available. On kernels that do not support `openat2`
+/// (`ENOSYS`), falls back to plain `openat(2)`.
+///
+/// # Security note
+///
+/// The `openat2` path prevents directory escapes (`RESOLVE_BENEATH`), blocks
+/// magic-link traversal (`RESOLVE_NO_MAGICLINKS`), prevents crossing filesystem
+/// boundaries (`RESOLVE_NO_XDEV`), and **does** allow symlinks as long as they
+/// resolve within the base directory. A symlink pointing outside the base
+/// directory is still rejected by `RESOLVE_BENEATH`.
+///
+/// The `openat` fallback does not enforce `RESOLVE_BENEATH` (that kernel
+/// feature is unavailable on old kernels), so callers should prefer environments
+/// with kernel ≥ 5.6 (where `openat2` is available) when strict confinement is
+/// required. [`validate_filename`] is still called before any syscall to reject
+/// `..` components and absolute paths.
+///
+/// # Errors
+///
+/// Returns any error from [`validate_filename`], or an [`Error::Io`] wrapping
+/// the kernel error (`EXDEV` for escape attempts, etc.).
+pub fn open_beneath(dirfd: BorrowedFd<'_>, filename: &[u8]) -> Result<OwnedFd> {
+    validate_filename(filename)?;
+
+    // Build a CString: validate_filename rejects embedded NUL so this is infallible.
+    let cname = CString::new(filename).expect("validate_filename guarantees no NUL");
+
+    // Try openat2 first (Linux ≥ 5.6).
+    // RESOLVE_BENEATH  — prevent escaping the base directory via any path tricks.
+    // RESOLVE_NO_MAGICLINKS — block /proc/self/fd-style magic links.
+    // RESOLVE_NO_XDEV  — prevent crossing filesystem mount boundaries.
+    // Symlinks that stay within the base directory are permitted.
+    let oflags = OFlags::RDONLY | OFlags::CLOEXEC;
+    let result = rustix::fs::openat2(
+        dirfd,
+        &cname,
+        oflags,
+        Mode::empty(),
+        ResolveFlags::BENEATH | ResolveFlags::NO_MAGICLINKS | ResolveFlags::NO_XDEV,
+    );
+
+    match result {
+        Ok(fd) => Ok(fd),
+        Err(e) if e == Errno::NOSYS => {
+            // Kernel too old for openat2; fall back to plain openat.
+            // validate_filename already rejected `..` and absolute paths.
+            rustix::fs::openat(
+                dirfd,
+                &cname,
+                OFlags::RDONLY | OFlags::CLOEXEC,
+                Mode::empty(),
+            )
+            .map_err(Error::from)
+        }
+        Err(e) => Err(e.into()),
+    }
+}
+
+/// Reconstruct the byte stream encoded in `stream` by combining inline chunks
+/// and external file data, writing all output to `output`.
+///
+/// For each [`Chunk::FileBackedData`], the corresponding file is opened with
+/// [`open_beneath`] using `dirfds[dirfd_index]` as the base directory, and
+/// exactly `length` bytes are read from offset 0 via positional reads
+/// (`pread(2)`), so the same file can be referenced multiple times without
+/// seeking.
+///
+/// Returns the total number of bytes written to `output`.
+///
+/// # Errors
+///
+/// - [`Error::DirfdIndexOutOfRange`] — `dirfd_index` ≥ `dirfds.len()`
+/// - [`Error::ExternalTooShort`] — external file has fewer bytes than declared
+/// - Any error from [`open_beneath`] or from writing to `output`
+pub fn reconstruct<R: Read, W: Write>(
+    stream: R,
+    dirfds: &[BorrowedFd<'_>],
+    output: &mut W,
+) -> Result<u64> {
+    let mut reader = SplitdirfdstreamReader::new(stream);
+    let mut total: u64 = 0;
+    const BUF_SIZE: usize = 128 * 1024;
+
+    while let Some(chunk) = reader.next_chunk()? {
+        match chunk {
+            Chunk::Metadata(data) => {
+                output.write_all(data)?;
+                total += data.len() as u64;
+            }
+            Chunk::InlineData(data) => {
+                output.write_all(data)?;
+                total += data.len() as u64;
+            }
+            Chunk::FileBackedData {
+                dirfd_index,
+                length,
+                filename,
+            } => {
+                let idx = dirfd_index as usize;
+                if idx >= dirfds.len() {
+                    return Err(Error::DirfdIndexOutOfRange {
+                        index: dirfd_index,
+                        count: dirfds.len(),
+                    });
+                }
+                let fd = open_beneath(dirfds[idx], filename)?;
+                // Cap the scratch buffer at BUF_SIZE; never size it from the
+                // attacker-controlled `length` (which can be up to u64::MAX),
+                // which would overflow `as usize` / wrap on 32-bit and panic
+                // under overflow-checks. `to_read` below bounds each read.
+                let mut buf =
+                    vec![0u8; BUF_SIZE.min(usize::try_from(length).unwrap_or(usize::MAX))];
+                let mut remaining = length;
+                let mut offset: u64 = 0;
+                while remaining > 0 {
+                    let to_read = (remaining as usize).min(buf.len());
+                    let n =
+                        rustix::io::pread(&fd, &mut buf[..to_read], offset).map_err(Error::from)?;
+                    if n == 0 {
+                        return Err(Error::ExternalTooShort {
+                            declared: length,
+                            actual: length - remaining,
+                        });
+                    }
+                    output.write_all(&buf[..n])?;
+                    remaining -= n as u64;
+                    offset += n as u64;
+                }
+                total += length;
+            }
+        }
+    }
+
+    Ok(total)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::os::fd::AsFd;
+
+    // -------------------------------------------------------------------------
+    // Helpers
+    // -------------------------------------------------------------------------
+
+    /// Write `chunks` into a buffer and read them back, returning the decoded
+    /// chunks as `(dirfd_index, length, filename, inline_data)` tuples.
+    fn roundtrip_stream(writes: &[WriteCmd<'_>]) -> (Vec<u8>, Vec<DecodedChunk>) {
+        let mut buf = Vec::new();
+        {
+            let mut w = SplitdirfdstreamWriter::new(&mut buf);
+            for cmd in writes {
+                match cmd {
+                    WriteCmd::Metadata(data) => w.write_metadata(data).unwrap(),
+                    WriteCmd::InlineData(data) => w.write_inline_data(data).unwrap(),
+                    WriteCmd::FileBackedData {
+                        dirfd_index,
+                        length,
+                        filename,
+                    } => w
+                        .write_file_backed_data(*dirfd_index, *length, filename)
+                        .unwrap(),
+                }
+            }
+            w.finish().unwrap();
+        }
+        let mut reader = SplitdirfdstreamReader::new(buf.as_slice());
+        let mut out = Vec::new();
+        while let Some(chunk) = reader.next_chunk().unwrap() {
+            out.push(DecodedChunk::from(&chunk));
+        }
+        (buf, out)
+    }
+
+    #[derive(Debug)]
+    enum WriteCmd<'a> {
+        Metadata(&'a [u8]),
+        InlineData(&'a [u8]),
+        FileBackedData {
+            dirfd_index: u32,
+            length: u64,
+            filename: &'a [u8],
+        },
+    }
+
+    #[derive(Debug, PartialEq, Eq)]
+    enum DecodedChunk {
+        Metadata(Vec<u8>),
+        InlineData(Vec<u8>),
+        FileBackedData {
+            dirfd_index: u32,
+            length: u64,
+            filename: Vec<u8>,
+        },
+    }
+
+    impl<'a> From<&Chunk<'a>> for DecodedChunk {
+        fn from(c: &Chunk<'a>) -> Self {
+            match c {
+                Chunk::Metadata(d) => DecodedChunk::Metadata(d.to_vec()),
+                Chunk::InlineData(d) => DecodedChunk::InlineData(d.to_vec()),
+                Chunk::FileBackedData {
+                    dirfd_index,
+                    length,
+                    filename,
+                } => DecodedChunk::FileBackedData {
+                    dirfd_index: *dirfd_index,
+                    length: *length,
+                    filename: filename.to_vec(),
+                },
+            }
+        }
+    }
+
+    // -------------------------------------------------------------------------
+    // Basic wire-format tests
+    // -------------------------------------------------------------------------
+
+    #[test]
+    fn empty_stream_returns_none() {
+        let mut reader = SplitdirfdstreamReader::new(b"".as_slice());
+        assert_eq!(reader.next_chunk().unwrap(), None);
+    }
+
+    #[test]
+    fn roundtrip_inline_only() {
+        let (_, chunks) =
+            roundtrip_stream(&[WriteCmd::Metadata(b"hello"), WriteCmd::Metadata(b"world")]);
+        assert_eq!(
+            chunks,
+            vec![
+                DecodedChunk::Metadata(b"hello".to_vec()),
+                DecodedChunk::Metadata(b"world".to_vec()),
+            ]
+        );
+    }
+
+    #[test]
+    fn roundtrip_external_only() {
+        let (_, chunks) = roundtrip_stream(&[
+            WriteCmd::FileBackedData {
+                dirfd_index: 0,
+                length: 100,
+                filename: b"a/b",
+            },
+            WriteCmd::FileBackedData {
+                dirfd_index: 3,
+                length: 0,
+                filename: b"x/y/z",
+            },
+        ]);
+        assert_eq!(
+            chunks,
+            vec![
+                DecodedChunk::FileBackedData {
+                    dirfd_index: 0,
+                    length: 100,
+                    filename: b"a/b".to_vec()
+                },
+                DecodedChunk::FileBackedData {
+                    dirfd_index: 3,
+                    length: 0,
+                    filename: b"x/y/z".to_vec()
+                },
+            ]
+        );
+    }
+
+    #[test]
+    fn roundtrip_mixed_interleaved() {
+        let (_, chunks) = roundtrip_stream(&[
+            WriteCmd::Metadata(b"header"),
+            WriteCmd::FileBackedData {
+                dirfd_index: 0,
+                length: 7,
+                filename: b"blob",
+            },
+            WriteCmd::Metadata(b"middle"),
+            WriteCmd::FileBackedData {
+                dirfd_index: 1,
+                length: 3,
+                filename: b"sub/c",
+            },
+            WriteCmd::Metadata(b"footer"),
+        ]);
+        assert_eq!(chunks.len(), 5);
+        assert_eq!(chunks[0], DecodedChunk::Metadata(b"header".to_vec()));
+        assert_eq!(
+            chunks[1],
+            DecodedChunk::FileBackedData {
+                dirfd_index: 0,
+                length: 7,
+                filename: b"blob".to_vec()
+            }
+        );
+        assert_eq!(chunks[2], DecodedChunk::Metadata(b"middle".to_vec()));
+        assert_eq!(
+            chunks[3],
+            DecodedChunk::FileBackedData {
+                dirfd_index: 1,
+                length: 3,
+                filename: b"sub/c".to_vec()
+            }
+        );
+        assert_eq!(chunks[4], DecodedChunk::Metadata(b"footer".to_vec()));
+    }
+
+    #[test]
+    fn empty_inline_is_no_op() {
+        let (buf, chunks) = roundtrip_stream(&[
+            WriteCmd::Metadata(b""),
+            WriteCmd::Metadata(b"real"),
+            WriteCmd::Metadata(b""),
+        ]);
+        // Only "real" produces a chunk; empties are silently dropped.
+        assert_eq!(chunks, vec![DecodedChunk::Metadata(b"real".to_vec())]);
+        // Buffer: 1-byte type + 4-byte u32 length + 4 bytes data
+        assert_eq!(buf.len(), 9);
+    }
+
+    #[test]
+    fn external_wire_layout() {
+        // Verify the exact byte layout for a known external chunk.
+        // dirfd_index=2, length=9, filename=b"ab"
+        // Format: [0x02][9u64 LE][2u32 LE][2u32 LE][b'a',b'b']
+        let mut buf = Vec::new();
+        SplitdirfdstreamWriter::new(&mut buf)
+            .write_file_backed_data(2, 9, b"ab")
+            .unwrap();
+
+        let expected: Vec<u8> = {
+            let mut v = Vec::new();
+            v.push(0x02u8); // type byte
+            v.extend_from_slice(&9u64.to_le_bytes()); // content_length
+            v.extend_from_slice(&2u32.to_le_bytes()); // dirfd_index
+            v.extend_from_slice(&2u32.to_le_bytes()); // name_len
+            v.extend_from_slice(b"ab"); // filename
+            v
+        };
+        assert_eq!(buf, expected);
+    }
+
+    #[test]
+    fn boundary_inline_sizes() {
+        for &size in &[1usize, 7, 8, 9, 255, 256, 257, 4095, 4096, 4097] {
+            let data: Vec<u8> = (0..size).map(|i| (i % 256) as u8).collect();
+            let (buf, chunks) = roundtrip_stream(&[WriteCmd::Metadata(&data)]);
+
+            // Wire layout: 1-byte type + 4-byte u32 length + `size` bytes
+            assert_eq!(buf.len(), 5 + size, "buf.len() for size={size}");
+            assert_eq!(buf[0], 0x00u8, "type byte for size={size}");
+            let len_field = u32::from_le_bytes(buf[1..5].try_into().unwrap());
+            assert_eq!(len_field, size as u32, "length field for size={size}");
+
+            assert_eq!(
+                chunks,
+                vec![DecodedChunk::Metadata(data)],
+                "data for size={size}"
+            );
+        }
+    }
+
+    // -------------------------------------------------------------------------
+    // Reader limit / boundary error tests (hand-crafted buffers)
+    // -------------------------------------------------------------------------
+
+    #[test]
+    fn unknown_chunk_type_returns_error() {
+        // A type byte that is not 0x00, 0x01, or 0x02 must yield UnknownChunkType.
+        let buf = vec![0x03u8];
+        let mut reader = SplitdirfdstreamReader::new(buf.as_slice());
+        let err = reader.next_chunk().unwrap_err();
+        assert!(
+            matches!(err, Error::UnknownChunkType(0x03)),
+            "expected UnknownChunkType(0x03), got {err:?}"
+        );
+    }
+
+    #[test]
+    fn error_unknown_chunk_type() {
+        // Feeding byte 0x80 must yield UnknownChunkType(0x80).
+        let buf = vec![0x80u8];
+        let mut reader = SplitdirfdstreamReader::new(buf.as_slice());
+        let err = reader.next_chunk().unwrap_err();
+        assert!(
+            matches!(err, Error::UnknownChunkType(0x80)),
+            "expected UnknownChunkType(0x80), got {err:?}"
+        );
+    }
+
+    #[test]
+    fn error_filename_too_long_in_reader() {
+        // [0x02][0u64 LE][0u32 LE][name_len as u32 LE] with name_len > MAX_FILENAME_LEN
+        let name_len = MAX_FILENAME_LEN + 1;
+        let mut buf = Vec::new();
+        buf.push(0x02u8);
+        buf.extend_from_slice(&0u64.to_le_bytes()); // content_length
+        buf.extend_from_slice(&0u32.to_le_bytes()); // dirfd_index
+        buf.extend_from_slice(&(name_len as u32).to_le_bytes()); // name_len
+        let mut reader = SplitdirfdstreamReader::new(buf.as_slice());
+        let err = reader.next_chunk().unwrap_err();
+        assert!(
+            matches!(err, Error::FilenameTooLong { len, max } if len == name_len && max == MAX_FILENAME_LEN),
+            "expected FilenameTooLong, got {err:?}"
+        );
+    }
+
+    #[test]
+    fn error_inline_too_large() {
+        // [0x00][(512 MiB) as u32 LE] — no body needed; length check fires first.
+        let size: u32 = 512 * 1024 * 1024;
+        let mut buf = Vec::new();
+        buf.push(0x00u8);
+        buf.extend_from_slice(&size.to_le_bytes());
+        let mut reader = SplitdirfdstreamReader::new(buf.as_slice());
+        let err = reader.next_chunk().unwrap_err();
+        assert!(
+            matches!(err, Error::InlineTooLarge { size: s, max } if s == 512*1024*1024 && max == MAX_INLINE_CHUNK_SIZE),
+            "expected InlineTooLarge, got {err:?}"
+        );
+    }
+
+    #[test]
+    fn error_truncated_inline_content() {
+        // [0x00][100u32 LE] then only 10 data bytes; Truncated{expected:100}.
+        let mut buf = Vec::new();
+        buf.push(0x00u8);
+        buf.extend_from_slice(&100u32.to_le_bytes());
+        buf.extend_from_slice(&[0u8; 10]);
+        let mut reader = SplitdirfdstreamReader::new(buf.as_slice());
+        let err = reader.next_chunk().unwrap_err();
+        assert!(
+            matches!(err, Error::Truncated { expected: 100 }),
+            "expected Truncated{{100}}, got {err:?}"
+        );
+    }
+
+    #[test]
+    fn error_truncated_file_backed_data() {
+        // [0x02] then only 5 of the 16 required header bytes.
+        let mut buf = Vec::new();
+        buf.push(0x02u8);
+        buf.extend_from_slice(&[0u8; 5]);
+        let mut reader = SplitdirfdstreamReader::new(buf.as_slice());
+        let err = reader.next_chunk().unwrap_err();
+        assert!(
+            matches!(err, Error::Truncated { .. }),
+            "expected Truncated, got {err:?}"
+        );
+    }
+
+    #[test]
+    fn error_truncated_prefix_after_type_byte() {
+        // Type byte 0x00 is read, then EOF before the 4-byte length — Truncated{expected:4}.
+        let buf = vec![0x00u8];
+        let mut reader = SplitdirfdstreamReader::new(buf.as_slice());
+        let err = reader.next_chunk().unwrap_err();
+        assert!(
+            matches!(err, Error::Truncated { expected: 4 }),
+            "expected Truncated{{4}}, got {err:?}"
+        );
+    }
+
+    #[test]
+    fn error_dirfd_index_out_of_range_via_reconstruct() {
+        // External chunk references dirfd_index=5 but we supply 1 dirfd.
+        let tmp = tempfile::tempdir().unwrap();
+        let dir_fd = rustix::fs::open(
+            tmp.path(),
+            OFlags::RDONLY | OFlags::DIRECTORY | OFlags::CLOEXEC,
+            Mode::empty(),
+        )
+        .unwrap();
+
+        let mut buf = Vec::new();
+        SplitdirfdstreamWriter::new(&mut buf)
+            .write_file_backed_data(5, 0, b"dummy")
+            .unwrap();
+
+        let dirfds: &[BorrowedFd<'_>] = &[dir_fd.as_fd()];
+        let mut out = Vec::new();
+        let err = reconstruct(buf.as_slice(), dirfds, &mut out).unwrap_err();
+        assert!(
+            matches!(err, Error::DirfdIndexOutOfRange { index: 5, count: 1 }),
+            "expected DirfdIndexOutOfRange, got {err:?}"
+        );
+    }
+
+    // -------------------------------------------------------------------------
+    // validate_filename tests (data-driven)
+    // -------------------------------------------------------------------------
+
+    /// Expected outcome for a validate_filename test case.
+    #[derive(Debug)]
+    enum FilenameExpect {
+        /// validate_filename must return Ok(()).
+        Ok,
+        /// validate_filename must return Err(InvalidFilename { reason }) where
+        /// `reason` contains the given substring.
+        InvalidFilename(&'static str),
+        /// validate_filename must return Err(FilenameTooLong { .. }).
+        TooLong,
+    }
+
+    #[test]
+    fn validate_filename_cases() {
+        let long_ok: Vec<u8> = vec![b'a'; MAX_FILENAME_LEN];
+        let long_bad: Vec<u8> = vec![b'a'; MAX_FILENAME_LEN + 1];
+
+        let cases: &[(&[u8], FilenameExpect)] = &[
+            // ── rejection cases ──────────────────────────────────────────────
+            (b"", FilenameExpect::InvalidFilename("empty filename")),
+            (b"/abs", FilenameExpect::InvalidFilename("absolute path")),
+            (b"a/../b", FilenameExpect::InvalidFilename("`..` component")),
+            (
+                b"../escape",
+                FilenameExpect::InvalidFilename("`..` component"),
+            ),
+            (b"a/b/..", FilenameExpect::InvalidFilename("`..` component")),
+            (b"..", FilenameExpect::InvalidFilename("`..` component")), // D2: bare `..`
+            (b"foo\0bar", FilenameExpect::InvalidFilename("embedded NUL")),
+            (&long_bad, FilenameExpect::TooLong),
+            // ── acceptance cases ─────────────────────────────────────────────
+            (b"a/b/c", FilenameExpect::Ok),   // normal relative path
+            (b"a/./b", FilenameExpect::Ok),   // `.` is allowed
+            (&long_ok, FilenameExpect::Ok),   // exactly MAX_FILENAME_LEN
+            (b"..foo", FilenameExpect::Ok),   // D5: `..`-prefix but not a component
+            (b"foo..", FilenameExpect::Ok),   // D5: `..`-suffix but not a component
+            (b"a/..foo", FilenameExpect::Ok), // D5: `..`-prefixed component
+            (b"foo../b", FilenameExpect::Ok), // D5: `..`-suffixed component in path
+        ];
+
+        for (filename, expect) in cases {
+            let result = validate_filename(filename);
+            match expect {
+                FilenameExpect::Ok => {
+                    assert!(
+                        result.is_ok(),
+                        "validate_filename({filename:?}) should be Ok, got {result:?}"
+                    );
+                }
+                FilenameExpect::InvalidFilename(substr) => {
+                    assert!(
+                        matches!(&result, Err(Error::InvalidFilename { reason }) if reason.contains(substr)),
+                        "validate_filename({filename:?}) should be InvalidFilename containing {substr:?}, got {result:?}"
+                    );
+                }
+                FilenameExpect::TooLong => {
+                    assert!(
+                        matches!(&result, Err(Error::FilenameTooLong { .. })),
+                        "validate_filename({filename:?}) should be FilenameTooLong, got {result:?}"
+                    );
+                }
+            }
+        }
+    }
+
+    // -------------------------------------------------------------------------
+    // Reconstruction tests
+    // -------------------------------------------------------------------------
+
+    fn open_dir(path: &std::path::Path) -> OwnedFd {
+        rustix::fs::open(
+            path,
+            OFlags::RDONLY | OFlags::DIRECTORY | OFlags::CLOEXEC,
+            Mode::empty(),
+        )
+        .unwrap()
+    }
+
+    #[test]
+    fn reconstruct_inline_only() {
+        let tmp = tempfile::tempdir().unwrap();
+        let dir_fd = open_dir(tmp.path());
+
+        let mut buf = Vec::new();
+        {
+            let mut w = SplitdirfdstreamWriter::new(&mut buf);
+            w.write_metadata(b"Hello, ").unwrap();
+            w.write_metadata(b"world!").unwrap();
+            w.finish().unwrap();
+        }
+
+        let dirfds: &[BorrowedFd<'_>] = &[dir_fd.as_fd()];
+        let mut out = Vec::new();
+        let n = reconstruct(buf.as_slice(), dirfds, &mut out).unwrap();
+        assert_eq!(out, b"Hello, world!");
+        assert_eq!(n, 13);
+    }
+
+    #[test]
+    fn reconstruct_with_externals() {
+        let tmp = tempfile::tempdir().unwrap();
+        std::fs::write(tmp.path().join("a"), b"FILEONE").unwrap();
+        std::fs::create_dir(tmp.path().join("subdir")).unwrap();
+        std::fs::write(tmp.path().join("subdir/b"), b"FILETWO").unwrap();
+
+        let dir_fd = open_dir(tmp.path());
+
+        let mut buf = Vec::new();
+        {
+            let mut w = SplitdirfdstreamWriter::new(&mut buf);
+            w.write_metadata(b"[").unwrap();
+            w.write_file_backed_data(0, 7, b"a").unwrap();
+            w.write_metadata(b"|").unwrap();
+            w.write_file_backed_data(0, 7, b"subdir/b").unwrap();
+            w.write_metadata(b"]").unwrap();
+            w.finish().unwrap();
+        }
+
+        let dirfds: &[BorrowedFd<'_>] = &[dir_fd.as_fd()];
+        let mut out = Vec::new();
+        let n = reconstruct(buf.as_slice(), dirfds, &mut out).unwrap();
+        assert_eq!(out, b"[FILEONE|FILETWO]");
+        assert_eq!(n, 17);
+    }
+
+    #[test]
+    fn reconstruct_same_file_twice() {
+        let tmp = tempfile::tempdir().unwrap();
+        std::fs::write(tmp.path().join("data"), b"REPEAT").unwrap();
+        let dir_fd = open_dir(tmp.path());
+
+        let mut buf = Vec::new();
+        {
+            let mut w = SplitdirfdstreamWriter::new(&mut buf);
+            w.write_file_backed_data(0, 6, b"data").unwrap();
+            w.write_metadata(b"-").unwrap();
+            w.write_file_backed_data(0, 6, b"data").unwrap();
+            w.finish().unwrap();
+        }
+
+        let dirfds: &[BorrowedFd<'_>] = &[dir_fd.as_fd()];
+        let mut out = Vec::new();
+        let n = reconstruct(buf.as_slice(), dirfds, &mut out).unwrap();
+        // pread always starts from offset 0, so both refs read the full file.
+        assert_eq!(out, b"REPEAT-REPEAT");
+        assert_eq!(n, 13);
+    }
+
+    #[test]
+    fn reconstruct_length_shorter_than_file() {
+        let tmp = tempfile::tempdir().unwrap();
+        std::fs::write(tmp.path().join("big"), b"ABCDEFGH").unwrap(); // 8 bytes
+        let dir_fd = open_dir(tmp.path());
+
+        let mut buf = Vec::new();
+        {
+            let mut w = SplitdirfdstreamWriter::new(&mut buf);
+            w.write_file_backed_data(0, 4, b"big").unwrap(); // only first 4 bytes
+            w.finish().unwrap();
+        }
+
+        let dirfds: &[BorrowedFd<'_>] = &[dir_fd.as_fd()];
+        let mut out = Vec::new();
+        let n = reconstruct(buf.as_slice(), dirfds, &mut out).unwrap();
+        assert_eq!(out, b"ABCD");
+        assert_eq!(n, 4);
+    }
+
+    #[test]
+    fn reconstruct_length_longer_than_file_is_error() {
+        let tmp = tempfile::tempdir().unwrap();
+        std::fs::write(tmp.path().join("small"), b"HI").unwrap(); // 2 bytes
+        let dir_fd = open_dir(tmp.path());
+
+        let mut buf = Vec::new();
+        {
+            let mut w = SplitdirfdstreamWriter::new(&mut buf);
+            w.write_file_backed_data(0, 100, b"small").unwrap(); // declare 100, file has 2
+            w.finish().unwrap();
+        }
+
+        let dirfds: &[BorrowedFd<'_>] = &[dir_fd.as_fd()];
+        let mut out = Vec::new();
+        let err = reconstruct(buf.as_slice(), dirfds, &mut out).unwrap_err();
+        assert!(
+            matches!(
+                err,
+                Error::ExternalTooShort {
+                    declared: 100,
+                    actual: 2
+                }
+            ),
+            "expected ExternalTooShort, got {err:?}"
+        );
+    }
+
+    /// Build a raw splitdirfdstream buffer containing a single FileBackedData chunk
+    /// with a given `length` field and filename, without going through
+    /// the writer (so we can set length=u64::MAX which the writer itself would
+    /// also accept).
+    fn make_external_chunk_buf(dirfd_index: u32, length: u64, filename: &[u8]) -> Vec<u8> {
+        let mut buf = Vec::new();
+        buf.push(0x02u8); // type byte
+        buf.extend_from_slice(&length.to_le_bytes()); // content_length
+        buf.extend_from_slice(&dirfd_index.to_le_bytes()); // dirfd_index
+        buf.extend_from_slice(&(filename.len() as u32).to_le_bytes()); // name_len
+        buf.extend_from_slice(filename); // filename
+        buf
+    }
+
+    #[test]
+    fn reconstruct_length_u64_max_does_not_overflow() {
+        // Guards the buffer-sizing path: `BUF_SIZE.min(usize::try_from(length).unwrap_or(usize::MAX))`
+        // must not panic or produce a wrong result when length = u64::MAX.
+        // The file has only a few real bytes; the first pread reads them, the
+        // second sees EOF and must return ExternalTooShort — not a panic.
+        let tmp = tempfile::tempdir().unwrap();
+        let fname = b"tiny";
+        std::fs::write(tmp.path().join("tiny"), b"abc").unwrap(); // 3 bytes
+        let dir_fd = open_dir(tmp.path());
+
+        let buf = make_external_chunk_buf(0, u64::MAX, fname);
+
+        let dirfds: &[BorrowedFd<'_>] = &[dir_fd.as_fd()];
+        let mut out = Vec::new();
+        let err = reconstruct(buf.as_slice(), dirfds, &mut out).unwrap_err();
+        assert!(
+            matches!(
+                err,
+                Error::ExternalTooShort {
+                    declared: u64::MAX,
+                    ..
+                }
+            ),
+            "expected ExternalTooShort with declared=u64::MAX, got {err:?}"
+        );
+    }
+
+    #[test]
+    fn reconstruct_multiple_dirfds() {
+        let tmp0 = tempfile::tempdir().unwrap();
+        let tmp1 = tempfile::tempdir().unwrap();
+        std::fs::write(tmp0.path().join("x"), b"FROM0").unwrap();
+        std::fs::write(tmp1.path().join("y"), b"FROM1").unwrap();
+        let dir0 = open_dir(tmp0.path());
+        let dir1 = open_dir(tmp1.path());
+
+        let mut buf = Vec::new();
+        {
+            let mut w = SplitdirfdstreamWriter::new(&mut buf);
+            w.write_file_backed_data(0, 5, b"x").unwrap();
+            w.write_metadata(b"+").unwrap();
+            w.write_file_backed_data(1, 5, b"y").unwrap();
+            w.finish().unwrap();
+        }
+
+        let dirfds: &[BorrowedFd<'_>] = &[dir0.as_fd(), dir1.as_fd()];
+        let mut out = Vec::new();
+        let n = reconstruct(buf.as_slice(), dirfds, &mut out).unwrap();
+        assert_eq!(out, b"FROM0+FROM1");
+        assert_eq!(n, 11);
+    }
+
+    // -------------------------------------------------------------------------
+    // open_beneath safety tests
+    // -------------------------------------------------------------------------
+
+    #[test]
+    fn open_beneath_rejects_escape_via_dotdot() {
+        // validate_filename catches ".." before any syscall, so this is
+        // kernel-independent.
+        let tmp = tempfile::tempdir().unwrap();
+        let dir_fd = open_dir(tmp.path());
+        let err = open_beneath(dir_fd.as_fd(), b"../anything").unwrap_err();
+        assert!(
+            matches!(
+                err,
+                Error::InvalidFilename {
+                    reason: "`..` component"
+                }
+            ),
+            "expected InvalidFilename, got {err:?}"
+        );
+    }
+
+    #[test]
+    fn open_beneath_follows_symlink_within_base() {
+        // A symlink whose target resolves *within* the base directory must be
+        // followed successfully — this is the new behaviour after removing
+        // RESOLVE_NO_SYMLINKS.  The l/<linkid> symlinks created by
+        // containers/storage are exactly this kind of within-base symlink.
+        let tmp = tempfile::tempdir().unwrap();
+        std::fs::write(tmp.path().join("target"), b"data").unwrap();
+        std::os::unix::fs::symlink("target", tmp.path().join("link")).unwrap();
+
+        let dir_fd = open_dir(tmp.path());
+        let result = open_beneath(dir_fd.as_fd(), b"link");
+        assert!(
+            result.is_ok(),
+            "symlink resolving within the base directory must be followed; got {result:?}"
+        );
+    }
+
+    #[test]
+    fn open_beneath_rejects_escape_via_symlink() {
+        // A symlink whose target escapes the base directory must be rejected
+        // (RESOLVE_BENEATH) on kernels with openat2.  On old kernels the
+        // fallback cannot enforce this; skip in that case.
+        let tmp = tempfile::tempdir().unwrap();
+        std::fs::create_dir(tmp.path().join("real")).unwrap();
+        std::fs::write(tmp.path().join("real/passwd"), b"data").unwrap();
+        // Symlink to /etc — outside the base dir.
+        std::os::unix::fs::symlink("/etc", tmp.path().join("link")).unwrap();
+
+        let dir_fd = open_dir(tmp.path());
+
+        // Probe openat2 availability.
+        let probe = rustix::fs::openat2(
+            dir_fd.as_fd(),
+            rustix::cstr!("real/passwd"),
+            OFlags::RDONLY | OFlags::CLOEXEC,
+            Mode::empty(),
+            ResolveFlags::BENEATH | ResolveFlags::NO_MAGICLINKS | ResolveFlags::NO_XDEV,
+        );
+        if let Err(e) = probe
+            && e == Errno::NOSYS
+        {
+            // openat2 not available: skip this test.
+            return;
+        }
+
+        // openat2 is available: symlink escaping outside the base must be rejected.
+        let result = open_beneath(dir_fd.as_fd(), b"link/passwd");
+        assert!(
+            result.is_err(),
+            "openat2 path must reject symlink escaping the base directory; got Ok"
+        );
+    }
+
+    // -------------------------------------------------------------------------
+    // InlineData chunk tests
+    // -------------------------------------------------------------------------
+
+    #[test]
+    fn file_content_wire_layout() {
+        // Exact bytes: [0x01][5u32 LE][b"hello"]
+        let data = b"hello";
+        let mut buf = Vec::new();
+        SplitdirfdstreamWriter::new(&mut buf)
+            .write_inline_data(data)
+            .unwrap();
+
+        let expected: Vec<u8> = {
+            let mut v = Vec::new();
+            v.push(0x01u8); // type byte
+            v.extend_from_slice(&5u32.to_le_bytes()); // length
+            v.extend_from_slice(b"hello"); // data
+            v
+        };
+        assert_eq!(buf, expected, "InlineData wire layout mismatch");
+    }
+
+    #[test]
+    fn roundtrip_file_content_zero_bytes() {
+        // Zero-length FileContent must still be written and round-trip.
+        let (buf, chunks) = roundtrip_stream(&[WriteCmd::InlineData(b"")]);
+        // 1-byte type + 4-byte u32 length + 0 data = 5 bytes
+        assert_eq!(buf.len(), 5, "zero-length InlineData should be 5 bytes");
+        assert_eq!(chunks, vec![DecodedChunk::InlineData(vec![])]);
+    }
+
+    #[test]
+    fn roundtrip_file_content() {
+        for size in [0usize, 1, 64, 65, 4096] {
+            let data: Vec<u8> = (0..size).map(|i| (i % 256) as u8).collect();
+            let (buf, chunks) = roundtrip_stream(&[WriteCmd::InlineData(&data)]);
+
+            // Wire layout: 1-byte type + 4-byte u32 length + `size` bytes
+            assert_eq!(buf.len(), 5 + size, "buf.len() for InlineData size={size}");
+            assert_eq!(
+                chunks,
+                vec![DecodedChunk::InlineData(data)],
+                "decoded data for size={size}"
+            );
+        }
+    }
+
+    #[test]
+    fn reconstruct_file_content() {
+        // FileContent in a stream reconstructs to the verbatim bytes.
+        let tmp = tempfile::tempdir().unwrap();
+        let dir_fd = open_dir(tmp.path());
+
+        let mut buf = Vec::new();
+        {
+            let mut w = SplitdirfdstreamWriter::new(&mut buf);
+            w.write_metadata(b"[").unwrap();
+            w.write_inline_data(b"CONTENT").unwrap();
+            w.write_metadata(b"]").unwrap();
+            w.finish().unwrap();
+        }
+
+        let dirfds: &[BorrowedFd<'_>] = &[dir_fd.as_fd()];
+        let mut out = Vec::new();
+        let n = reconstruct(buf.as_slice(), dirfds, &mut out).unwrap();
+        assert_eq!(out, b"[CONTENT]");
+        assert_eq!(n, 9);
+    }
+
+    // -------------------------------------------------------------------------
+    // Proptest suite
+    // -------------------------------------------------------------------------
+
+    mod proptest_tests {
+        use super::*;
+        use proptest::prelude::*;
+
+        /// Strategy: 1–4 path components of [a-z0-9]{1,8} joined by '/',
+        /// always stays below MAX_FILENAME_LEN and never contains `..`.
+        fn filename_strategy() -> impl Strategy<Value = Vec<u8>> {
+            let component = prop::string::string_regex("[a-z0-9]{1,8}").unwrap();
+            prop::collection::vec(component, 1..=4).prop_map(|parts| parts.join("/").into_bytes())
+        }
+
+        /// A chunk that carries its own content for proptest generation.
+        #[derive(Debug, Clone)]
+        enum TestChunk {
+            Metadata(Vec<u8>),
+            InlineData(Vec<u8>),
+            FileBackedData {
+                /// Unnormalized index in `0..4`; normalized to `% num_dirs` at test time.
+                dirfd_index: usize,
+                filename: Vec<u8>,
+                content: Vec<u8>,
+            },
+        }
+
+        fn metadata_strategy() -> impl Strategy<Value = TestChunk> {
+            prop::collection::vec(any::<u8>(), 1..=4096).prop_map(TestChunk::Metadata)
+        }
+
+        fn file_backed_data_strategy() -> impl Strategy<Value = TestChunk> {
+            (
+                0usize..4,
+                filename_strategy(),
+                prop::collection::vec(any::<u8>(), 0..=8192),
+            )
+                .prop_map(|(idx, name, content)| TestChunk::FileBackedData {
+                    dirfd_index: idx,
+                    filename: name,
+                    content,
+                })
+        }
+
+        fn inline_data_strategy() -> impl Strategy<Value = TestChunk> {
+            prop::collection::vec(any::<u8>(), 0..=4096).prop_map(TestChunk::InlineData)
+        }
+
+        fn chunk_strategy() -> impl Strategy<Value = TestChunk> {
+            prop_oneof![
+                metadata_strategy(),
+                file_backed_data_strategy(),
+                inline_data_strategy()
+            ]
+        }
+
+        fn chunks_strategy() -> impl Strategy<Value = Vec<TestChunk>> {
+            prop::collection::vec(chunk_strategy(), 0..=32)
+        }
+
+        proptest! {
+            #![proptest_config(ProptestConfig::with_cases(256))]
+
+            #[test]
+            fn proptest_roundtrip(chunks in chunks_strategy()) {
+                use std::collections::HashMap;
+
+                let num_dirs = 4usize;
+                let tmpdirs: Vec<tempfile::TempDir> = (0..num_dirs)
+                    .map(|_| tempfile::tempdir().unwrap())
+                    .collect();
+
+                // Normalize chunks: assign each External chunk a name of the
+                // form "N/<orig>" where N is a small bucket index (0..=3).
+                // Using only 4 buckets instead of a per-chunk unique counter
+                // means some chunks will intentionally share (dir_idx, name).
+                //
+                // When two chunks share a (dir_idx, name) key, the *first*
+                // chunk's content wins: its bytes are written to disk and both
+                // references reconstruct the same content, so the expected
+                // output stays well-defined.  This exercises the pread-from-
+                // offset-0 restart property (same file opened twice).
+                #[derive(Debug)]
+                enum ResolvedChunk {
+                    Metadata(Vec<u8>),
+                    InlineData(Vec<u8>),
+                    FileBackedData { dir_idx: usize, unique_name: Vec<u8>, content: Vec<u8> },
+                }
+
+                // Map (dir_idx, unique_name) -> first-seen content.
+                let mut content_map: HashMap<(usize, Vec<u8>), Vec<u8>> = HashMap::new();
+
+                let mut resolved: Vec<ResolvedChunk> = Vec::with_capacity(chunks.len());
+                let mut ext_counter = 0usize;
+                for chunk in &chunks {
+                    match chunk {
+                        TestChunk::Metadata(data) => {
+                            resolved.push(ResolvedChunk::Metadata(data.clone()));
+                        }
+                        TestChunk::InlineData(data) => {
+                            resolved.push(ResolvedChunk::InlineData(data.clone()));
+                        }
+                        TestChunk::FileBackedData { dirfd_index, filename, content } => {
+                            let dir_idx = dirfd_index % num_dirs;
+                            // Flatten any '/' in the generated name so a file
+                            // path can never be a *prefix* of another file path
+                            // (which on disk would require the prefix to be both
+                            // a regular file and a directory, an impossible
+                            // fixture that yields ENOTDIR). The library itself
+                            // supports nested paths; this only keeps the
+                            // on-disk test fixtures internally consistent.
+                            let orig = std::str::from_utf8(filename)
+                                .unwrap()
+                                .replace('/', "_");
+                            // Bucket index cycles through 0..=3 so collisions
+                            // happen with probability ~(1 - 3/4^k) for k chunks,
+                            // exercising the "same file referenced twice" path.
+                            let bucket = ext_counter % 4;
+                            ext_counter += 1;
+                            let unique_name =
+                                format!("{bucket}/{orig}").into_bytes();
+                            // Canonical content: first writer wins.
+                            let key = (dir_idx, unique_name.clone());
+                            let canonical = content_map
+                                .entry(key)
+                                .or_insert_with(|| content.clone())
+                                .clone();
+                            resolved.push(ResolvedChunk::FileBackedData {
+                                dir_idx,
+                                unique_name,
+                                content: canonical,
+                            });
+                        }
+                    }
+                }
+
+                // Materialize external files on disk (write each unique path once;
+                // duplicates are skipped because the file already exists).
+                for chunk in &resolved {
+                    if let ResolvedChunk::FileBackedData { dir_idx, unique_name, content } = chunk {
+                        let path = tmpdirs[*dir_idx].path().join(
+                            std::str::from_utf8(unique_name).unwrap()
+                        );
+                        if !path.exists() {
+                            if let Some(parent) = path.parent() {
+                                std::fs::create_dir_all(parent).unwrap();
+                            }
+                            std::fs::write(&path, content).unwrap();
+                        }
+                    }
+                }
+
+                // Build the stream and expected output.
+                let mut stream_buf = Vec::new();
+                let mut expected_output: Vec<u8> = Vec::new();
+                {
+                    let mut w = SplitdirfdstreamWriter::new(&mut stream_buf);
+                    for chunk in &resolved {
+                        match chunk {
+                            ResolvedChunk::Metadata(data) => {
+                                w.write_metadata(data).unwrap();
+                                expected_output.extend_from_slice(data);
+                            }
+                            ResolvedChunk::InlineData(data) => {
+                                w.write_inline_data(data).unwrap();
+                                expected_output.extend_from_slice(data);
+                            }
+                            ResolvedChunk::FileBackedData { dir_idx, unique_name, content } => {
+                                let len = content.len() as u64;
+                                w.write_file_backed_data(*dir_idx as u32, len, unique_name).unwrap();
+                                expected_output.extend_from_slice(content);
+                            }
+                        }
+                    }
+                    w.finish().unwrap();
+                }
+
+                // Open dir fds.
+                let dir_fds: Vec<OwnedFd> = tmpdirs
+                    .iter()
+                    .map(|d| open_dir(d.path()))
+                    .collect();
+                let borrowed: Vec<BorrowedFd<'_>> = dir_fds.iter().map(|fd| fd.as_fd()).collect();
+
+                // Verify chunk sequence matches what we wrote.
+                {
+                    let mut reader = SplitdirfdstreamReader::new(stream_buf.as_slice());
+                    let mut res_iter = resolved.iter();
+                    while let Some(chunk) = reader.next_chunk().unwrap() {
+                        let expected = res_iter.next().unwrap();
+                        match (&chunk, expected) {
+                            (Chunk::Metadata(data), ResolvedChunk::Metadata(exp)) => {
+                                prop_assert_eq!(*data, exp.as_slice());
+                            }
+                            (Chunk::InlineData(data), ResolvedChunk::InlineData(exp)) => {
+                                prop_assert_eq!(*data, exp.as_slice());
+                            }
+                            (
+                                Chunk::FileBackedData { dirfd_index, length, filename },
+                                ResolvedChunk::FileBackedData { dir_idx, unique_name, content },
+                            ) => {
+                                prop_assert_eq!(*dirfd_index, *dir_idx as u32);
+                                prop_assert_eq!(*length, content.len() as u64);
+                                prop_assert_eq!(*filename, unique_name.as_slice());
+                            }
+                            _ => {
+                                return Err(TestCaseError::fail("chunk type mismatch"));
+                            }
+                        }
+                    }
+                }
+
+                // Verify reconstruction produces the expected concatenation.
+                let mut out = Vec::new();
+                reconstruct(stream_buf.as_slice(), &borrowed, &mut out).unwrap();
+                prop_assert_eq!(out, expected_output);
+            }
+        }
+    }
+}
diff --git a/crates/composefs-splitdirfdstream/src/transport.rs b/crates/composefs-splitdirfdstream/src/transport.rs
new file mode 100644
index 00000000..cd3cbfd6
--- /dev/null
+++ b/crates/composefs-splitdirfdstream/src/transport.rs
@@ -0,0 +1,663 @@
+//! Source-agnostic FD-transport mechanics for the splitdirfdstream wire protocol.
+//!
+//! This module contains the pure, Storage/Layer-independent helpers that govern
+//! *how* a set of file descriptors is partitioned into transport frames and how
+//! the sparse dirfd-slot layout is computed.  Both the `composefs-storage`
+//! layer-transfer producer and future producers (e.g. a composefs-repo producer)
+//! can share these primitives without pulling in any higher-level dependencies.
+//!
+//! # Public API
+//!
+//! | Item | Role |
+//! |------|------|
+//! | [`MAX_FDS_PER_FRAME`] | Enforced per-frame fd cap (240, safely below 253). |
+//! | [`seed_from_id`] | Derive a deterministic `u64` seed from an opaque string id. |
+//! | [`open_devnull`] | Open `/dev/null` for use as a dummy fd. |
+//! | [`FdLimitError`] | Error from [`split_fds_into_frames`] when `more=false` overflows. |
+//! | [`LayerFdLayout`] | Complete assembled wire layout from [`build_layer_fd_layout`]. |
+//! | [`build_layer_fd_layout`] | Assemble the full sparse dirfd+keepalive+lifetime fd array. |
+//! | [`split_fds_into_frames`] | Partition fds into frames, enforcing `more=false` cap. |
+//! | [`spawn_self_reaping_producer`] | Spawn the 3-phase keepalive-lock producer task (requires `tokio` feature). |
+//!
+//! The producer-side seed jitter (sparse-slot placement, frame count, extra
+//! lifetime fds) is computed by crate-internal helpers (`sparse_dir_slots`,
+//! `compute_n_frames`, `n_extra_lifetime_fds`); the consumer never recomputes
+//! it, since every `FileBackedData` chunk carries an explicit `dirfd_index`.
+
+use std::os::fd::OwnedFd;
+
+use rand::seq::SliceRandom as _;
+use rand_pcg::Pcg64;
+use rand_pcg::rand_core::SeedableRng as _;
+
+// ── Constants ────────────────────────────────────────────────────────────────
+
+/// Our enforced per-frame fd cap.
+///
+/// Kept safely below the hard kernel limit of 253 fds per `sendmsg(SCM_RIGHTS)`
+/// call.  Every transport frame carries at most this many fds.
+pub const MAX_FDS_PER_FRAME: usize = 240;
+
+// ── Frame splitting ───────────────────────────────────────────────────────────
+
+/// Partition `fds` into `n_frames` contiguous batches as evenly as possible.
+///
+/// The first `fds.len() % n_frames` batches receive one extra fd (so all
+/// batches are non-empty when `n_frames <= fds.len()`).  Ordering is preserved:
+/// concatenating the returned batches in order reconstructs the original vec.
+///
+/// # Panics
+/// Panics if `n_frames == 0` or `n_frames > fds.len()`.
+pub(crate) fn split_into_frames(fds: Vec<OwnedFd>, n_frames: usize) -> Vec<Vec<OwnedFd>> {
+    assert!(n_frames > 0 && n_frames <= fds.len());
+    let fd_count = fds.len();
+    let base = fd_count / n_frames;
+    let remainder = fd_count % n_frames;
+    let mut result = Vec::with_capacity(n_frames);
+    let mut it = fds.into_iter();
+    for i in 0..n_frames {
+        let batch_size = base + if i < remainder { 1 } else { 0 };
+        result.push(it.by_ref().take(batch_size).collect());
+    }
+    result
+}
+
+// ── Seed derivation ───────────────────────────────────────────────────────────
+
+/// Derive a deterministic `u64` seed from an opaque string identifier.
+///
+/// Uses the first 8 bytes of SHA-256(id) interpreted as little-endian u64.
+/// The result is stable across runs for the same id string.
+pub fn seed_from_id(id: &str) -> u64 {
+    use sha2::Digest as _;
+    let hash = sha2::Sha256::digest(id.as_bytes());
+    u64::from_le_bytes(hash[..8].try_into().expect("sha256 is at least 8 bytes"))
+}
+
+// ── Frame-count helpers ───────────────────────────────────────────────────────
+
+/// Compute the *hash-derived* minimum frame count for `fd_count` FDs keyed by `seed`.
+///
+/// - If `fd_count < 3`: returns `max(fd_count, 1)` (never more frames than FDs,
+///   but always at least one so `split_into_frames` has a valid divisor).  This
+///   branch is effectively dead: a real transport array is always ≥4 FDs
+///   (pipe + ≥2 dirfd slots + keepalive).
+/// - Otherwise: `max(3, seed % (fd_count + 1))` clamped to `fd_count`.
+///
+/// This guarantees ≥3 frames whenever there are ≥3 FDs, which exercises the
+/// multi-frame path in tests while keeping the arithmetic deterministic.
+///
+/// **This is the hash-min component only.**  The call site additionally enforces
+/// a per-frame cap ([`MAX_FDS_PER_FRAME`]) by taking `hash_min.max(cap_min)` where
+/// `cap_min = ceil(fd_count / MAX_FDS_PER_FRAME)`.  For small layers
+/// (≤`MAX_FDS_PER_FRAME` fds) `cap_min = 1` so the hash-min always wins,
+/// preserving test-hardening behaviour.
+pub(crate) fn n_frames_for(seed: u64, fd_count: usize) -> usize {
+    if fd_count < 3 {
+        return fd_count.max(1);
+    }
+    let raw = (seed % (fd_count as u64 + 1)) as usize;
+    raw.max(3).min(fd_count)
+}
+
+/// Compute the actual number of transport frames for a `more=true` call.
+///
+/// Returns the larger of:
+/// - the hash-derived minimum ([`n_frames_for`]`(seed, fd_count)`, ≥3 for real
+///   layers), which exercises multi-frame paths in tests; and
+/// - `ceil(fd_count / MAX_FDS_PER_FRAME)`, the minimum needed so every frame
+///   carries at most [`MAX_FDS_PER_FRAME`] fds (kernel SCM_RIGHTS cap).
+///
+/// The result is additionally clamped to `[1, fd_count]` so `split_into_frames`
+/// never receives an out-of-range value.
+///
+/// Proved invariant: `fd_count.div_ceil(n) <= MAX_FDS_PER_FRAME` for the
+/// returned `n`, because `n >= ceil(fd_count / MAX_FDS_PER_FRAME)`.
+pub(crate) fn compute_n_frames(seed: u64, fd_count: usize) -> usize {
+    let hash_min = n_frames_for(seed, fd_count);
+    let cap_min = fd_count.div_ceil(MAX_FDS_PER_FRAME);
+    hash_min.max(cap_min).min(fd_count).max(1)
+}
+
+// ── Dummy-fd helpers ──────────────────────────────────────────────────────────
+
+/// Open `/dev/null` as an opaque dummy fd.
+///
+/// Returns a plain [`std::io::Result`] so callers can map it to their own error type.
+pub fn open_devnull() -> std::io::Result<OwnedFd> {
+    rustix::fs::open(
+        c"/dev/null",
+        rustix::fs::OFlags::RDONLY,
+        rustix::fs::Mode::empty(),
+    )
+    .map_err(std::io::Error::from)
+}
+
+/// Open an anonymous `memfd` as an opaque dummy fd.
+///
+/// Returns a plain [`std::io::Result`] so callers can map it to their own error type.
+pub(crate) fn open_memfd() -> std::io::Result<OwnedFd> {
+    rustix::fs::memfd_create(
+        c"composefs-layer-transfer-dummy",
+        rustix::fs::MemfdFlags::CLOEXEC,
+    )
+    .map_err(std::io::Error::from)
+}
+
+// ── Sparse-slot layout ────────────────────────────────────────────────────────
+
+/// Describes the sparse dirfd-slot layout produced by [`sparse_dir_slots`].
+///
+/// The `dirfds` array passed to the consumer has length `total_slots` and contains
+/// real diff-dir fds at `real_slot_indices[i]` (for chain layer `i`) and dummy fds
+/// at all other positions.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub(crate) struct SparseLayout {
+    /// Total number of slots in the dirfds array (real + dummy).
+    pub(crate) total_slots: usize,
+    /// Ascending slot indices assigned to real layers.
+    ///
+    /// `real_slot_indices[i]` is the slot index for chain layer `i`.
+    pub(crate) real_slot_indices: Vec<u32>,
+    /// Slot indices that are dummy (complement of `real_slot_indices` in `0..total_slots`).
+    pub(crate) dummy_slot_indices: Vec<u32>,
+}
+
+/// Compute the sparse slot assignment for `n_real` real layers using `seed`.
+///
+/// This is a pure function (no I/O) that encodes the deterministic slot-placement
+/// algorithm:
+///
+/// 1. `n_dummy = (seed >> 8) % (n_real + 1) + 1`  (at least 1 dummy).
+/// 2. `total_slots = n_real + n_dummy`.
+/// 3. Deterministically shuffle `0..total_slots` with a `seed`-seeded PCG RNG,
+///    take the first `n_real` as real-layer slot indices, sort them ascending.
+/// 4. Dummy slots are `0..total_slots` minus the real indices.
+///
+/// The shuffle uses [`Pcg64`], whose algorithm is stable across `rand_pcg`
+/// releases, so the placement is reproducible from the seed (useful for tests).
+/// Reproducibility is *not* a correctness requirement: the consumer is driven
+/// entirely by the explicit `dirfd_index` in each `FileBackedData` chunk and
+/// never recomputes this layout.
+///
+/// The returned [`SparseLayout`] records `total_slots`, `real_slot_indices`
+/// (ascending, length `n_real`), and `dummy_slot_indices` (ascending).
+pub(crate) fn sparse_dir_slots(n_real: usize, seed: u64) -> SparseLayout {
+    // ── 1/2: total slot count (at least 1 dummy) ─────────────────────────────
+    let n_dummy: usize = ((seed >> 8) % (n_real as u64 + 1) + 1) as usize;
+    let total_slots = n_real + n_dummy;
+
+    // ── 3: select real-layer slot positions ──────────────────────────────────
+    // Deterministically shuffle all slot indices from the seed, take the first
+    // n_real as the real-layer positions, then sort them ascending.
+    let mut rng = Pcg64::seed_from_u64(seed);
+    let mut shuffled_slots: Vec<u32> = (0..total_slots as u32).collect();
+    shuffled_slots.shuffle(&mut rng);
+    let mut real_indices: Vec<u32> = shuffled_slots[..n_real].to_vec();
+    real_indices.sort_unstable();
+
+    // ── 4: compute dummy indices (complement) ─────────────────────────────────
+    let real_set: std::collections::HashSet<u32> = real_indices.iter().copied().collect();
+    let dummy_indices: Vec<u32> = (0..total_slots as u32)
+        .filter(|i| !real_set.contains(i))
+        .collect();
+
+    SparseLayout {
+        total_slots,
+        real_slot_indices: real_indices,
+        dummy_slot_indices: dummy_indices,
+    }
+}
+
+// ── Lifetime dummy count ──────────────────────────────────────────────────────
+
+/// Compute the number of extra opaque lifetime dummy fds for a given seed.
+///
+/// Returns 0, 1, or 2 — derived as `(seed >> 16) % 3`.  These are additional
+/// dummy fds (beyond the keepalive pipe write-end) that the client must hold open
+/// until it has finished reading the splitdirfdstream, then close.  Alternating
+/// `/dev/null` and `memfd` fd kinds exercise transparent client pass-through.
+pub(crate) fn n_extra_lifetime_fds(seed: u64) -> usize {
+    ((seed >> 16) % 3) as usize
+}
+
+// ── High-level fd-layout helpers ─────────────────────────────────────────────
+
+/// Error returned by [`split_fds_into_frames`] when `more=false` and the fd
+/// count exceeds [`MAX_FDS_PER_FRAME`].
+///
+/// The caller must map this to their interface error and instruct the client to
+/// retry with `more=true`.
+#[derive(Debug, thiserror::Error)]
+#[error("fd count {fd_count} exceeds per-frame limit {max_per_frame}; retry with more=true")]
+pub struct FdLimitError {
+    /// Total number of fds that would be sent.
+    pub fd_count: usize,
+    /// The per-frame cap that was exceeded.
+    pub max_per_frame: usize,
+}
+
+/// Assembled FD layout ready to wire to the client in one or more transport frames.
+///
+/// This is the output of [`build_layer_fd_layout`]: a complete description of
+/// every fd that will be sent across the socket, split into two groups:
+///
+/// * The **wire region** — everything that travels to the client:
+///   - `fds_all[0]` — pipe read end (carries the `splitdirfdstream` bytes).
+///   - `fds_all[1..=dir_count]` — dirfds region (sparse: real dirs + dummies).
+///   - `fds_all[dir_count+1..]` — lifetime region (keepalive write + extras).
+/// * The **keepalive read end** — retained on the server; when the client drops
+///   the keepalive write end the server sees EOF and may release resources.
+///
+/// The `real_indices` slice maps chain layer `i` → slot index within the
+/// *dirfds region* where that layer's real diff-dir fd sits.  Pass this to the
+/// producer (via [`build_layer_fd_layout`]'s return value) so it can write the
+/// correct `dirfd_index` into each `FileBackedData` chunk.
+#[derive(Debug)]
+pub struct LayerFdLayout {
+    /// All fds to send to the client, ordered as described above.
+    pub fds_all: Vec<OwnedFd>,
+    /// Number of slots in the dirfds region (`dir_count` in reply and in the
+    /// wire layout: `fds_all[1..=dir_count]`).
+    pub dir_count: u32,
+    /// Slot index within the dirfds region for each real layer (ascending).
+    ///
+    /// `real_indices[i]` is the `dirfd_index` the producer must use for chain
+    /// layer `i`.
+    pub real_indices: Vec<u32>,
+    /// Server-side read end of the keepalive pipe.
+    ///
+    /// Hold this `OwnedFd` until the client has finished consuming the layer
+    /// (or until the request is complete if keepalive is not needed for resource
+    /// management).  When it drops, the client's matching write end sees EOF.
+    pub keepalive_read: OwnedFd,
+}
+
+/// Build the complete wire fd layout for a producer serving `n_real` real diff
+/// directories.
+///
+/// This pure-ish (only I/O: `pipe`, `/dev/null`, `memfd`) function:
+///
+/// 1. Calls [`sparse_dir_slots`] to compute the sparse placement.
+/// 2. Opens dummy fds (`/dev/null` alternating with `memfd`) for gap slots.
+/// 3. Builds the dirfds region by interleaving real and dummy fds in slot order.
+/// 4. Creates a keepalive pipe; the write end goes into `fds_all`, the read end
+///    is returned in [`LayerFdLayout::keepalive_read`].
+/// 5. Adds `n_extra_lifetime_fds(seed)` extra dummy lifetime fds.
+/// 6. Prepends the `pipe_read` fd so `fds_all[0]` is always the data pipe.
+///
+/// The caller is responsible for:
+/// * Passing the correct `real_fds` (in chain order) — the first real_fd gets
+///   slot `real_indices[0]`, the second gets `real_indices[1]`, etc.
+/// * Spawning the producer with `write_fd` (not returned here; the caller opens
+///   the pipe and passes `write_fd` to the producer separately).
+///
+/// # Arguments
+/// * `pipe_read` — Read end of the data pipe (will become `fds_all[0]`).
+/// * `real_fds` — Pre-opened real diff-directory file descriptors, one per
+///   chain layer, in chain order.
+/// * `seed` — Deterministic seed (see [`seed_from_id`]).
+///
+/// # Errors
+/// Returns `io::Error` if any dummy-fd or keepalive-pipe creation fails.
+pub fn build_layer_fd_layout(
+    pipe_read: OwnedFd,
+    real_fds: Vec<OwnedFd>,
+    seed: u64,
+) -> std::io::Result<LayerFdLayout> {
+    let n_real = real_fds.len();
+    let layout = sparse_dir_slots(n_real, seed);
+    let total_slots = layout.total_slots;
+
+    // ── Build dirfds region: slot i = real fd or dummy fd ────────────────────
+    let mut dirfd_region: Vec<Option<OwnedFd>> = (0..total_slots).map(|_| None).collect();
+
+    // Place real fds at their assigned slots.
+    for (chain_idx, real_fd) in real_fds.into_iter().enumerate() {
+        let slot = layout.real_slot_indices[chain_idx] as usize;
+        dirfd_region[slot] = Some(real_fd);
+    }
+
+    // Fill dummy slots, alternating /dev/null and memfd.
+    for (dummy_ordinal, &dummy_slot) in layout.dummy_slot_indices.iter().enumerate() {
+        let dummy_fd = if dummy_ordinal % 2 == 0 {
+            open_devnull()?
+        } else {
+            open_memfd()?
+        };
+        dirfd_region[dummy_slot as usize] = Some(dummy_fd);
+    }
+
+    // Unwrap: every slot was filled above.
+    let dirfd_region: Vec<OwnedFd> = dirfd_region.into_iter().map(|o| o.unwrap()).collect();
+
+    // ── Keepalive pipe ────────────────────────────────────────────────────────
+    let (keepalive_read, keepalive_write) =
+        rustix::pipe::pipe_with(rustix::pipe::PipeFlags::CLOEXEC).map_err(std::io::Error::from)?;
+
+    // ── Extra lifetime dummy fds ──────────────────────────────────────────────
+    let n_extra = n_extra_lifetime_fds(seed);
+    let mut extra_lifetime: Vec<OwnedFd> = Vec::with_capacity(n_extra);
+    for i in 0..n_extra {
+        let fd = if i % 2 == 0 {
+            open_devnull()?
+        } else {
+            open_memfd()?
+        };
+        extra_lifetime.push(fd);
+    }
+
+    // ── Assemble fds_all ──────────────────────────────────────────────────────
+    //   index 0              : pipe read
+    //   index 1..=dir_count  : dirfds region
+    //   index dir_count+1    : keepalive write end
+    //   remaining            : extra lifetime dummies
+    let dir_count = total_slots as u32;
+    let mut fds_all: Vec<OwnedFd> = Vec::with_capacity(1 + total_slots + 1 + n_extra);
+    fds_all.push(pipe_read);
+    fds_all.extend(dirfd_region);
+    fds_all.push(keepalive_write);
+    fds_all.extend(extra_lifetime);
+
+    Ok(LayerFdLayout {
+        fds_all,
+        dir_count,
+        real_indices: layout.real_slot_indices,
+        keepalive_read,
+    })
+}
+
+/// Partition a complete fd array into transport frames.
+///
+/// When `more` is `true` the fds are split across `compute_n_frames(seed, n)`
+/// batches (≥3 for test hardening, cap-limited to ≤[`MAX_FDS_PER_FRAME`] per
+/// frame).
+///
+/// When `more` is `false` all fds must fit in a single frame.  If the total
+/// count exceeds [`MAX_FDS_PER_FRAME`] the fds are returned intact inside an
+/// [`Err(FdLimitError)`] so they can be dropped cleanly by the caller before
+/// returning an error reply.
+///
+/// # Returns
+/// `Ok(Vec<Vec<OwnedFd>>)` — one inner vec per transport frame (1…N frames).
+/// `Err(FdLimitError { fds, .. })` — the original fds are returned so the
+/// caller can drop them; the error contains the count and cap for a diagnostic.
+pub fn split_fds_into_frames(
+    fds: Vec<OwnedFd>,
+    seed: u64,
+    more: bool,
+) -> Result<Vec<Vec<OwnedFd>>, (Vec<OwnedFd>, FdLimitError)> {
+    let fd_count = fds.len();
+    if !more && fd_count > MAX_FDS_PER_FRAME {
+        return Err((
+            fds,
+            FdLimitError {
+                fd_count,
+                max_per_frame: MAX_FDS_PER_FRAME,
+            },
+        ));
+    }
+    let n = if more {
+        compute_n_frames(seed, fd_count)
+    } else {
+        1
+    };
+    Ok(split_into_frames(fds, n))
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Self-reaping producer task (optional: requires `tokio` feature)
+// ─────────────────────────────────────────────────────────────────────────────
+
+/// Spawn a self-reaping blocking producer task that implements the 3-phase
+/// keepalive-lock protocol used by both the repo and cstor `GetLayer`
+/// implementations.
+///
+/// # Phases
+///
+/// 1. **Produce**: call `produce(write_fd_as_File)`.  When `produce` returns the
+///    write end of the data pipe drops, signalling data-EOF to the consumer.
+/// 2. **Keepalive wait**: read from `keepalive_read` until EOF.  The consumer
+///    drops its keepalive write end after it finishes draining the data pipe,
+///    so this phase acts as a rendezvous before releasing any held resources.
+/// 3. **Release**: drop `guard`.  For the repo path `guard` is `()` (no-op);
+///    for the cstor path `guard` is a `LayerStoreLock` whose drop releases the
+///    shared flock.
+///
+/// # Ordering guarantee
+///
+/// Phase 2 begins **after** `write_fd` is dropped (end of Phase 1), so the
+/// data pipe is always at EOF before we start waiting on the keepalive.  The
+/// reverse would cause a deadlock: if the consumer must drain to EOF to drop
+/// the keepalive but the producer never finishes, the pipe never EOF.
+///
+/// # Arguments
+///
+/// * `write_fd` — Write end of the data pipe; moved into the closure and
+///   dropped at the end of Phase 1.
+/// * `keepalive_read` — Read end of the keepalive pipe; retained until Phase 2
+///   EOF, then dropped.
+/// * `guard` — Opaque lifetime guard released in Phase 3 (e.g. a lock).
+/// * `produce` — Called with a `std::fs::File` wrapping `write_fd`.  Should
+///   write the complete `splitdirfdstream` into the file and return.  Errors
+///   are logged as warnings; a short/corrupt stream will be detected later by
+///   the consumer's integrity check.
+#[cfg(feature = "tokio")]
+pub fn spawn_self_reaping_producer(
+    write_fd: OwnedFd,
+    keepalive_read: OwnedFd,
+    guard: impl Send + 'static,
+    produce: impl FnOnce(std::fs::File) + Send + 'static,
+) {
+    tokio::task::spawn_blocking(move || {
+        // Phase 1: produce into write_fd; drop write_fd (data-pipe EOF).
+        {
+            let wf = std::fs::File::from(write_fd);
+            produce(wf);
+            // write_fd drops here → data-pipe EOF visible to consumer.
+        }
+
+        // Phase 2: wait for consumer to signal completion via keepalive EOF.
+        let mut buf = [0u8; 64];
+        loop {
+            match rustix::io::read(&keepalive_read, &mut buf) {
+                Ok(0) => break,                    // EOF — consumer dropped its write end
+                Ok(_) => {}                        // unexpected bytes — drain and continue
+                Err(rustix::io::Errno::INTR) => {} // EINTR — retry
+                Err(_) => break,                   // other error — bail
+            }
+        }
+
+        // Phase 3: release guard (e.g. drop the shared flock).
+        drop(guard);
+    });
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Tests
+// ─────────────────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Seed for "child-layer-001" (SHA-256("child-layer-001")[0..8] as LE-u64).
+    ///
+    /// Verified: `seed_from_id("child-layer-001") == LAYER_SEED`.
+    const LAYER_SEED: u64 = 9_551_015_030_439_334_514;
+
+    /// `compute_n_frames` must ensure every frame holds ≤ MAX_FDS_PER_FRAME fds
+    /// for a large synthetic fd count where the hash-min alone would be too small.
+    #[test]
+    fn test_compute_n_frames_caps_large_fd_count() {
+        // Use a seed whose hash-min is small (e.g. seed=0 → raw=0 → max(3,0)=3).
+        let seed: u64 = 0;
+        // Choose fd_count large enough that 3 frames would exceed 240 each.
+        // ceil(1000 / 3) = 334 > 240, so cap_min must win.
+        let fd_count: usize = 1000;
+
+        let n = compute_n_frames(seed, fd_count);
+
+        // Invariant: every frame carries at most MAX_FDS_PER_FRAME fds.
+        let max_frame_size = fd_count.div_ceil(n);
+        assert!(
+            max_frame_size <= MAX_FDS_PER_FRAME,
+            "frame size {max_frame_size} exceeds cap {MAX_FDS_PER_FRAME} (n={n}, fd_count={fd_count})",
+        );
+
+        // Also verify n is large enough: ceil(1000/240) = 5.
+        let cap_min = fd_count.div_ceil(MAX_FDS_PER_FRAME);
+        assert!(n >= cap_min, "n={n} < cap_min={cap_min}",);
+    }
+
+    /// For small fd counts (≤ MAX_FDS_PER_FRAME) the hash-min should still
+    /// dominate so test-hardening (≥3 frames for real layers) is preserved.
+    #[test]
+    fn test_compute_n_frames_small_fd_count_preserves_hash_min() {
+        // The existing test layer has 9 fds → cap_min=1, hash_min=4.
+        let n = compute_n_frames(LAYER_SEED, 9);
+        // EXPECTED_N_FRAMES = 4 for "child-layer-001"
+        assert_eq!(n, 4, "hash_min should win for small fd counts");
+
+        // Any fd count ≤ MAX_FDS_PER_FRAME has cap_min=1; hash_min (≥3) always wins.
+        for fd_count in 3..=MAX_FDS_PER_FRAME {
+            let n = compute_n_frames(LAYER_SEED, fd_count);
+            let max_frame_size = fd_count.div_ceil(n);
+            assert!(
+                max_frame_size <= MAX_FDS_PER_FRAME,
+                "frame size {max_frame_size} > cap for fd_count={fd_count}",
+            );
+        }
+    }
+
+    /// `compute_n_frames` invariant holds across a sweep of large fd counts.
+    #[test]
+    fn test_compute_n_frames_cap_invariant_sweep() {
+        let seeds: &[u64] = &[0, 1, 42, LAYER_SEED, u64::MAX];
+        let fd_counts: &[usize] = &[
+            MAX_FDS_PER_FRAME,
+            MAX_FDS_PER_FRAME + 1,
+            MAX_FDS_PER_FRAME * 2,
+            MAX_FDS_PER_FRAME * 5,
+            1000,
+            5000,
+        ];
+        for &seed in seeds {
+            for &fd_count in fd_counts {
+                let n = compute_n_frames(seed, fd_count);
+                let max_frame_size = fd_count.div_ceil(n);
+                assert!(
+                    max_frame_size <= MAX_FDS_PER_FRAME,
+                    "invariant violated: seed={seed} fd_count={fd_count} n={n} \
+                     max_frame_size={max_frame_size} cap={MAX_FDS_PER_FRAME}",
+                );
+            }
+        }
+    }
+
+    /// A non-streaming (`more=false`) call that would exceed MAX_FDS_PER_FRAME
+    /// must be detected by the over-cap check before the producer is spawned.
+    ///
+    /// We test the pure decision logic (`!more && fd_count > MAX_FDS_PER_FRAME`)
+    /// without opening real fds, keeping the test deterministic and cheap.
+    #[test]
+    fn test_more_false_over_cap_decision() {
+        // Verify the threshold is exactly MAX_FDS_PER_FRAME.
+        assert!(
+            !(!false && MAX_FDS_PER_FRAME > MAX_FDS_PER_FRAME),
+            "fd_count == cap should NOT trigger error",
+        );
+        assert!(
+            !false && (MAX_FDS_PER_FRAME + 1) > MAX_FDS_PER_FRAME,
+            "fd_count == cap+1 MUST trigger error",
+        );
+
+        // Simulate: if !more and fd_count > MAX_FDS_PER_FRAME → error.
+        let should_error =
+            |more: bool, fd_count: usize| -> bool { !more && fd_count > MAX_FDS_PER_FRAME };
+
+        assert!(
+            !should_error(true, MAX_FDS_PER_FRAME + 1),
+            "more=true should never error on fd count"
+        );
+        assert!(
+            !should_error(false, MAX_FDS_PER_FRAME),
+            "more=false at exactly cap should not error"
+        );
+        assert!(
+            should_error(false, MAX_FDS_PER_FRAME + 1),
+            "more=false over cap must error"
+        );
+        assert!(
+            should_error(false, 1000),
+            "more=false with 1000 fds must error"
+        );
+    }
+
+    /// `seed_from_id` must match the precomputed value for "child-layer-001".
+    #[test]
+    fn test_seed_from_id_child_layer() {
+        assert_eq!(
+            seed_from_id("child-layer-001"),
+            LAYER_SEED,
+            "seed_from_id must match precomputed SHA-256-derived value"
+        );
+    }
+
+    /// `sparse_dir_slots` must produce a valid, seed-reproducible layout.
+    ///
+    /// The dummy count derives from the seed independently of placement, so for
+    /// "child-layer-001" (seed = 9551015030439334514): `n_dummy = (seed >> 8) %
+    /// (2+1) + 1 = 3`, hence `total_slots = 5`.  Real-slot *placement* comes from
+    /// a `Pcg64` shuffle: we assert the structural invariants plus determinism
+    /// (same seed → same layout) rather than pinning the exact slot indices.
+    #[test]
+    fn test_sparse_dir_slots_known_layout() {
+        let layout = sparse_dir_slots(2, LAYER_SEED);
+
+        // Seed-derived counts (independent of the shuffle).
+        assert_eq!(layout.total_slots, 5, "total_slots must be 5");
+        assert_eq!(layout.real_slot_indices.len(), 2, "two real layers");
+        assert_eq!(layout.dummy_slot_indices.len(), 3, "three dummy slots");
+
+        // Structural invariants: both lists ascending, disjoint, and together
+        // covering exactly 0..total_slots.
+        assert!(
+            layout.real_slot_indices.windows(2).all(|w| w[0] < w[1]),
+            "real_slot_indices ascending"
+        );
+        assert!(
+            layout.dummy_slot_indices.windows(2).all(|w| w[0] < w[1]),
+            "dummy_slot_indices ascending"
+        );
+        let mut all: Vec<u32> = layout
+            .real_slot_indices
+            .iter()
+            .chain(&layout.dummy_slot_indices)
+            .copied()
+            .collect();
+        all.sort_unstable();
+        assert_eq!(
+            all,
+            (0..layout.total_slots as u32).collect::<Vec<_>>(),
+            "real and dummy slots must partition 0..total_slots"
+        );
+
+        // Determinism: the Pcg64 shuffle is reproducible from the seed.
+        assert_eq!(
+            sparse_dir_slots(2, LAYER_SEED),
+            layout,
+            "same seed must yield the same layout"
+        );
+    }
+
+    /// `n_extra_lifetime_fds` must return 2 for "child-layer-001"'s seed.
+    #[test]
+    fn test_n_extra_lifetime_fds_child_layer() {
+        assert_eq!(
+            n_extra_lifetime_fds(LAYER_SEED),
+            2,
+            "n_extra_lifetime_fds must be 2 for child-layer-001 seed"
+        );
+    }
+}
diff --git a/crates/composefs-splitfdstream/Cargo.toml b/crates/composefs-splitfdstream/Cargo.toml
deleted file mode 100644
index 1aba5988..00000000
--- a/crates/composefs-splitfdstream/Cargo.toml
+++ /dev/null
@@ -1,22 +0,0 @@
-[package]
-name = "composefs-splitfdstream"
-description = "Binary format for serializing data with external file descriptor references"
-keywords = ["splitfdstream", "fd", "serialization"]
-publish = false
-
-edition.workspace = true
-license.workspace = true
-readme.workspace = true
-repository.workspace = true
-rust-version.workspace = true
-version.workspace = true
-
-[dependencies]
-rustix = { version = "1.0.0", default-features = false, features = ["fs", "std"] }
-
-[dev-dependencies]
-proptest = "1"
-tempfile = { version = "3.8.0", default-features = false }
-
-[lints]
-workspace = true
diff --git a/crates/composefs-splitfdstream/src/lib.rs b/crates/composefs-splitfdstream/src/lib.rs
deleted file mode 100644
index 1c575868..00000000
--- a/crates/composefs-splitfdstream/src/lib.rs
+++ /dev/null
@@ -1,1328 +0,0 @@
-//! Split file descriptor stream format for serializing binary data with external chunks.
-//!
-//! This module implements a binary format for representing serialized binary files
-//! (tar archives, zip files, filesystem images, etc.) where data chunks can be stored
-//! externally as file descriptors rather than inline in the stream.
-//!
-//! # Format Overview
-//!
-//! A splitfdstream is a sequential stream of chunks. Each chunk begins with a signed
-//! 64-bit little-endian prefix that determines the chunk type:
-//!
-//! | Prefix Value | Meaning |
-//! |--------------|---------|
-//! | `< 0`        | **Inline**: The next `abs(prefix)` bytes are literal data |
-//! | `>= 0`       | **External**: Content comes from `fd[prefix + 1]` |
-//!
-//! # Use Cases
-//!
-//! The splitfdstream format is designed for scenarios where:
-//!
-//! - Large binary files need to be transferred with some data stored externally
-//! - File descriptors can be passed alongside the stream (e.g., via Unix sockets)
-//! - Deduplication is desired by referencing the same external fd multiple times
-//! - Zero-copy operations are possible by referencing files directly
-//!
-//! # Example
-//!
-//! ```
-//! use composefs_splitfdstream::{SplitfdstreamWriter, SplitfdstreamReader, Chunk};
-//!
-//! // Write a stream with mixed inline and external chunks
-//! let mut buffer = Vec::new();
-//! let mut writer = SplitfdstreamWriter::new(&mut buffer);
-//! writer.write_inline(b"inline data").unwrap();
-//! writer.write_external(0).unwrap();  // Reference fd[1]
-//! writer.write_inline(b"more inline").unwrap();
-//! writer.finish().unwrap();
-//!
-//! // Read the stream back
-//! let mut reader = SplitfdstreamReader::new(buffer.as_slice());
-//! while let Some(chunk) = reader.next_chunk().unwrap() {
-//!     match chunk {
-//!         Chunk::Inline(data) => println!("Inline: {} bytes", data.len()),
-//!         Chunk::External(fd_index) => println!("External: fd[{}]", fd_index + 1),
-//!     }
-//! }
-//! ```
-//!
-//! # Wire Format Details
-//!
-//! The stream consists of a sequence of chunks with no framing header or footer.
-//! Each chunk is:
-//!
-//! 1. An 8-byte signed little-endian integer (the prefix)
-//! 2. For inline chunks only: `abs(prefix)` bytes of literal data
-//!
-//! External chunks have no additional data after the prefix; the content is
-//! retrieved from the file descriptor array passed alongside the stream.
-
-// This is a library: emit diagnostics via the `log` crate (or return them),
-// never by writing to the process's stdout/stderr. Genuinely-intentional
-// exceptions carry a local `#[allow]` with justification. Test code is exempt.
-#![cfg_attr(not(test), deny(clippy::print_stdout, clippy::print_stderr))]
-
-use std::io::{self, Read, Write};
-#[cfg(test)]
-use std::os::fd::AsFd;
-
-/// Maximum size for an inline chunk (256 MB).
-///
-/// This limit prevents denial-of-service attacks where a malicious stream
-/// could specify an extremely large inline chunk size, causing unbounded
-/// memory allocation.
-pub const MAX_INLINE_CHUNK_SIZE: usize = 256 * 1024 * 1024;
-
-/// A chunk read from a splitfdstream.
-///
-/// Chunks are either inline data embedded in the stream, or references to
-/// external file descriptors that should be read separately.
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum Chunk<'a> {
-    /// Inline data embedded directly in the stream.
-    Inline(&'a [u8]),
-
-    /// Reference to an external file descriptor.
-    ///
-    /// The value is the fd index (0-based), meaning the actual fd is at
-    /// position `fd_index + 1` in the fd array (fd\[0\] is typically the
-    /// stream itself).
-    External(u32),
-}
-
-/// Writer for building a splitfdstream.
-///
-/// The writer encodes inline data and external fd references into the
-/// splitfdstream binary format.
-///
-/// # Example
-///
-/// ```
-/// use composefs_splitfdstream::{SplitfdstreamWriter, SplitfdstreamReader, Chunk};
-///
-/// // Write a stream with mixed inline and external chunks
-/// let mut buffer = Vec::new();
-/// let mut writer = SplitfdstreamWriter::new(&mut buffer);
-/// writer.write_inline(b"inline data").unwrap();
-/// writer.write_external(0).unwrap();  // Reference fd[1]
-/// writer.write_inline(b"more inline").unwrap();
-/// writer.finish().unwrap();
-///
-/// // Read the stream back
-/// let mut reader = SplitfdstreamReader::new(buffer.as_slice());
-/// while let Some(chunk) = reader.next_chunk().unwrap() {
-///     match chunk {
-///         Chunk::Inline(data) => println!("Inline: {} bytes", data.len()),
-///         Chunk::External(fd_index) => println!("External: fd[{}]", fd_index + 1),
-///     }
-/// }
-/// ```
-#[derive(Debug)]
-pub struct SplitfdstreamWriter<W> {
-    writer: W,
-}
-
-impl<W: Write> SplitfdstreamWriter<W> {
-    /// Create a new splitfdstream writer wrapping the given writer.
-    pub fn new(writer: W) -> Self {
-        Self { writer }
-    }
-
-    /// Write inline data to the stream.
-    ///
-    /// The data is prefixed with a negative i64 indicating the length,
-    /// followed by the literal bytes.
-    ///
-    /// # Errors
-    ///
-    /// Returns an error if writing to the underlying writer fails.
-    pub fn write_inline(&mut self, data: &[u8]) -> io::Result<()> {
-        if data.is_empty() {
-            return Ok(());
-        }
-
-        // Prefix is negative length
-        let len = data.len() as i64;
-        let prefix = -len;
-        self.writer.write_all(&prefix.to_le_bytes())?;
-        self.writer.write_all(data)?;
-        Ok(())
-    }
-
-    /// Write an external fd reference to the stream.
-    ///
-    /// The fd_index is the 0-based index into the fd array. The actual
-    /// file descriptor is at position `fd_index + 1` (since fd\[0\] is
-    /// typically the stream itself).
-    ///
-    /// # Errors
-    ///
-    /// Returns an error if writing to the underlying writer fails.
-    pub fn write_external(&mut self, fd_index: u32) -> io::Result<()> {
-        // Prefix is fd_index (non-negative), actual fd is at fd_index + 1
-        let prefix = fd_index as i64;
-        self.writer.write_all(&prefix.to_le_bytes())?;
-        Ok(())
-    }
-
-    /// Finish writing and return the underlying writer.
-    ///
-    /// This consumes the writer and returns the underlying `Write` impl.
-    pub fn finish(self) -> io::Result<W> {
-        Ok(self.writer)
-    }
-}
-
-/// Reader for parsing a splitfdstream.
-///
-/// The reader parses the binary format and yields chunks that are either
-/// inline data or references to external file descriptors.
-///
-/// # Example
-///
-/// ```
-/// use composefs_splitfdstream::{SplitfdstreamReader, Chunk};
-///
-/// let data = vec![
-///     // Inline chunk: prefix = -5, then 5 bytes
-///     0xfb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  // -5 as i64 LE
-///     b'h', b'e', b'l', b'l', b'o',
-/// ];
-///
-/// let mut reader = SplitfdstreamReader::new(data.as_slice());
-/// let chunk = reader.next_chunk().unwrap().unwrap();
-/// assert_eq!(chunk, Chunk::Inline(b"hello"));
-/// ```
-#[derive(Debug)]
-pub struct SplitfdstreamReader<R> {
-    reader: R,
-    /// Buffer for reading inline data
-    buffer: Vec<u8>,
-}
-
-impl<R: Read> SplitfdstreamReader<R> {
-    /// Create a new splitfdstream reader wrapping the given reader.
-    pub fn new(reader: R) -> Self {
-        Self {
-            reader,
-            buffer: Vec::new(),
-        }
-    }
-
-    /// Consume this reader, returning the underlying reader.
-    pub fn into_inner(self) -> R {
-        self.reader
-    }
-
-    /// Read the next chunk from the stream.
-    ///
-    /// Returns `Ok(None)` when the stream is exhausted.
-    ///
-    /// # Errors
-    ///
-    /// Returns an error if:
-    /// - Reading from the underlying reader fails
-    /// - The stream contains invalid data (e.g., inline size exceeds maximum)
-    pub fn next_chunk(&mut self) -> io::Result<Option<Chunk<'_>>> {
-        // Read the 8-byte prefix
-        let mut prefix_bytes = [0u8; 8];
-        match self.reader.read_exact(&mut prefix_bytes) {
-            Ok(()) => {}
-            Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
-            Err(e) => return Err(e),
-        }
-
-        let prefix = i64::from_le_bytes(prefix_bytes);
-
-        if prefix < 0 {
-            // Inline chunk: read abs(prefix) bytes
-            let len = (-prefix) as usize;
-            if len > MAX_INLINE_CHUNK_SIZE {
-                return Err(io::Error::new(
-                    io::ErrorKind::InvalidData,
-                    format!(
-                        "inline chunk size {} exceeds maximum allowed size {}",
-                        len, MAX_INLINE_CHUNK_SIZE
-                    ),
-                ));
-            }
-            self.buffer.clear();
-            self.buffer.resize(len, 0);
-            self.reader.read_exact(&mut self.buffer)?;
-            Ok(Some(Chunk::Inline(&self.buffer)))
-        } else {
-            // External chunk: prefix is the fd index
-            Ok(Some(Chunk::External(prefix as u32)))
-        }
-    }
-}
-
-#[cfg(test)]
-/// A helper that reads a file from offset 0 using positional reads.
-///
-/// This allows reading the same file multiple times without seeking,
-/// since each read specifies its position explicitly.
-#[derive(Debug)]
-struct ReadAtReader<'a, F> {
-    file: &'a F,
-    offset: u64,
-}
-
-#[cfg(test)]
-impl<'a, F: AsFd> ReadAtReader<'a, F> {
-    fn new(file: &'a F) -> Self {
-        Self { file, offset: 0 }
-    }
-}
-
-#[cfg(test)]
-impl<F: AsFd> Read for ReadAtReader<'_, F> {
-    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
-        let n = rustix::io::pread(self.file, buf, self.offset)?;
-        self.offset += n as u64;
-        Ok(n)
-    }
-}
-
-#[cfg(test)]
-/// A `Read` adapter that reconstructs a byte stream from a splitfdstream.
-///
-/// This struct implements `Read` by combining inline chunks and external file
-/// descriptor content into a contiguous byte stream.
-///
-/// External files are read using positional read (pread/read_at), so the
-/// same file can be referenced multiple times in the splitfdstream without
-/// needing to reopen or seek it.
-#[derive(Debug)]
-struct SplitfdstreamAsRead<'files, R: Read> {
-    reader: SplitfdstreamReader<R>,
-    files: &'files [std::fs::File],
-    /// Buffer for inline data (partially consumed)
-    inline_buffer: Vec<u8>,
-    /// Position within inline_buffer
-    inline_pos: usize,
-    /// Current external file being read (if any)
-    current_external: Option<ReadAtReader<'files, std::fs::File>>,
-}
-
-#[cfg(test)]
-impl<'files, R: Read> SplitfdstreamAsRead<'files, R> {
-    /// Create a new reader from a splitfdstream and files.
-    ///
-    /// The `files` slice provides the external files referenced by the
-    /// splitfdstream. Each external chunk at index N reads from `files[N]`.
-    fn new(splitfdstream: R, files: &'files [std::fs::File]) -> Self {
-        Self {
-            reader: SplitfdstreamReader::new(splitfdstream),
-            files,
-            inline_buffer: Vec::new(),
-            inline_pos: 0,
-            current_external: None,
-        }
-    }
-}
-
-#[cfg(test)]
-impl<R: Read> Read for SplitfdstreamAsRead<'_, R> {
-    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
-        // First, drain any buffered inline data
-        if self.inline_pos < self.inline_buffer.len() {
-            let remaining = &self.inline_buffer[self.inline_pos..];
-            let n = buf.len().min(remaining.len());
-            buf[..n].copy_from_slice(&remaining[..n]);
-            self.inline_pos += n;
-            return Ok(n);
-        }
-
-        // Next, drain current external file if any
-        if let Some(ref mut ext) = self.current_external {
-            let n = ext.read(buf)?;
-            if n > 0 {
-                return Ok(n);
-            }
-            // External exhausted, move to next chunk
-            self.current_external = None;
-        }
-
-        // Get next chunk from splitfdstream
-        match self.reader.next_chunk()? {
-            None => Ok(0), // EOF
-            Some(Chunk::Inline(data)) => {
-                let n = buf.len().min(data.len());
-                buf[..n].copy_from_slice(&data[..n]);
-                if n < data.len() {
-                    // Buffer remaining data for next read
-                    self.inline_buffer.clear();
-                    self.inline_buffer.extend_from_slice(&data[n..]);
-                    self.inline_pos = 0;
-                }
-                Ok(n)
-            }
-            Some(Chunk::External(idx)) => {
-                let idx = idx as usize;
-                if idx >= self.files.len() {
-                    return Err(io::Error::new(
-                        io::ErrorKind::InvalidData,
-                        format!(
-                            "external chunk references fd index {} but only {} files provided",
-                            idx,
-                            self.files.len()
-                        ),
-                    ));
-                }
-                self.current_external = Some(ReadAtReader::new(&self.files[idx]));
-                // Recurse to read from the new external
-                self.read(buf)
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-/// Reconstruct a stream from splitfdstream + file descriptors.
-///
-/// This function reads a splitfdstream and writes the reconstructed data to `output`.
-/// Inline chunks are written directly, while external chunks are read from the
-/// corresponding file descriptors in `files`.
-fn reconstruct<R, W>(splitfdstream: R, files: &[std::fs::File], output: &mut W) -> io::Result<u64>
-where
-    R: Read,
-    W: Write,
-{
-    let mut reader = SplitfdstreamReader::new(splitfdstream);
-    let mut bytes_written = 0u64;
-
-    while let Some(chunk) = reader.next_chunk()? {
-        match chunk {
-            Chunk::Inline(data) => {
-                output.write_all(data)?;
-                bytes_written += data.len() as u64;
-            }
-            Chunk::External(idx) => {
-                let file = files.get(idx as usize).ok_or_else(|| {
-                    io::Error::new(
-                        io::ErrorKind::InvalidData,
-                        format!(
-                            "external chunk references fd index {} but only {} files provided",
-                            idx,
-                            files.len()
-                        ),
-                    )
-                })?;
-                let mut ext_reader = ReadAtReader::new(file);
-                let copied = io::copy(&mut ext_reader, output)?;
-                bytes_written += copied;
-            }
-        }
-    }
-
-    Ok(bytes_written)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    /// Helper to write and read back chunks, verifying round-trip.
-    fn roundtrip_chunks(
-        inline_chunks: &[&[u8]],
-        external_indices: &[u32],
-        interleave: bool,
-    ) -> Vec<(bool, Vec<u8>, u32)> {
-        let mut buffer = Vec::new();
-        {
-            let mut writer = SplitfdstreamWriter::new(&mut buffer);
-
-            if interleave {
-                let max_len = inline_chunks.len().max(external_indices.len());
-                for i in 0..max_len {
-                    if i < inline_chunks.len() {
-                        writer.write_inline(inline_chunks[i]).unwrap();
-                    }
-                    if i < external_indices.len() {
-                        writer.write_external(external_indices[i]).unwrap();
-                    }
-                }
-            } else {
-                for chunk in inline_chunks {
-                    writer.write_inline(chunk).unwrap();
-                }
-                for &idx in external_indices {
-                    writer.write_external(idx).unwrap();
-                }
-            }
-
-            writer.finish().unwrap();
-        }
-
-        // Read back
-        let mut reader = SplitfdstreamReader::new(buffer.as_slice());
-        let mut results = Vec::new();
-
-        while let Some(chunk) = reader.next_chunk().unwrap() {
-            match chunk {
-                Chunk::Inline(data) => {
-                    results.push((true, data.to_vec(), 0));
-                }
-                Chunk::External(idx) => {
-                    results.push((false, Vec::new(), idx));
-                }
-            }
-        }
-
-        results
-    }
-
-    #[test]
-    fn test_empty_stream() {
-        let buffer: Vec<u8> = Vec::new();
-        let mut reader = SplitfdstreamReader::new(buffer.as_slice());
-        assert!(reader.next_chunk().unwrap().is_none());
-    }
-
-    #[test]
-    fn test_only_inline_chunks() {
-        let chunks: &[&[u8]] = &[b"hello", b"world", b"test"];
-        let results = roundtrip_chunks(chunks, &[], false);
-
-        assert_eq!(results.len(), 3);
-        assert!(results[0].0); // is_inline
-        assert_eq!(results[0].1, b"hello");
-        assert!(results[1].0);
-        assert_eq!(results[1].1, b"world");
-        assert!(results[2].0);
-        assert_eq!(results[2].1, b"test");
-    }
-
-    #[test]
-    fn test_only_external_chunks() {
-        let results = roundtrip_chunks(&[], &[0, 5, 42, 100], false);
-
-        assert_eq!(results.len(), 4);
-        assert!(!results[0].0); // is_external
-        assert_eq!(results[0].2, 0);
-        assert!(!results[1].0);
-        assert_eq!(results[1].2, 5);
-        assert!(!results[2].0);
-        assert_eq!(results[2].2, 42);
-        assert!(!results[3].0);
-        assert_eq!(results[3].2, 100);
-    }
-
-    #[test]
-    fn test_mixed_inline_external() {
-        let inline: &[&[u8]] = &[b"header", b"middle", b"footer"];
-        let external: &[u32] = &[0, 1, 2];
-        let results = roundtrip_chunks(inline, external, true);
-
-        // Interleaved: inline0, ext0, inline1, ext1, inline2, ext2
-        assert_eq!(results.len(), 6);
-
-        assert!(results[0].0);
-        assert_eq!(results[0].1, b"header");
-
-        assert!(!results[1].0);
-        assert_eq!(results[1].2, 0);
-
-        assert!(results[2].0);
-        assert_eq!(results[2].1, b"middle");
-
-        assert!(!results[3].0);
-        assert_eq!(results[3].2, 1);
-
-        assert!(results[4].0);
-        assert_eq!(results[4].1, b"footer");
-
-        assert!(!results[5].0);
-        assert_eq!(results[5].2, 2);
-    }
-
-    #[test]
-    fn test_large_inline_chunk() {
-        // Test with a large chunk to verify i64 handles sizes correctly
-        let large_data: Vec<u8> = (0..100_000).map(|i| (i % 256) as u8).collect();
-
-        let mut buffer = Vec::new();
-        {
-            let mut writer = SplitfdstreamWriter::new(&mut buffer);
-            writer.write_inline(&large_data).unwrap();
-            writer.finish().unwrap();
-        }
-
-        let mut reader = SplitfdstreamReader::new(buffer.as_slice());
-        let chunk = reader.next_chunk().unwrap().unwrap();
-
-        match chunk {
-            Chunk::Inline(data) => {
-                assert_eq!(data.len(), 100_000);
-                assert_eq!(data, large_data.as_slice());
-            }
-            Chunk::External(_) => panic!("Expected inline chunk"),
-        }
-
-        assert!(reader.next_chunk().unwrap().is_none());
-    }
-
-    #[test]
-    fn test_empty_inline_chunk_is_skipped() {
-        // Empty inline writes should be no-ops
-        let mut buffer = Vec::new();
-        {
-            let mut writer = SplitfdstreamWriter::new(&mut buffer);
-            writer.write_inline(b"").unwrap();
-            writer.write_inline(b"actual").unwrap();
-            writer.write_inline(b"").unwrap();
-            writer.finish().unwrap();
-        }
-
-        let mut reader = SplitfdstreamReader::new(buffer.as_slice());
-        let chunk = reader.next_chunk().unwrap().unwrap();
-        assert_eq!(chunk, Chunk::Inline(b"actual"));
-        assert!(reader.next_chunk().unwrap().is_none());
-    }
-
-    #[test]
-    fn test_boundary_sizes() {
-        // Test various boundary sizes
-        let sizes = [
-            1, 7, 8, 9, 255, 256, 257, 1023, 1024, 1025, 4095, 4096, 4097,
-        ];
-
-        for &size in &sizes {
-            let data: Vec<u8> = (0..size).map(|i| (i % 256) as u8).collect();
-
-            let mut buffer = Vec::new();
-            {
-                let mut writer = SplitfdstreamWriter::new(&mut buffer);
-                writer.write_inline(&data).unwrap();
-                writer.finish().unwrap();
-            }
-
-            // Verify buffer structure: 8-byte prefix + data
-            assert_eq!(buffer.len(), 8 + size);
-
-            // Verify prefix is correct negative value
-            let prefix = i64::from_le_bytes(buffer[..8].try_into().unwrap());
-            assert_eq!(prefix, -(size as i64));
-
-            // Read back and verify
-            let mut reader = SplitfdstreamReader::new(buffer.as_slice());
-            let chunk = reader.next_chunk().unwrap().unwrap();
-            match chunk {
-                Chunk::Inline(read_data) => {
-                    assert_eq!(read_data.len(), size);
-                    assert_eq!(read_data, data.as_slice());
-                }
-                Chunk::External(_) => panic!("Expected inline"),
-            }
-        }
-    }
-
-    #[test]
-    fn test_external_fd_index_zero() {
-        // fd_index 0 means fd[1], test this boundary
-        let mut buffer = Vec::new();
-        {
-            let mut writer = SplitfdstreamWriter::new(&mut buffer);
-            writer.write_external(0).unwrap();
-            writer.finish().unwrap();
-        }
-
-        // Should be exactly 8 bytes (the prefix)
-        assert_eq!(buffer.len(), 8);
-
-        // Prefix should be 0
-        let prefix = i64::from_le_bytes(buffer[..8].try_into().unwrap());
-        assert_eq!(prefix, 0);
-
-        let mut reader = SplitfdstreamReader::new(buffer.as_slice());
-        let chunk = reader.next_chunk().unwrap().unwrap();
-        assert_eq!(chunk, Chunk::External(0));
-    }
-
-    #[test]
-    fn test_large_fd_index() {
-        // Test with maximum u32 fd index
-        let mut buffer = Vec::new();
-        {
-            let mut writer = SplitfdstreamWriter::new(&mut buffer);
-            writer.write_external(u32::MAX).unwrap();
-            writer.finish().unwrap();
-        }
-
-        let mut reader = SplitfdstreamReader::new(buffer.as_slice());
-        let chunk = reader.next_chunk().unwrap().unwrap();
-        assert_eq!(chunk, Chunk::External(u32::MAX));
-    }
-
-    #[test]
-    fn test_single_byte_inline() {
-        let results = roundtrip_chunks(&[b"x"], &[], false);
-        assert_eq!(results.len(), 1);
-        assert!(results[0].0);
-        assert_eq!(results[0].1, b"x");
-    }
-
-    #[test]
-    fn test_writer_finish_returns_writer() {
-        let mut buffer = Vec::new();
-        let writer = SplitfdstreamWriter::new(&mut buffer);
-        let returned = writer.finish().unwrap();
-
-        // Verify we got the writer back (can write to it)
-        returned.len(); // Just verify it's accessible
-    }
-
-    #[test]
-    fn test_chunk_equality() {
-        assert_eq!(Chunk::Inline(b"test"), Chunk::Inline(b"test"));
-        assert_ne!(Chunk::Inline(b"test"), Chunk::Inline(b"other"));
-        assert_eq!(Chunk::External(5), Chunk::External(5));
-        assert_ne!(Chunk::External(5), Chunk::External(6));
-        assert_ne!(Chunk::Inline(b"test"), Chunk::External(0));
-    }
-
-    #[test]
-    fn test_many_small_chunks() {
-        // Stress test with many small chunks
-        let chunks: Vec<Vec<u8>> = (0..1000).map(|i| vec![i as u8; (i % 10) + 1]).collect();
-        let chunk_refs: Vec<&[u8]> = chunks.iter().map(|c| c.as_slice()).collect();
-
-        let mut buffer = Vec::new();
-        {
-            let mut writer = SplitfdstreamWriter::new(&mut buffer);
-            for chunk in &chunk_refs {
-                writer.write_inline(chunk).unwrap();
-            }
-            writer.finish().unwrap();
-        }
-
-        let mut reader = SplitfdstreamReader::new(buffer.as_slice());
-        let mut count = 0;
-        while let Some(chunk) = reader.next_chunk().unwrap() {
-            match chunk {
-                Chunk::Inline(data) => {
-                    assert_eq!(data, chunk_refs[count]);
-                    count += 1;
-                }
-                Chunk::External(_) => panic!("Unexpected external"),
-            }
-        }
-        assert_eq!(count, 1000);
-    }
-
-    #[test]
-    fn test_alternating_inline_external() {
-        let mut buffer = Vec::new();
-        {
-            let mut writer = SplitfdstreamWriter::new(&mut buffer);
-            for i in 0..50 {
-                writer.write_inline(&[i as u8]).unwrap();
-                writer.write_external(i as u32).unwrap();
-            }
-            writer.finish().unwrap();
-        }
-
-        let mut reader = SplitfdstreamReader::new(buffer.as_slice());
-        let mut inline_count = 0;
-        let mut external_count = 0;
-
-        while let Some(chunk) = reader.next_chunk().unwrap() {
-            match chunk {
-                Chunk::Inline(data) => {
-                    assert_eq!(data.len(), 1);
-                    assert_eq!(data[0], inline_count as u8);
-                    inline_count += 1;
-                }
-                Chunk::External(idx) => {
-                    assert_eq!(idx, external_count as u32);
-                    external_count += 1;
-                }
-            }
-        }
-
-        assert_eq!(inline_count, 50);
-        assert_eq!(external_count, 50);
-    }
-
-    #[test]
-    fn test_truncated_prefix_returns_none() {
-        // Partial prefix (less than 8 bytes) at end of stream
-        let buffer = vec![0x01, 0x02, 0x03]; // Only 3 bytes
-
-        let mut reader = SplitfdstreamReader::new(buffer.as_slice());
-        // Should return None (EOF) since we can't read a complete prefix
-        assert!(reader.next_chunk().unwrap().is_none());
-    }
-
-    #[test]
-    fn test_truncated_data_is_error() {
-        // Valid prefix saying 100 bytes, but only 10 bytes of data
-        let mut buffer = Vec::new();
-        let prefix: i64 = -100; // Inline, 100 bytes
-        buffer.extend_from_slice(&prefix.to_le_bytes());
-        buffer.extend_from_slice(&[0u8; 10]); // Only 10 bytes
-
-        let mut reader = SplitfdstreamReader::new(buffer.as_slice());
-        let result = reader.next_chunk();
-        assert!(result.is_err());
-    }
-
-    #[test]
-    fn test_inline_chunk_size_limit() {
-        // Attempt to read a chunk that exceeds MAX_INLINE_CHUNK_SIZE
-        let mut buffer = Vec::new();
-        // Request 512 MB (exceeds 256 MB limit)
-        let prefix: i64 = -(512 * 1024 * 1024);
-        buffer.extend_from_slice(&prefix.to_le_bytes());
-
-        let mut reader = SplitfdstreamReader::new(buffer.as_slice());
-        let result = reader.next_chunk();
-        assert!(result.is_err());
-        let err = result.unwrap_err();
-        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
-        assert!(err.to_string().contains("exceeds maximum"));
-    }
-
-    mod reconstruct_tests {
-        use super::*;
-        use std::io::Cursor;
-        use tempfile::NamedTempFile;
-
-        #[test]
-        fn test_reconstruct_inline_only() {
-            // Create a splitfdstream with only inline data
-            let mut stream_buf = Vec::new();
-            {
-                let mut writer = SplitfdstreamWriter::new(&mut stream_buf);
-                writer.write_inline(b"Hello, ").unwrap();
-                writer.write_inline(b"world!").unwrap();
-                writer.finish().unwrap();
-            }
-
-            let mut output = Vec::new();
-            let files: &[std::fs::File] = &[];
-            let bytes = reconstruct(stream_buf.as_slice(), files, &mut output).unwrap();
-
-            assert_eq!(output, b"Hello, world!");
-            assert_eq!(bytes, 13);
-        }
-
-        #[test]
-        fn test_reconstruct_empty_stream() {
-            let stream_buf: Vec<u8> = Vec::new();
-            let mut output = Vec::new();
-            let files: &[std::fs::File] = &[];
-            let bytes = reconstruct(stream_buf.as_slice(), files, &mut output).unwrap();
-
-            assert!(output.is_empty());
-            assert_eq!(bytes, 0);
-        }
-
-        #[test]
-        fn test_reconstruct_with_external_fds() {
-            // Create temp files with known content
-            let mut file0 = NamedTempFile::new().unwrap();
-            let mut file1 = NamedTempFile::new().unwrap();
-
-            use std::io::Write;
-            file0.write_all(b"EXTERNAL0").unwrap();
-            file1.write_all(b"EXTERNAL1").unwrap();
-
-            // Create splitfdstream that references these files
-            let mut stream_buf = Vec::new();
-            {
-                let mut writer = SplitfdstreamWriter::new(&mut stream_buf);
-                writer.write_inline(b"[start]").unwrap();
-                writer.write_external(0).unwrap(); // Reference first fd
-                writer.write_inline(b"[mid]").unwrap();
-                writer.write_external(1).unwrap(); // Reference second fd
-                writer.write_inline(b"[end]").unwrap();
-                writer.finish().unwrap();
-            }
-
-            // Open files for reading
-            let f0 = std::fs::File::open(file0.path()).unwrap();
-            let f1 = std::fs::File::open(file1.path()).unwrap();
-            let files = [f0, f1];
-
-            let mut output = Vec::new();
-            let bytes = reconstruct(stream_buf.as_slice(), &files, &mut output).unwrap();
-
-            assert_eq!(output, b"[start]EXTERNAL0[mid]EXTERNAL1[end]");
-            assert_eq!(bytes, output.len() as u64);
-        }
-
-        #[test]
-        fn test_reconstruct_external_fd_out_of_bounds() {
-            // Create splitfdstream referencing fd index 5, but only provide 1 file
-            let mut stream_buf = Vec::new();
-            {
-                let mut writer = SplitfdstreamWriter::new(&mut stream_buf);
-                writer.write_external(5).unwrap(); // Out of bounds
-                writer.finish().unwrap();
-            }
-
-            let file = NamedTempFile::new().unwrap();
-            let f = std::fs::File::open(file.path()).unwrap();
-            let files = [f];
-
-            let mut output = Vec::new();
-            let result = reconstruct(stream_buf.as_slice(), &files, &mut output);
-
-            assert!(result.is_err());
-            let err = result.unwrap_err();
-            assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
-            assert!(err.to_string().contains("fd index 5"));
-        }
-
-        #[test]
-        fn test_reconstruct_large_external_file() {
-            // Create a larger external file to test efficient copying
-            let mut file = NamedTempFile::new().unwrap();
-            let large_data: Vec<u8> = (0..100_000).map(|i| (i % 256) as u8).collect();
-
-            use std::io::Write;
-            file.write_all(&large_data).unwrap();
-
-            let mut stream_buf = Vec::new();
-            {
-                let mut writer = SplitfdstreamWriter::new(&mut stream_buf);
-                writer.write_inline(b"header").unwrap();
-                writer.write_external(0).unwrap();
-                writer.write_inline(b"footer").unwrap();
-                writer.finish().unwrap();
-            }
-
-            let f = std::fs::File::open(file.path()).unwrap();
-            let files = [f];
-
-            let mut output = Vec::new();
-            let bytes = reconstruct(stream_buf.as_slice(), &files, &mut output).unwrap();
-
-            // Verify header + large data + footer
-            assert_eq!(&output[..6], b"header");
-            assert_eq!(&output[6..100_006], large_data.as_slice());
-            assert_eq!(&output[100_006..], b"footer");
-            assert_eq!(bytes, 6 + 100_000 + 6);
-        }
-
-        #[test]
-        fn test_reconstruct_same_fd_multiple_times() {
-            // Test that the same fd can be referenced multiple times
-            let mut file = NamedTempFile::new().unwrap();
-
-            use std::io::Write;
-            file.write_all(b"REPEATED").unwrap();
-
-            let mut stream_buf = Vec::new();
-            {
-                let mut writer = SplitfdstreamWriter::new(&mut stream_buf);
-                writer.write_external(0).unwrap();
-                writer.write_inline(b"-").unwrap();
-                writer.write_external(0).unwrap();
-                writer.write_inline(b"-").unwrap();
-                writer.write_external(0).unwrap();
-                writer.finish().unwrap();
-            }
-
-            let f = std::fs::File::open(file.path()).unwrap();
-            let files = [f];
-
-            let mut output = Vec::new();
-            let bytes = reconstruct(stream_buf.as_slice(), &files, &mut output).unwrap();
-
-            // Each reference uses pread from offset 0, so each reads from start
-            assert_eq!(output, b"REPEATED-REPEATED-REPEATED");
-            assert_eq!(bytes, 26);
-        }
-
-        #[test]
-        fn test_into_inner() {
-            let data = vec![1, 2, 3, 4];
-            let cursor = Cursor::new(data.clone());
-            let reader = SplitfdstreamReader::new(cursor);
-            let inner = reader.into_inner();
-            assert_eq!(inner.into_inner(), data);
-        }
-    }
-
-    mod proptest_tests {
-        use super::*;
-        use proptest::prelude::*;
-        use std::io::Write;
-        use tempfile::NamedTempFile;
-
-        /// Represents a chunk in the stream for testing purposes.
-        #[derive(Debug, Clone)]
-        enum TestChunk {
-            Inline(Vec<u8>),
-            External { fd_index: usize, content: Vec<u8> },
-        }
-
-        /// Strategy for generating inline chunk data.
-        /// Bounded to reasonable sizes to keep tests fast.
-        fn inline_data_strategy() -> impl Strategy<Value = Vec<u8>> {
-            prop::collection::vec(any::<u8>(), 0..4096)
-        }
-
-        /// Strategy for generating external chunk content.
-        fn external_content_strategy() -> impl Strategy<Value = Vec<u8>> {
-            prop::collection::vec(any::<u8>(), 0..8192)
-        }
-
-        /// Strategy for generating a single test chunk.
-        /// The fd_index is relative and will be resolved during test execution.
-        fn chunk_strategy() -> impl Strategy<Value = TestChunk> {
-            prop_oneof![
-                inline_data_strategy().prop_map(TestChunk::Inline),
-                (0..16usize, external_content_strategy()).prop_map(|(idx, content)| {
-                    TestChunk::External {
-                        fd_index: idx,
-                        content,
-                    }
-                })
-            ]
-        }
-
-        /// Strategy for generating a sequence of chunks.
-        fn chunks_strategy() -> impl Strategy<Value = Vec<TestChunk>> {
-            prop::collection::vec(chunk_strategy(), 0..64)
-        }
-
-        /// Execute a roundtrip test: write chunks, read them back, verify reconstruction.
-        fn roundtrip_test(chunks: Vec<TestChunk>) -> Result<(), TestCaseError> {
-            // Collect unique external contents and assign fd indices
-            let mut external_contents: Vec<Vec<u8>> = Vec::new();
-
-            // Normalize fd_indices to actual file indices
-            let normalized_chunks: Vec<TestChunk> = chunks
-                .into_iter()
-                .filter_map(|chunk| match chunk {
-                    TestChunk::Inline(data) => {
-                        // Skip empty inline chunks (writer skips them)
-                        if data.is_empty() {
-                            None
-                        } else {
-                            Some(TestChunk::Inline(data))
-                        }
-                    }
-                    TestChunk::External { fd_index, content } => {
-                        // Map fd_index to actual position in external_contents
-                        let actual_index = fd_index % 8.max(1); // Limit to 8 files max
-
-                        // Ensure we have enough files
-                        while external_contents.len() <= actual_index {
-                            external_contents.push(Vec::new());
-                        }
-
-                        // Store content (may overwrite previous)
-                        external_contents[actual_index] = content.clone();
-
-                        Some(TestChunk::External {
-                            fd_index: actual_index,
-                            content,
-                        })
-                    }
-                })
-                .collect();
-
-            // Create temp files for external data
-            let mut temp_files: Vec<NamedTempFile> = Vec::new();
-            for content in &external_contents {
-                let mut f = NamedTempFile::new().map_err(|e| TestCaseError::fail(e.to_string()))?;
-                f.write_all(content)
-                    .map_err(|e| TestCaseError::fail(e.to_string()))?;
-                temp_files.push(f);
-            }
-
-            // Write the splitfdstream
-            let mut stream_buf = Vec::new();
-            {
-                let mut writer = SplitfdstreamWriter::new(&mut stream_buf);
-                for chunk in &normalized_chunks {
-                    match chunk {
-                        TestChunk::Inline(data) => {
-                            writer
-                                .write_inline(data)
-                                .map_err(|e| TestCaseError::fail(e.to_string()))?;
-                        }
-                        TestChunk::External { fd_index, .. } => {
-                            writer
-                                .write_external(*fd_index as u32)
-                                .map_err(|e| TestCaseError::fail(e.to_string()))?;
-                        }
-                    }
-                }
-                writer
-                    .finish()
-                    .map_err(|e| TestCaseError::fail(e.to_string()))?;
-            }
-
-            // Read back and verify chunk sequence
-            let mut reader = SplitfdstreamReader::new(stream_buf.as_slice());
-            let mut read_chunks = Vec::new();
-            while let Some(chunk) = reader
-                .next_chunk()
-                .map_err(|e| TestCaseError::fail(e.to_string()))?
-            {
-                match chunk {
-                    Chunk::Inline(data) => read_chunks.push(TestChunk::Inline(data.to_vec())),
-                    Chunk::External(idx) => read_chunks.push(TestChunk::External {
-                        fd_index: idx as usize,
-                        content: external_contents
-                            .get(idx as usize)
-                            .cloned()
-                            .unwrap_or_default(),
-                    }),
-                }
-            }
-
-            // Verify we got the same number of chunks
-            prop_assert_eq!(
-                normalized_chunks.len(),
-                read_chunks.len(),
-                "Chunk count mismatch"
-            );
-
-            // Verify each chunk matches
-            for (i, (expected, actual)) in
-                normalized_chunks.iter().zip(read_chunks.iter()).enumerate()
-            {
-                match (expected, actual) {
-                    (TestChunk::Inline(expected_data), TestChunk::Inline(actual_data)) => {
-                        prop_assert_eq!(
-                            expected_data,
-                            actual_data,
-                            "Inline chunk {} data mismatch",
-                            i
-                        );
-                    }
-                    (
-                        TestChunk::External { fd_index: ei, .. },
-                        TestChunk::External { fd_index: ai, .. },
-                    ) => {
-                        prop_assert_eq!(ei, ai, "External chunk {} fd_index mismatch", i);
-                    }
-                    _ => {
-                        return Err(TestCaseError::fail(format!(
-                            "Chunk {} type mismatch: expected {:?}, got {:?}",
-                            i, expected, actual
-                        )));
-                    }
-                }
-            }
-
-            // Verify reconstruction produces correct output
-            let files: Vec<std::fs::File> = temp_files
-                .iter()
-                .map(|f| std::fs::File::open(f.path()))
-                .collect::<Result<Vec<_>, _>>()
-                .map_err(|e| TestCaseError::fail(e.to_string()))?;
-
-            let mut output = Vec::new();
-            reconstruct(stream_buf.as_slice(), &files, &mut output)
-                .map_err(|e| TestCaseError::fail(e.to_string()))?;
-
-            // Build expected output
-            let mut expected_output = Vec::new();
-            for chunk in &normalized_chunks {
-                match chunk {
-                    TestChunk::Inline(data) => expected_output.extend_from_slice(data),
-                    TestChunk::External { fd_index, .. } => {
-                        expected_output.extend_from_slice(&external_contents[*fd_index]);
-                    }
-                }
-            }
-
-            prop_assert_eq!(output, expected_output, "Reconstructed output mismatch");
-
-            Ok(())
-        }
-
-        proptest! {
-            #![proptest_config(ProptestConfig::with_cases(256))]
-
-            #[test]
-            fn test_arbitrary_chunk_sequences(chunks in chunks_strategy()) {
-                roundtrip_test(chunks)?;
-            }
-
-            #[test]
-            fn test_inline_only_sequences(
-                chunks in prop::collection::vec(inline_data_strategy(), 0..32)
-            ) {
-                let test_chunks: Vec<TestChunk> = chunks.into_iter()
-                    .map(TestChunk::Inline)
-                    .collect();
-                roundtrip_test(test_chunks)?;
-            }
-
-            #[test]
-            fn test_external_only_sequences(
-                chunks in prop::collection::vec(
-                    (0..8usize, external_content_strategy()),
-                    0..32
-                )
-            ) {
-                let test_chunks: Vec<TestChunk> = chunks.into_iter()
-                    .map(|(idx, content)| TestChunk::External { fd_index: idx, content })
-                    .collect();
-                roundtrip_test(test_chunks)?;
-            }
-
-            #[test]
-            fn test_alternating_pattern(
-                inline_data in prop::collection::vec(inline_data_strategy(), 1..16),
-                external_data in prop::collection::vec(external_content_strategy(), 1..16)
-            ) {
-                let mut test_chunks = Vec::new();
-                let max_len = inline_data.len().max(external_data.len());
-                for i in 0..max_len {
-                    if i < inline_data.len() {
-                        test_chunks.push(TestChunk::Inline(inline_data[i].clone()));
-                    }
-                    if i < external_data.len() {
-                        test_chunks.push(TestChunk::External {
-                            fd_index: i % 8,
-                            content: external_data[i].clone(),
-                        });
-                    }
-                }
-                roundtrip_test(test_chunks)?;
-            }
-
-            #[test]
-            fn test_same_fd_multiple_references(
-                content in external_content_strategy(),
-                ref_count in 1..10usize
-            ) {
-                let mut test_chunks = Vec::new();
-                for _ in 0..ref_count {
-                    test_chunks.push(TestChunk::External {
-                        fd_index: 0,
-                        content: content.clone(),
-                    });
-                }
-                roundtrip_test(test_chunks)?;
-            }
-
-            #[test]
-            fn test_varying_chunk_sizes(
-                small in prop::collection::vec(any::<u8>(), 0..16),
-                medium in prop::collection::vec(any::<u8>(), 256..1024),
-                large in prop::collection::vec(any::<u8>(), 4096..8192)
-            ) {
-                let test_chunks = vec![
-                    TestChunk::Inline(small),
-                    TestChunk::Inline(medium),
-                    TestChunk::Inline(large),
-                ];
-                roundtrip_test(test_chunks)?;
-            }
-        }
-
-        /// Test SplitfdstreamAsRead with property-based approach
-        mod tar_reader_tests {
-            use super::*;
-
-            proptest! {
-                #![proptest_config(ProptestConfig::with_cases(128))]
-
-                #[test]
-                fn test_tar_reader_matches_reconstruct(chunks in chunks_strategy()) {
-                    tar_reader_test(chunks)?;
-                }
-            }
-
-            fn tar_reader_test(chunks: Vec<TestChunk>) -> Result<(), TestCaseError> {
-                // Collect unique external contents and assign fd indices
-                let mut external_contents: Vec<Vec<u8>> = Vec::new();
-
-                // Normalize fd_indices to actual file indices
-                let normalized_chunks: Vec<TestChunk> = chunks
-                    .into_iter()
-                    .filter_map(|chunk| match chunk {
-                        TestChunk::Inline(data) => {
-                            if data.is_empty() {
-                                None
-                            } else {
-                                Some(TestChunk::Inline(data))
-                            }
-                        }
-                        TestChunk::External { fd_index, content } => {
-                            let actual_index = fd_index % 8.max(1);
-                            while external_contents.len() <= actual_index {
-                                external_contents.push(Vec::new());
-                            }
-                            external_contents[actual_index] = content.clone();
-                            Some(TestChunk::External {
-                                fd_index: actual_index,
-                                content,
-                            })
-                        }
-                    })
-                    .collect();
-
-                // Create temp files for external data
-                let mut temp_files: Vec<NamedTempFile> = Vec::new();
-                for content in &external_contents {
-                    let mut f =
-                        NamedTempFile::new().map_err(|e| TestCaseError::fail(e.to_string()))?;
-                    f.write_all(content)
-                        .map_err(|e| TestCaseError::fail(e.to_string()))?;
-                    temp_files.push(f);
-                }
-
-                // Write the splitfdstream
-                let mut stream_buf = Vec::new();
-                {
-                    let mut writer = SplitfdstreamWriter::new(&mut stream_buf);
-                    for chunk in &normalized_chunks {
-                        match chunk {
-                            TestChunk::Inline(data) => {
-                                writer
-                                    .write_inline(data)
-                                    .map_err(|e| TestCaseError::fail(e.to_string()))?;
-                            }
-                            TestChunk::External { fd_index, .. } => {
-                                writer
-                                    .write_external(*fd_index as u32)
-                                    .map_err(|e| TestCaseError::fail(e.to_string()))?;
-                            }
-                        }
-                    }
-                    writer
-                        .finish()
-                        .map_err(|e| TestCaseError::fail(e.to_string()))?;
-                }
-
-                // Open files for reading
-                let files: Vec<std::fs::File> = temp_files
-                    .iter()
-                    .map(|f| std::fs::File::open(f.path()))
-                    .collect::<Result<Vec<_>, _>>()
-                    .map_err(|e| TestCaseError::fail(e.to_string()))?;
-
-                // Read via SplitfdstreamAsRead
-                let mut tar_reader = SplitfdstreamAsRead::new(stream_buf.as_slice(), &files);
-                let mut tar_output = Vec::new();
-                std::io::copy(&mut tar_reader, &mut tar_output)
-                    .map_err(|e| TestCaseError::fail(e.to_string()))?;
-
-                // Read via reconstruct
-                let mut reconstruct_output = Vec::new();
-                reconstruct(stream_buf.as_slice(), &files, &mut reconstruct_output)
-                    .map_err(|e| TestCaseError::fail(e.to_string()))?;
-
-                prop_assert_eq!(
-                    tar_output,
-                    reconstruct_output,
-                    "SplitfdstreamAsRead and reconstruct outputs differ"
-                );
-
-                Ok(())
-            }
-        }
-    }
-}
diff --git a/crates/composefs-storage/Cargo.toml b/crates/composefs-storage/Cargo.toml
index 904551a5..6d0b037c 100644
--- a/crates/composefs-storage/Cargo.toml
+++ b/crates/composefs-storage/Cargo.toml
@@ -17,25 +17,29 @@ cap-std = { version = "4.0", default-features = false }
 cap-std-ext = { version = "4.0", default-features = false }
 crc = { version = "3.0", default-features = false }
 flate2 = { version = "1.0", default-features = false, features = ["rust_backend"] }
-jsonrpc-fdpass = { workspace = true, optional = true }
+composefs-splitdirfdstream = { path = "../composefs-splitdirfdstream", version = "0.7.0", optional = true, features = ["tokio"] }
 oci-spec = { version = "0.9", default-features = false, features = ["image"] }
-rustix = { version = "1.0", default-features = false, features = ["fs", "std", "process", "thread"] }
+rustix = { version = "1.0", default-features = false, features = ["fs", "std", "process", "thread", "pipe"] }
 serde = { version = "1.0", default-features = false, features = ["derive"] }
 serde_json = { version = "1.0", default-features = false, features = ["std"] }
-sha2 = { version = "0.10", default-features = false, features = ["std"] }
 tar-core = "0.1.0"
 thiserror = { version = "2.0", default-features = false }
-tokio = { version = "1.40", default-features = false, features = ["rt", "net", "sync"], optional = true }
+tokio = { version = "1.40", default-features = false, features = ["rt", "rt-multi-thread", "net", "sync", "macros"], optional = true }
 toml = { version = "0.8", default-features = false, features = ["parse"] }
 tracing = { version = "0.1", default-features = false, optional = true }
+zlink = { workspace = true, optional = true }
 zstd = { version = "0.13", default-features = false }
 
 [features]
 default = []
-userns-helper = ["dep:jsonrpc-fdpass", "dep:tokio", "dep:tracing"]
+# Layer transfer over zlink + splitdirfdstream. Enables both the in-process
+# transport and the rootless `podman unshare` helper (init_if_helper /
+# StorageProxy).
+layer-transfer = ["dep:zlink", "dep:composefs-splitdirfdstream", "dep:tokio", "dep:tracing"]
 
 [dev-dependencies]
 tempfile = { version = "3.8", default-features = false }
+tokio = { version = "1.40", default-features = false, features = ["rt", "rt-multi-thread", "macros", "net", "sync"] }
 
 [lints]
 workspace = true
diff --git a/crates/composefs-storage/src/cstor_service.rs b/crates/composefs-storage/src/cstor_service.rs
new file mode 100644
index 00000000..da5a34e5
--- /dev/null
+++ b/crates/composefs-storage/src/cstor_service.rs
@@ -0,0 +1,1096 @@
+//! Stateless `org.composefs.Oci` service backed by containers-storage.
+//!
+//! Exposes `GetInfo` and `GetLayer` from the `org.composefs.Oci` varlink
+//! interface, using the `containers-storage` layer store as the source.
+#![allow(missing_docs)]
+//!
+//! # Design
+//!
+//! The service is **stateless**: there is no handle map and no `Open`/`Close`
+//! lifecycle.  `GetLayer` receives all necessary parameters inline via
+//! [`GetLayerParams`]:
+//!
+//! ```text
+//! GetLayer(handle, params: GetLayerParams) → streaming frames
+//! ```
+//!
+//! Where `params.storage` carries both the `storage_path` and `layer_id`.
+//!
+//! # Lock safety
+//!
+//! The service acquires a **shared flock** on `overlay-layers/layers.lock`
+//! before opening any diff-directory file descriptors.  This prevents a
+//! concurrent `podman rmi` (which holds an exclusive lock) from deleting a
+//! layer's diff directory while we are streaming it.
+//!
+//! The lock is held by a **self-reaping producer task** (a `spawn_blocking`
+//! closure) across three phases:
+//!
+//! 1. **Phase 1**: Produce the `splitdirfdstream` bytes into the data pipe
+//!    write end; drop the write end (data-pipe EOF to consumer).
+//! 2. **Phase 2**: Read the keepalive-pipe read end to EOF, blocking until
+//!    the consumer drops its keepalive write end (signalling it is finished).
+//! 3. **Phase 3**: Drop the lock guard (releases the flock) and all other
+//!    resources.
+//!
+//! **Critical ordering**: Phase 2 comes AFTER the write end drops (Phase 1
+//! end), otherwise the data pipe never EOF and the consumer deadlocks.
+//!
+//! On process death (SIGKILL, etc.) the kernel closes all fds, automatically
+//! releasing the flock — so the userns-helper subprocess can be killed
+//! safely without leaving containers-storage in a locked state.
+
+use std::collections::HashMap;
+use std::io::Write;
+use std::os::fd::OwnedFd;
+
+use composefs_splitdirfdstream::{
+    MAX_FDS_PER_FRAME, SplitdirfdstreamWriter, build_layer_fd_layout, seed_from_id,
+    spawn_self_reaping_producer, split_fds_into_frames,
+};
+use serde::{Deserialize, Serialize};
+
+use crate::layer::Layer;
+use crate::lock::LayerStoreLock;
+use crate::storage::Storage;
+use crate::tar_split::{TarSplitFdStream, TarSplitItem as TarItem};
+
+// ── Wire types (compatible with composefs-oci::varlink_types) ────────────────
+//
+// These definitions are independent of composefs-oci to avoid a dependency
+// cycle. They are wire-format-compatible (same field names, same JSON
+// encoding) with the types in composefs-oci::varlink_types.
+
+/// Locator for a layer inside a containers-storage store.
+///
+/// This is the wire-format counterpart of `composefs_oci::varlink_types::StorageLocator`.
+#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
+pub struct StorageLocator {
+    /// Absolute path to the containers-storage root directory.
+    pub storage_path: String,
+    /// The layer ID within that storage root.
+    pub layer_id: String,
+}
+
+/// Parameters for the `GetLayer` method.
+///
+/// This is the wire-format counterpart of `composefs_oci::varlink_types::GetLayerParams`.
+/// The `diff_id` field is ignored by this service; only `storage` is used.
+#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
+pub struct GetLayerParams {
+    /// Ignored by the cstor service (used by the repo service).
+    pub diff_id: Option<String>,
+    /// Location of the layer in containers-storage (required).
+    pub storage: Option<StorageLocator>,
+}
+
+/// Reply from `GetInfo`.
+#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
+pub struct GetInfoReply {
+    /// Capability tokens supported by this service.
+    pub features: Vec<String>,
+}
+
+/// Reply from `GetLayer`.
+#[derive(Debug, Clone, Serialize, Deserialize, zlink::introspect::Type)]
+pub struct GetLayerReply {
+    /// Number of diff-directory fd slots in the logical FD array.
+    pub dir_count: u32,
+}
+
+/// Errors from the `org.composefs.Oci` interface (cstor service subset).
+#[derive(Debug, zlink::ReplyError, zlink::introspect::ReplyError)]
+#[zlink(interface = "org.composefs.Oci")]
+pub enum CstorOciError {
+    /// The repository could not be found or opened.
+    RepoNotFound { message: String },
+    /// The given handle is ignored (stateless service).
+    InvalidHandle { handle: u64 },
+    /// The named OCI image/reference does not exist.
+    NoSuchImage { image: String },
+    /// Internal error.
+    InternalError { message: String },
+    /// Layer not found.
+    NoSuchLayer { diff_id: String },
+    /// Malformed digest/diff-id.
+    InvalidDigest { message: String },
+    /// Diff-id mismatch (unused in this service, kept for interface compat).
+    DiffIdMismatch { expected: String, actual: String },
+    /// Malformed request.
+    InvalidRequest { message: String },
+    /// Too many fds for a single non-streaming reply.
+    FdLimitExceeded { fd_count: u64, max_per_frame: u64 },
+}
+
+// ── CstorLayerService ─────────────────────────────────────────────────────────
+
+/// Stateless service implementing the `org.composefs.Oci` interface, backed
+/// by a containers-storage layer store.
+///
+/// Only `GetInfo` and `GetLayer` are implemented; all other methods return
+/// a `MethodNotFound` error from zlink.
+#[derive(Debug, Default)]
+pub struct CstorLayerService;
+
+// ── Producer helpers ──────────────────────────────────────────────────────────
+
+/// Produce a `splitdirfdstream`-encoded byte sequence for `layer` in
+/// `storage` into `out`.
+///
+/// `link_id_to_dirfd` maps each layer's link ID to the *sparse* dirfd slot
+/// index where its pre-opened diff-directory fd was placed.
+pub(crate) fn produce_splitdirfdstream<W: Write>(
+    storage: &Storage,
+    layer: &Layer,
+    link_id_to_dirfd: &HashMap<String, u32>,
+    real_indices: &std::collections::HashSet<u32>,
+    out: W,
+) -> crate::Result<()> {
+    let mut stream = TarSplitFdStream::new(storage, layer)?;
+    let mut writer = SplitdirfdstreamWriter::new(out);
+    const ZERO_PADDING: [u8; 512] = [0u8; 512];
+    let mut prev_pad: usize = 0;
+
+    while let Some(item) = stream.next()? {
+        match item {
+            TarItem::Segment(bytes) => {
+                let stripped = if prev_pad <= bytes.len() {
+                    &bytes[prev_pad..]
+                } else {
+                    &[][..]
+                };
+                if !stripped.is_empty() {
+                    writer.write_metadata(stripped).map_err(|e| {
+                        crate::error::StorageError::TarSplitError(format!(
+                            "splitdirfdstream write_inline: {e}"
+                        ))
+                    })?;
+                }
+                prev_pad = 0;
+            }
+            TarItem::FileContent {
+                fd,
+                size,
+                name,
+                link_id,
+            } => {
+                let relpath = name.strip_prefix("./").unwrap_or(&name);
+                let dirfd_index = *link_id_to_dirfd.get(link_id.as_str()).ok_or_else(|| {
+                    crate::error::StorageError::TarSplitError(format!(
+                        "link_id {link_id} not found in dirfd map"
+                    ))
+                })?;
+                assert!(
+                    real_indices.contains(&dirfd_index),
+                    "BUG: producer emitting dirfd_index={dirfd_index} for {relpath:?} \
+                     (link_id={link_id:?}) but that slot is NOT a real diff-dir slot; \
+                     real_indices={real_indices:?} link_id_to_dirfd={link_id_to_dirfd:?}"
+                );
+                let stat = rustix::fs::fstat(&fd).map_err(|e| {
+                    crate::error::StorageError::TarSplitError(format!(
+                        "fstat failed for {relpath}: {e}"
+                    ))
+                })?;
+                let world_readable = stat.st_mode & 0o004 != 0;
+                if world_readable {
+                    writer
+                        .write_file_backed_data(dirfd_index, size, relpath.as_bytes())
+                        .map_err(|e| {
+                            crate::error::StorageError::TarSplitError(format!(
+                                "splitdirfdstream write_external {relpath}: {e}"
+                            ))
+                        })?;
+                } else {
+                    let mut buf = vec![0u8; size as usize];
+                    let mut off = 0u64;
+                    while (off as usize) < buf.len() {
+                        let n =
+                            rustix::io::pread(&fd, &mut buf[off as usize..], off).map_err(|e| {
+                                crate::error::StorageError::TarSplitError(format!(
+                                    "pread failed for {relpath}: {e}"
+                                ))
+                            })?;
+                        if n == 0 {
+                            return Err(crate::error::StorageError::TarSplitError(format!(
+                                "unexpected EOF reading {relpath}"
+                            )));
+                        }
+                        off += n as u64;
+                    }
+                    writer.write_inline_data(&buf).map_err(|e| {
+                        crate::error::StorageError::TarSplitError(format!(
+                            "write_file_content {relpath}: {e}"
+                        ))
+                    })?;
+                }
+                let pad = (size.next_multiple_of(512) - size) as usize;
+                if pad > 0 {
+                    writer.write_metadata(&ZERO_PADDING[..pad]).map_err(|e| {
+                        crate::error::StorageError::TarSplitError(format!(
+                            "splitdirfdstream write_inline padding: {e}"
+                        ))
+                    })?;
+                }
+                prev_pad = pad;
+            }
+        }
+    }
+
+    writer.finish().map_err(|e| {
+        crate::error::StorageError::TarSplitError(format!("splitdirfdstream finish: {e}"))
+    })?;
+    Ok(())
+}
+
+/// Open one diff-directory fd per layer in the chain, in chain order.
+///
+/// Returns `(real_fds, link_ids)` where `real_fds[i]` is the diff-dir fd for
+/// `link_ids[i]`.  The sparse-slot placement is **not** done here; callers
+/// should pass `real_fds` to [`composefs_splitdirfdstream::build_layer_fd_layout`]
+/// which fills dummy slots using the canonical ordinal-parity rule.  The
+/// returned `link_ids` can then be zipped with `layout.real_indices` to build
+/// the `link_id → dirfd_slot_index` map required by [`produce_splitdirfdstream`].
+pub(crate) fn open_layer_diff_fds(
+    storage: &Storage,
+    layer: &Layer,
+) -> crate::Result<(Vec<OwnedFd>, Vec<String>)> {
+    use std::os::fd::AsFd as _;
+    let chain = crate::layer::Layer::open(storage, layer.id())?
+        .layer_chain(storage)
+        .map_err(|e| crate::error::StorageError::Io(std::io::Error::other(format!("{e:#}"))))?;
+
+    let mut real_fds = Vec::with_capacity(chain.len());
+    let mut link_ids = Vec::with_capacity(chain.len());
+
+    for l in &chain {
+        let fd = rustix::io::dup(l.diff_dir().as_fd())
+            .map_err(|e| crate::error::StorageError::Io(std::io::Error::from(e)))?;
+        real_fds.push(fd);
+        link_ids.push(l.link_id().to_string());
+    }
+
+    Ok((real_fds, link_ids))
+}
+
+// ── zlink service impl ────────────────────────────────────────────────────────
+
+mod service_impl {
+    #![allow(missing_docs)]
+
+    use super::{
+        CstorLayerService, CstorOciError, GetInfoReply, GetLayerParams, GetLayerReply, Layer,
+        LayerStoreLock, MAX_FDS_PER_FRAME, Storage, build_layer_fd_layout, open_layer_diff_fds,
+        produce_splitdirfdstream, seed_from_id, spawn_self_reaping_producer, split_fds_into_frames,
+    };
+
+    #[zlink::service(
+        interface = "org.composefs.Oci",
+        vendor = "org.composefs",
+        product = "composefs-storage",
+        version = env!("CARGO_PKG_VERSION"),
+        url = "https://github.com/composefs/composefs-rs"
+    )]
+    impl<Sock> CstorLayerService {
+        /// Advertise the capabilities of this service.
+        async fn get_info(&self) -> std::result::Result<GetInfoReply, CstorOciError> {
+            Ok(GetInfoReply {
+                features: vec![
+                    "splitdirfdstream-v0".into(),
+                    "source-containers-storage".into(),
+                    "read-only".into(),
+                ],
+            })
+        }
+
+        /// Stream a layer from containers-storage as a `splitdirfdstream`.
+        ///
+        /// `params.storage` must be set; `params.diff_id` is ignored.
+        ///
+        /// The self-reaping producer task:
+        ///   Phase 1 — produce stream into data pipe; drop write end (data EOF).
+        ///   Phase 2 — wait for keepalive-pipe EOF (consumer finished).
+        ///   Phase 3 — drop lock guard + all resources.
+        #[zlink(more, return_fds)]
+        async fn get_layer(
+            &self,
+            more: bool,
+            handle: u64,
+            params: GetLayerParams,
+            #[zlink(fds)] _fds: Vec<std::os::fd::OwnedFd>,
+        ) -> impl zlink::futures_util::Stream<
+            Item = (
+                std::result::Result<zlink::Reply<GetLayerReply>, CstorOciError>,
+                Vec<std::os::fd::OwnedFd>,
+            ),
+        > + Unpin {
+            use zlink::futures_util::stream::{self, StreamExt as _};
+
+            let _ = handle; // stateless — handle is ignored
+
+            type StreamItem = (
+                std::result::Result<zlink::Reply<GetLayerReply>, CstorOciError>,
+                Vec<std::os::fd::OwnedFd>,
+            );
+
+            macro_rules! err_stream {
+                ($e:expr) => {
+                    return stream::iter(std::iter::once::<StreamItem>((Err($e), vec![])))
+                        .left_stream()
+                };
+            }
+
+            // Extract storage locator (required).
+            let locator = match params.storage {
+                Some(l) => l,
+                None => err_stream!(CstorOciError::InvalidRequest {
+                    message: "GetLayer: params.storage is required for the cstor service".into(),
+                }),
+            };
+            let storage_path = locator.storage_path;
+            let layer_id = locator.layer_id;
+
+            // Open storage and locate the layer.
+            let storage = match Storage::open(&storage_path) {
+                Ok(s) => s,
+                Err(e) => err_stream!(CstorOciError::RepoNotFound {
+                    message: format!("{e:#}"),
+                }),
+            };
+            let layer = match Layer::open(&storage, &layer_id) {
+                Ok(l) => l,
+                Err(_) => err_stream!(CstorOciError::NoSuchLayer {
+                    diff_id: layer_id.clone(),
+                }),
+            };
+
+            // Seed for sparse layout (derived from layer_id).
+            let seed = seed_from_id(&layer_id);
+
+            // Acquire shared lock FIRST (before opening diff-dir fds).
+            // Lock → open ordering is atomic w.r.t. concurrent `podman rmi`.
+            let lock: LayerStoreLock = match storage.lock_layers_shared() {
+                Ok(l) => l,
+                Err(e) => err_stream!(CstorOciError::InternalError {
+                    message: format!("lock_layers_shared: {e:#}"),
+                }),
+            };
+
+            // Open one real diff-dir fd per chain layer (in chain order).
+            // Sparse placement and dummy-fd insertion are delegated to
+            // build_layer_fd_layout so that both paths share one implementation.
+            let (real_fds, link_ids) = match open_layer_diff_fds(&storage, &layer) {
+                Ok(pair) => pair,
+                Err(e) => err_stream!(CstorOciError::InternalError {
+                    message: format!("open diff dirs: {e:#}"),
+                }),
+            };
+
+            // Create the data pipe.
+            let (read_fd, write_fd) =
+                match rustix::pipe::pipe_with(rustix::pipe::PipeFlags::CLOEXEC) {
+                    Ok(p) => p,
+                    Err(e) => err_stream!(CstorOciError::InternalError {
+                        message: format!("pipe: {e}"),
+                    }),
+                };
+
+            // Build the full sparse fd layout (dirfds region + keepalive + extras).
+            // This is the same call the repo path makes, ensuring one canonical
+            // dummy-fd parity rule (ordinal-based, not slot-value-based).
+            let layout = match build_layer_fd_layout(read_fd, real_fds, seed) {
+                Ok(l) => l,
+                Err(e) => err_stream!(CstorOciError::InternalError {
+                    message: format!("build_layer_fd_layout: {e}"),
+                }),
+            };
+            let dir_count = layout.dir_count;
+            // Build the link_id → sparse-slot-index map from the layout's real_indices.
+            // layout.real_indices[i] is the slot where chain layer i's real fd was placed.
+            let link_id_to_dirfd: std::collections::HashMap<String, u32> = link_ids
+                .into_iter()
+                .zip(layout.real_indices.iter().copied())
+                .collect();
+            let real_indices: std::collections::HashSet<u32> =
+                layout.real_indices.iter().copied().collect();
+
+            // Split into transport frames (with more=false guard).
+            let (batches, n_frames) = match split_fds_into_frames(layout.fds_all, seed, more) {
+                Ok(b) => {
+                    let n = b.len();
+                    (b, n)
+                }
+                Err((_fds, e)) => {
+                    err_stream!(CstorOciError::FdLimitExceeded {
+                        fd_count: e.fd_count as u64,
+                        max_per_frame: MAX_FDS_PER_FRAME as u64,
+                    })
+                }
+            };
+
+            // Spawn the 3-phase self-reaping producer task via the shared helper.
+            // Phase 1: produce; drop write_fd (data-pipe EOF).
+            // Phase 2: keepalive-pipe EOF wait (consumer finished).
+            // Phase 3: drop lock (releases shared flock).
+            let keepalive_read = layout.keepalive_read;
+            spawn_self_reaping_producer(write_fd, keepalive_read, lock, move |wf| {
+                if let Err(e) =
+                    produce_splitdirfdstream(&storage, &layer, &link_id_to_dirfd, &real_indices, wf)
+                {
+                    tracing::warn!("cstor producer error: {e:#}");
+                }
+            });
+
+            stream::iter(batches.into_iter().enumerate().map(move |(i, batch)| {
+                let is_last = i == n_frames - 1;
+                (
+                    Ok(zlink::Reply::new(Some(GetLayerReply { dir_count }))
+                        .set_continues(Some(!is_last))),
+                    batch,
+                )
+            }))
+            .right_stream()
+        }
+    }
+}
+
+// ── In-process transport ──────────────────────────────────────────────────────
+
+/// Handle to an in-process [`CstorLayerService`] server.
+///
+/// Drop or call [`InProcessServer::shutdown`] when done.
+#[derive(Debug)]
+pub struct InProcessServer {
+    shutdown: Option<tokio::sync::oneshot::Sender<()>>,
+    thread: Option<std::thread::JoinHandle<()>>,
+}
+
+impl InProcessServer {
+    /// Signal the server to stop and reap its thread.
+    pub async fn shutdown(mut self) {
+        if let Some(tx) = self.shutdown.take() {
+            let _ = tx.send(());
+        }
+        if let Some(thread) = self.thread.take() {
+            let _ = tokio::task::spawn_blocking(move || thread.join()).await;
+        }
+    }
+}
+
+impl Drop for InProcessServer {
+    fn drop(&mut self) {
+        if let Some(tx) = self.shutdown.take() {
+            let _ = tx.send(());
+        }
+    }
+}
+
+/// Spawn a [`CstorLayerService`] in-process over a Unix socket pair.
+///
+/// Returns a connected `zlink::unix::Connection` and an [`InProcessServer`]
+/// handle for shutdown.
+pub fn spawn_in_process(
+    service: CstorLayerService,
+) -> std::io::Result<(zlink::unix::Connection, InProcessServer)> {
+    let (client_std, server_std) = std::os::unix::net::UnixStream::pair()?;
+    client_std.set_nonblocking(true)?;
+    server_std.set_nonblocking(true)?;
+
+    let client_stream = tokio::net::UnixStream::from_std(client_std)?;
+    let client_zlink = zlink::unix::Stream::from(client_stream);
+    let client_conn = zlink::Connection::from(client_zlink);
+
+    let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel::<()>();
+
+    let handle = std::thread::Builder::new()
+        .name("cstor-oci-server".into())
+        .spawn(move || {
+            let rt = match tokio::runtime::Builder::new_current_thread()
+                .enable_all()
+                .build()
+            {
+                Ok(rt) => rt,
+                Err(e) => {
+                    tracing::error!("CstorLayerService server runtime build failed: {e:#?}");
+                    return;
+                }
+            };
+            let local = tokio::task::LocalSet::new();
+            local.block_on(&rt, async move {
+                let server_stream = match tokio::net::UnixStream::from_std(server_std) {
+                    Ok(s) => s,
+                    Err(e) => {
+                        tracing::error!(
+                            "CstorLayerService server stream conversion failed: {e:#?}"
+                        );
+                        return;
+                    }
+                };
+                let server_zlink = zlink::unix::Stream::from(server_stream);
+                let listener = zlink::ReadyListener::new(server_zlink);
+                let server = zlink::Server::new(listener, service);
+
+                tokio::select! {
+                    res = server.run() => {
+                        if let Err(e) = res {
+                            tracing::warn!("CstorLayerService in-process server error: {e:#?}");
+                        }
+                    }
+                    _ = shutdown_rx => {
+                        tracing::trace!("CstorLayerService in-process server received stop signal");
+                    }
+                }
+            });
+        })?;
+
+    Ok((
+        client_conn,
+        InProcessServer {
+            shutdown: Some(shutdown_tx),
+            thread: Some(handle),
+        },
+    ))
+}
+
+/// Serve the `CstorLayerService` on `socket` until the peer disconnects.
+///
+/// Intended for the userns helper subprocess: the helper is shut down by the
+/// parent (`cfsctl`) `kill()`ing it once the import is done (see
+/// [`crate::userns_helper::StorageProxy`]). Abnormal parent death is covered by
+/// the helper's `PR_SET_PDEATHSIG` net (see `init_if_helper`), so there is no
+/// need to detect peer close here.
+pub fn serve_on_socket_blocking(socket: std::os::unix::net::UnixStream) -> std::io::Result<()> {
+    socket.set_nonblocking(true)?;
+
+    let rt = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()?;
+
+    let local = tokio::task::LocalSet::new();
+    local.block_on(&rt, async move {
+        // `from_std` must be called inside the runtime context.
+        let stream = tokio::net::UnixStream::from_std(socket)?;
+        let zs = zlink::unix::Stream::from(stream);
+        let listener = zlink::ReadyListener::new(zs);
+        let server = zlink::Server::new(listener, CstorLayerService);
+        server
+            .run()
+            .await
+            .map_err(|e| std::io::Error::other(format!("{e:#?}")))?;
+        Ok(())
+    })
+}
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+    use std::io::Read as _;
+    use std::os::fd::{AsFd as _, BorrowedFd};
+
+    use composefs_splitdirfdstream::{SplitdirfdstreamReader, build_layer_fd_layout, seed_from_id};
+    use flate2::Compression;
+    use flate2::write::GzEncoder;
+    use tempfile::TempDir;
+
+    use super::*;
+    use crate::layer::Layer;
+    use crate::storage::Storage;
+
+    // ── Shared test fixture ──────────────────────────────────────────────────
+
+    fn create_two_layer_mock(root: &std::path::Path) -> Storage {
+        for d in ["overlay", "overlay-layers", "overlay-images"] {
+            std::fs::create_dir_all(root.join(d)).unwrap();
+        }
+
+        let child_id = "child-layer-001";
+        let child_link = "CHILDLINKID000000000000000";
+        let child_diff = root.join("overlay").join(child_id).join("diff");
+        std::fs::create_dir_all(child_diff.join("etc")).unwrap();
+        std::fs::write(child_diff.join("etc/child.txt"), b"child content!").unwrap();
+        std::fs::write(root.join("overlay").join(child_id).join("link"), child_link).unwrap();
+
+        let parent_id = "parent-layer-001";
+        let parent_link = "PARENTLINKID000000000000000";
+        let parent_diff = root.join("overlay").join(parent_id).join("diff");
+        std::fs::create_dir_all(parent_diff.join("etc")).unwrap();
+        std::fs::write(parent_diff.join("etc/parent.txt"), b"parent content!").unwrap();
+        std::fs::write(
+            root.join("overlay").join(parent_id).join("link"),
+            parent_link,
+        )
+        .unwrap();
+
+        std::fs::write(
+            root.join("overlay").join(child_id).join("lower"),
+            format!("l/{}", parent_link),
+        )
+        .unwrap();
+
+        let l_dir = root.join("overlay").join("l");
+        std::fs::create_dir_all(&l_dir).unwrap();
+        std::os::unix::fs::symlink(format!("../{}/diff", child_id), l_dir.join(child_link))
+            .unwrap();
+        std::os::unix::fs::symlink(format!("../{}/diff", parent_id), l_dir.join(parent_link))
+            .unwrap();
+
+        let ndjson = concat!(
+            r#"{"type":1,"name":"./etc/child.txt","size":14}"#,
+            "\n",
+            r#"{"type":1,"name":"./etc/parent.txt","size":15}"#,
+            "\n",
+        );
+        let gz_path = root
+            .join("overlay-layers")
+            .join(format!("{}.tar-split.gz", child_id));
+        let gz_file = std::fs::File::create(&gz_path).unwrap();
+        let mut encoder = GzEncoder::new(gz_file, Compression::default());
+        encoder.write_all(ndjson.as_bytes()).unwrap();
+        encoder.finish().unwrap();
+
+        Storage::open(root).unwrap()
+    }
+
+    // Deterministic constants for "child-layer-001".
+    //
+    // These derive from the layer-id seed independently of the sparse-slot
+    // *placement* (which comes from a seeded shuffle): the number of dummy slots,
+    // extra lifetime fds, and transport frames are all fixed for this seed. The
+    // real-layer slot indices themselves are read from the layout at runtime.
+    const LAYER_SEED: u64 = 9_551_015_030_439_334_514;
+    const EXPECTED_DIR_COUNT: u32 = 5;
+    const EXPECTED_N_EXTRA_LIFETIME: usize = 2;
+    const EXPECTED_N_FRAMES: usize = 4;
+
+    /// Build the fd layout for a layer under test and return the full layout
+    /// plus the `link_id → sparse-slot-index` map.
+    ///
+    /// Uses a `/dev/null` fd as a throwaway pipe_read (tests don't drain the
+    /// data pipe; they call `produce_splitdirfdstream` separately into a vec).
+    fn build_test_layout(
+        storage: &Storage,
+        layer: &Layer,
+        seed: u64,
+    ) -> (
+        composefs_splitdirfdstream::LayerFdLayout,
+        HashMap<String, u32>,
+        std::collections::HashSet<u32>,
+    ) {
+        let (real_fds, link_ids) = open_layer_diff_fds(storage, layer).unwrap();
+        let dummy_pipe_read = composefs_splitdirfdstream::open_devnull().unwrap();
+        let layout = build_layer_fd_layout(dummy_pipe_read, real_fds, seed).unwrap();
+        let real_indices: std::collections::HashSet<u32> =
+            layout.real_indices.iter().copied().collect();
+        let map: HashMap<String, u32> = link_ids
+            .into_iter()
+            .zip(layout.real_indices.iter().copied())
+            .collect();
+        (layout, map, real_indices)
+    }
+
+    // ── C1: sparse layout + producer ────────────────────────────────────────
+
+    #[test]
+    fn test_produce_splitdirfdstream_chunk_sequence() {
+        let tmp = TempDir::new().unwrap();
+        let storage = create_two_layer_mock(tmp.path());
+        let layer = Layer::open(&storage, "child-layer-001").unwrap();
+
+        let seed = seed_from_id("child-layer-001");
+        assert_eq!(seed, LAYER_SEED, "seed must match precomputed value");
+
+        let (layout, link_id_to_dirfd, real_indices) = build_test_layout(&storage, &layer, seed);
+
+        let child_link = "CHILDLINKID000000000000000";
+        let parent_link = "PARENTLINKID000000000000000";
+
+        // The real-layer slot positions come from the seeded shuffle; rather
+        // than pin them to specific values, take them from the layout map and
+        // assert the produced chunks reference those same slots.
+        let child_idx = link_id_to_dirfd[child_link];
+        let parent_idx = link_id_to_dirfd[parent_link];
+
+        // dir_count = total slot count including dummies.
+        assert_eq!(layout.dir_count, EXPECTED_DIR_COUNT);
+        assert!(child_idx < EXPECTED_DIR_COUNT && parent_idx < EXPECTED_DIR_COUNT);
+        assert_ne!(child_idx, parent_idx, "real layers occupy distinct slots");
+
+        let mut buf = Vec::<u8>::new();
+        produce_splitdirfdstream(&storage, &layer, &link_id_to_dirfd, &real_indices, &mut buf)
+            .unwrap();
+
+        let mut reader = SplitdirfdstreamReader::new(buf.as_slice());
+
+        // child.txt → FileBackedData at child's sparse slot.
+        match reader.next_chunk().unwrap().expect("chunk 1") {
+            composefs_splitdirfdstream::Chunk::FileBackedData {
+                dirfd_index,
+                length,
+                filename,
+            } => {
+                assert_eq!(dirfd_index, child_idx);
+                assert_eq!(length, 14);
+                assert_eq!(filename, b"etc/child.txt");
+            }
+            other => panic!("expected FileBackedData, got {other:?}"),
+        }
+
+        // 498 bytes padding.
+        match reader.next_chunk().unwrap().expect("chunk 2") {
+            composefs_splitdirfdstream::Chunk::Metadata(data) => {
+                assert_eq!(data.len(), 498);
+                assert!(data.iter().all(|&b| b == 0));
+            }
+            other => panic!("expected Metadata (padding), got {other:?}"),
+        }
+
+        // parent.txt → FileBackedData at parent's sparse slot.
+        match reader.next_chunk().unwrap().expect("chunk 3") {
+            composefs_splitdirfdstream::Chunk::FileBackedData {
+                dirfd_index,
+                length,
+                filename,
+            } => {
+                assert_eq!(dirfd_index, parent_idx);
+                assert_eq!(length, 15);
+                assert_eq!(filename, b"etc/parent.txt");
+            }
+            other => panic!("expected FileBackedData, got {other:?}"),
+        }
+
+        // 497 bytes padding.
+        match reader.next_chunk().unwrap().expect("chunk 4") {
+            composefs_splitdirfdstream::Chunk::Metadata(data) => {
+                assert_eq!(data.len(), 497);
+                assert!(data.iter().all(|&b| b == 0));
+            }
+            other => panic!("expected Metadata (padding), got {other:?}"),
+        }
+
+        assert!(reader.next_chunk().unwrap().is_none(), "expected EOF");
+    }
+
+    #[test]
+    fn test_non_world_readable_file_uses_file_content() {
+        use std::os::unix::fs::PermissionsExt as _;
+
+        let tmp = TempDir::new().unwrap();
+        let storage = create_two_layer_mock(tmp.path());
+
+        let child_txt = tmp
+            .path()
+            .join("overlay/child-layer-001/diff/etc/child.txt");
+        std::fs::set_permissions(&child_txt, std::fs::Permissions::from_mode(0o640)).unwrap();
+
+        let layer = Layer::open(&storage, "child-layer-001").unwrap();
+        let seed = seed_from_id("child-layer-001");
+        let (_, link_id_to_dirfd, real_indices) = build_test_layout(&storage, &layer, seed);
+        let mut buf = Vec::<u8>::new();
+        produce_splitdirfdstream(&storage, &layer, &link_id_to_dirfd, &real_indices, &mut buf)
+            .unwrap();
+
+        let mut reader = SplitdirfdstreamReader::new(buf.as_slice());
+        let mut saw_file_content = false;
+        while let Some(chunk) = reader.next_chunk().unwrap() {
+            match chunk {
+                composefs_splitdirfdstream::Chunk::Metadata(_) => {}
+                composefs_splitdirfdstream::Chunk::InlineData(data) => {
+                    assert_eq!(data, b"child content!");
+                    saw_file_content = true;
+                    break;
+                }
+                composefs_splitdirfdstream::Chunk::FileBackedData { filename, .. } => {
+                    panic!(
+                        "non-world-readable must produce InlineData, got FileBackedData({:?})",
+                        std::str::from_utf8(filename).unwrap_or("<invalid utf8>")
+                    );
+                }
+            }
+        }
+        assert!(saw_file_content, "expected InlineData chunk for child.txt");
+    }
+
+    #[test]
+    fn test_per_layer_diff_fds() {
+        let tmp = TempDir::new().unwrap();
+        let storage = create_two_layer_mock(tmp.path());
+        let layer = Layer::open(&storage, "child-layer-001").unwrap();
+
+        let seed = seed_from_id("child-layer-001");
+        let (layout, link_id_to_dirfd, _real_indices) = build_test_layout(&storage, &layer, seed);
+
+        // dir_count counts the full sparse region (real + dummy slots).
+        assert_eq!(layout.dir_count, EXPECTED_DIR_COUNT);
+
+        // Real-layer slots come from the seeded shuffle; read them from the map.
+        let child_idx = link_id_to_dirfd["CHILDLINKID000000000000000"];
+        let parent_idx = link_id_to_dirfd["PARENTLINKID000000000000000"];
+        assert_ne!(child_idx, parent_idx, "real layers occupy distinct slots");
+
+        // fds_all layout: [dummy_pipe, dirfds region (dir_count), keepalive_write, extras]
+        // dirfds region starts at index 1.
+        let child_fd = layout.fds_all[1 + child_idx as usize].as_fd();
+        assert!(
+            composefs_splitdirfdstream::open_beneath(child_fd, b"etc/child.txt").is_ok(),
+            "child diff-dir must allow opening etc/child.txt"
+        );
+
+        let parent_fd = layout.fds_all[1 + parent_idx as usize].as_fd();
+        assert!(
+            composefs_splitdirfdstream::open_beneath(parent_fd, b"etc/parent.txt").is_ok(),
+            "parent diff-dir must allow opening etc/parent.txt"
+        );
+
+        // Every other slot in the dirfds region is a dummy and must not open
+        // any real file.
+        for dummy_slot in (0..EXPECTED_DIR_COUNT).filter(|s| *s != child_idx && *s != parent_idx) {
+            assert!(
+                composefs_splitdirfdstream::open_beneath(
+                    layout.fds_all[1 + dummy_slot as usize].as_fd(),
+                    b"etc/child.txt"
+                )
+                .is_err(),
+                "dummy slot {dummy_slot} must not open real files"
+            );
+        }
+    }
+
+    #[test]
+    fn test_reconstruct_produces_file_content() {
+        let tmp = TempDir::new().unwrap();
+        let storage = create_two_layer_mock(tmp.path());
+        let layer = Layer::open(&storage, "child-layer-001").unwrap();
+
+        let seed = seed_from_id("child-layer-001");
+        let (layout, link_id_to_dirfd, real_indices) = build_test_layout(&storage, &layer, seed);
+        assert_eq!(layout.dir_count, EXPECTED_DIR_COUNT);
+
+        let mut buf = Vec::<u8>::new();
+        produce_splitdirfdstream(&storage, &layer, &link_id_to_dirfd, &real_indices, &mut buf)
+            .unwrap();
+
+        // The dirfds region in fds_all starts at index 1.
+        let dir_count = layout.dir_count as usize;
+        let borrowed_fds: Vec<BorrowedFd<'_>> = layout.fds_all[1..=dir_count]
+            .iter()
+            .map(|f| f.as_fd())
+            .collect();
+        let mut out = Vec::<u8>::new();
+        let bytes_written =
+            composefs_splitdirfdstream::reconstruct(buf.as_slice(), &borrowed_fds, &mut out)
+                .unwrap();
+
+        assert!(bytes_written > 0);
+        assert!(
+            out.windows(b"child content!".len())
+                .any(|w| w == b"child content!")
+        );
+        assert!(
+            out.windows(b"parent content!".len())
+                .any(|w| w == b"parent content!")
+        );
+        assert_eq!(bytes_written, 1024);
+    }
+
+    // ── C2/C3: CstorLayerService in-process GetLayer round-trip ─────────────
+
+    /// Proxy trait for the org.composefs.Oci interface (cstor client side).
+    #[zlink::proxy(interface = "org.composefs.Oci")]
+    trait CstorOciProxy {
+        async fn get_info(
+            &mut self,
+        ) -> zlink::Result<std::result::Result<GetInfoReply, CstorOciError>>;
+
+        #[zlink(more, return_fds)]
+        async fn get_layer(
+            &mut self,
+            handle: u64,
+            params: GetLayerParams,
+        ) -> zlink::Result<
+            impl zlink::futures_util::Stream<
+                Item = zlink::Result<(
+                    std::result::Result<GetLayerReply, CstorOciError>,
+                    Vec<std::os::fd::OwnedFd>,
+                )>,
+            >,
+        >;
+    }
+
+    /// Collect all frames from a streaming get_layer call.
+    async fn collect_get_layer(
+        client: &mut zlink::unix::Connection,
+        storage_path: &str,
+        layer_id: &str,
+    ) -> (GetLayerReply, Vec<OwnedFd>, usize) {
+        use zlink::futures_util::StreamExt as _;
+
+        let params = GetLayerParams {
+            diff_id: None,
+            storage: Some(StorageLocator {
+                storage_path: storage_path.to_owned(),
+                layer_id: layer_id.to_owned(),
+            }),
+        };
+
+        let mut stream = std::pin::pin!(client.get_layer(0, params).await.unwrap());
+        let mut all_fds: Vec<OwnedFd> = Vec::new();
+        let mut last_reply: Option<GetLayerReply> = None;
+        let mut frame_count = 0usize;
+
+        while let Some(item) = stream.next().await {
+            let (result, fds) = item.unwrap();
+            last_reply = Some(result.unwrap());
+            all_fds.extend(fds);
+            frame_count += 1;
+        }
+
+        (last_reply.expect("no frames"), all_fds, frame_count)
+    }
+
+    #[tokio::test(flavor = "multi_thread")]
+    async fn test_cstor_oci_get_layer_in_process() {
+        let tmp = TempDir::new().unwrap();
+        let _ = create_two_layer_mock(tmp.path());
+        let storage_path = tmp.path().to_str().unwrap().to_string();
+
+        let (mut client, server) = spawn_in_process(CstorLayerService).unwrap();
+
+        // GetInfo.
+        let info = client.get_info().await.unwrap().unwrap();
+        assert!(info.features.contains(&"splitdirfdstream-v0".to_string()));
+        assert!(
+            info.features
+                .contains(&"source-containers-storage".to_string())
+        );
+
+        // GetLayer — full round-trip.
+        let (reply, all_fds, frame_count) =
+            collect_get_layer(&mut client, &storage_path, "child-layer-001").await;
+
+        assert_eq!(frame_count, EXPECTED_N_FRAMES);
+        assert_eq!(reply.dir_count, EXPECTED_DIR_COUNT);
+
+        let expected_total = 1 + EXPECTED_DIR_COUNT as usize + 1 + EXPECTED_N_EXTRA_LIFETIME;
+        assert_eq!(all_fds.len(), expected_total);
+
+        let mut it = all_fds.into_iter();
+        let pipe_fd = it.next().unwrap();
+        let dir_fds: Vec<OwnedFd> = it.by_ref().take(reply.dir_count as usize).collect();
+        let lifetime_fds: Vec<OwnedFd> = it.collect();
+
+        // Verify the real diff-dirs are present somewhere in the sparse region.
+        // The exact slots come from the seeded shuffle and are not known to the
+        // client; the stream reconstruction below proves the producer used the
+        // right indices.
+        assert!(
+            dir_fds
+                .iter()
+                .any(
+                    |fd| composefs_splitdirfdstream::open_beneath(fd.as_fd(), b"etc/child.txt")
+                        .is_ok()
+                ),
+            "some dir slot must open etc/child.txt"
+        );
+        assert!(
+            dir_fds
+                .iter()
+                .any(
+                    |fd| composefs_splitdirfdstream::open_beneath(fd.as_fd(), b"etc/parent.txt")
+                        .is_ok()
+                ),
+            "some dir slot must open etc/parent.txt"
+        );
+
+        // Read and reconstruct the stream.
+        let pipe_dup = rustix::io::dup(pipe_fd.as_fd()).unwrap();
+        drop(pipe_fd);
+        let borrowed: Vec<BorrowedFd<'_>> = dir_fds.iter().map(|f| f.as_fd()).collect();
+
+        let mut stream_bytes = Vec::new();
+        std::fs::File::from(pipe_dup)
+            .read_to_end(&mut stream_bytes)
+            .unwrap();
+
+        let mut reconstructed = Vec::new();
+        let n = composefs_splitdirfdstream::reconstruct(
+            stream_bytes.as_slice(),
+            &borrowed,
+            &mut reconstructed,
+        )
+        .unwrap();
+        assert_eq!(n, 1024);
+        assert!(
+            reconstructed
+                .windows(b"child content!".len())
+                .any(|w| w == b"child content!")
+        );
+        assert!(
+            reconstructed
+                .windows(b"parent content!".len())
+                .any(|w| w == b"parent content!")
+        );
+
+        // Drop lifetime fds → signals producer (keepalive EOF).
+        drop(lifetime_fds);
+
+        drop(client);
+        server.shutdown().await;
+    }
+
+    /// Regression test: the self-reaping producer must not deadlock.
+    #[test]
+    fn test_in_process_teardown_does_not_deadlock() {
+        let tmp = TempDir::new().unwrap();
+        let _ = create_two_layer_mock(tmp.path());
+        let storage_path = tmp.path().to_str().unwrap().to_string();
+
+        let worker = std::thread::spawn(move || {
+            let rt = tokio::runtime::Builder::new_multi_thread()
+                .worker_threads(2)
+                .enable_all()
+                .build()
+                .unwrap();
+
+            rt.block_on(async move {
+                let (mut client, server) = spawn_in_process(CstorLayerService).unwrap();
+
+                let (reply, all_fds, frame_count) =
+                    collect_get_layer(&mut client, &storage_path, "child-layer-001").await;
+
+                assert_eq!(frame_count, EXPECTED_N_FRAMES);
+                assert_eq!(reply.dir_count, EXPECTED_DIR_COUNT);
+
+                let mut it = all_fds.into_iter();
+                let pipe_fd = it.next().unwrap();
+                let _dir_fds: Vec<OwnedFd> = it.by_ref().take(reply.dir_count as usize).collect();
+                let lifetime_fds: Vec<OwnedFd> = it.collect();
+
+                let pipe_dup = rustix::io::dup(pipe_fd.as_fd()).unwrap();
+                drop(pipe_fd);
+
+                tokio::task::spawn_blocking(move || {
+                    let _lifetime_fds = lifetime_fds;
+                    let mut f = std::fs::File::from(pipe_dup);
+                    let mut buf = Vec::new();
+                    f.read_to_end(&mut buf).unwrap();
+                    assert!(!buf.is_empty());
+                })
+                .await
+                .unwrap();
+
+                drop(client);
+                server.shutdown().await;
+            });
+        });
+
+        let deadline = std::time::Instant::now() + std::time::Duration::from_secs(5);
+        while !worker.is_finished() {
+            if std::time::Instant::now() >= deadline {
+                panic!("producer deadlocked: worker did not finish within 5s");
+            }
+            std::thread::sleep(std::time::Duration::from_millis(10));
+        }
+        worker.join().expect("worker thread panicked");
+    }
+}
diff --git a/crates/composefs-storage/src/layer.rs b/crates/composefs-storage/src/layer.rs
index 9a05a6b9..3f877d16 100644
--- a/crates/composefs-storage/src/layer.rs
+++ b/crates/composefs-storage/src/layer.rs
@@ -155,33 +155,21 @@ impl Layer {
     ///
     /// Returns layers in order: [self, parent, grandparent, ..., base]
     ///
+    /// The overlay `lower` file already lists the complete ordered ancestor
+    /// chain (immediate parent first, base last), so the chain is simply
+    /// `self` followed by each resolved parent. No recursive expansion needed.
+    ///
     /// # Errors
     ///
-    /// Returns an error if the layer chain exceeds the maximum depth of 500 layers.
+    /// Returns an error if any parent layer cannot be resolved or opened.
     pub fn layer_chain(self, storage: &Storage) -> Result<Vec<Layer>> {
-        let mut chain = vec![self];
-        let mut current_idx = 0;
-
-        // Maximum depth to prevent infinite loops
-        const MAX_DEPTH: usize = 500;
-
-        while current_idx < chain.len() && chain.len() < MAX_DEPTH {
-            let parent_ids = chain[current_idx].parents(storage)?;
-
-            // Add all parents to the chain
-            for parent_id in parent_ids {
-                chain.push(Layer::open(storage, &parent_id)?);
-            }
-
-            current_idx += 1;
-        }
-
-        if chain.len() >= MAX_DEPTH {
-            return Err(StorageError::InvalidStorage(
-                "Layer chain exceeds maximum depth of 500".to_string(),
-            ));
+        // Call parents() before moving self into the vec.
+        let parent_ids = self.parents(storage)?;
+        let mut chain = Vec::with_capacity(1 + parent_ids.len());
+        chain.push(self);
+        for id in parent_ids {
+            chain.push(Layer::open(storage, &id)?);
         }
-
         Ok(chain)
     }
 
@@ -309,6 +297,81 @@ mod tests {
         Layer::open(&storage, layer_id).unwrap()
     }
 
+    // --- layer_chain tests ---
+
+    /// Create a 3-layer mock: A → B → C (A's lower lists B and C).
+    ///
+    /// The overlay `lower` file for layer A is `l/<B_link>:l/<C_link>`, meaning
+    /// B is the immediate parent and C is the base.  `layer_chain` should return
+    /// exactly [A, B, C].
+    fn create_three_layer_mock(root: &Path) -> Storage {
+        for d in ["overlay", "overlay/l", "overlay-layers", "overlay-images"] {
+            std::fs::create_dir_all(root.join(d)).unwrap();
+        }
+
+        let layers = [
+            ("layer-a", "AAAAAAAAAAAAAAAAAAAAAAAAAA"),
+            ("layer-b", "BBBBBBBBBBBBBBBBBBBBBBBBBB"),
+            ("layer-c", "CCCCCCCCCCCCCCCCCCCCCCCCCC"),
+        ];
+
+        for (id, link) in &layers {
+            let layer_dir = root.join("overlay").join(id);
+            std::fs::create_dir_all(layer_dir.join("diff")).unwrap();
+            std::fs::write(layer_dir.join("link"), link).unwrap();
+            // Create symlink overlay/l/<link> → ../<id>/diff
+            std::os::unix::fs::symlink(
+                format!("../{}/diff", id),
+                root.join("overlay/l").join(link),
+            )
+            .unwrap();
+        }
+
+        // Layer A's lower: B is immediate parent, C is base.
+        std::fs::write(
+            root.join("overlay/layer-a/lower"),
+            format!(
+                "l/{}:l/{}",
+                layers[1].1, // B link
+                layers[2].1, // C link
+            ),
+        )
+        .unwrap();
+        // Layer B has C as its parent (single entry).
+        std::fs::write(
+            root.join("overlay/layer-b/lower"),
+            format!("l/{}", layers[2].1),
+        )
+        .unwrap();
+        // Layer C is the base — no lower file.
+
+        Storage::open(root).unwrap()
+    }
+
+    /// Regression test: `layer_chain` must return exactly [A, B, C] for a
+    /// 3-layer chain where A's `lower` already lists both B and C.
+    ///
+    /// The old BFS expansion would open B, then expand B's parents (C), giving
+    /// [A, B, C, C] — duplicating C and growing exponentially for real images.
+    #[test]
+    fn test_layer_chain_no_bfs_expansion() {
+        let dir = tempfile::tempdir().unwrap();
+        let storage = create_three_layer_mock(dir.path());
+
+        let layer_a = Layer::open(&storage, "layer-a").unwrap();
+        let chain = layer_a.layer_chain(&storage).unwrap();
+
+        assert_eq!(
+            chain.len(),
+            3,
+            "layer_chain must return exactly 3 layers [A, B, C], got {}",
+            chain.len()
+        );
+        assert_eq!(chain[0].id(), "layer-a");
+        assert_eq!(chain[1].id(), "layer-b");
+        assert_eq!(chain[2].id(), "layer-c");
+    }
+
     // --- has_whiteout tests ---
 
     #[test]
diff --git a/crates/composefs-storage/src/lib.rs b/crates/composefs-storage/src/lib.rs
index b6a5dbbb..83c9f83f 100644
--- a/crates/composefs-storage/src/lib.rs
+++ b/crates/composefs-storage/src/lib.rs
@@ -55,27 +55,35 @@ pub mod config;
 pub mod error;
 pub mod image;
 pub mod layer;
+pub mod lock;
 pub mod storage;
 pub mod tar_split;
 
+// Stateless CstorLayerService implementing org.composefs.Oci
+#[cfg(feature = "layer-transfer")]
+pub mod cstor_service;
+
 // User namespace support for rootless access
 pub mod userns;
-#[cfg(feature = "userns-helper")]
+#[cfg(feature = "layer-transfer")]
 pub mod userns_helper;
 
 // Re-export commonly used types
 pub use config::{AdditionalLayerStore, StorageConfig};
+#[cfg(feature = "layer-transfer")]
+pub use cstor_service::{
+    CstorLayerService, InProcessServer as CstorInProcessServer,
+    spawn_in_process as spawn_cstor_in_process,
+};
 pub use error::{Result, StorageError};
 pub use image::Image;
 pub use layer::Layer;
+pub use lock::LayerStoreLock;
 pub use storage::{LayerMetadata, Storage};
 pub use tar_split::{TarHeader, TarSplitFdStream, TarSplitItem};
 pub use userns::can_bypass_file_permissions;
-#[cfg(feature = "userns-helper")]
-pub use userns_helper::{
-    GetImageResult, HelperError, ImageInfo, ProxiedLayerStream, ProxiedTarSplitItem, StorageProxy,
-    init_if_helper,
-};
+#[cfg(feature = "layer-transfer")]
+pub use userns_helper::{HelperError, StorageProxy, init_if_helper};
 
 // Re-export OCI spec types for convenience
 pub use oci_spec::image::{Descriptor, ImageConfiguration, ImageManifest};
diff --git a/crates/composefs-storage/src/lock.rs b/crates/composefs-storage/src/lock.rs
new file mode 100644
index 00000000..c11e203b
--- /dev/null
+++ b/crates/composefs-storage/src/lock.rs
@@ -0,0 +1,150 @@
+//! Layer-store locking for safe concurrent access.
+//!
+//! containers-storage (podman/buildah) uses an `flock(2)`-based protocol on
+//! `<storage_root>/overlay-layers/layers.lock` to coordinate readers and
+//! writers.  Taking a shared (read) lock here ensures that a concurrent
+//! `podman rmi` cannot delete a layer's diff directory while we are
+//! streaming it to the consumer.
+//!
+//! # Why flock?
+//!
+//! * **Interoperability**: containers-storage uses `flock(2)` on this exact
+//!   path; taking the same lock type gives correct mutual exclusion with all
+//!   c/storage users.
+//! * **Auto-release on process death**: the kernel closes all fds and
+//!   releases any flocks when the process exits, even on SIGKILL.  This is
+//!   critical for the userns helper subprocess: if the parent kills it mid-
+//!   transfer, the lock is released automatically, unblocking podman.
+//! * **RAII**: the guard is just a wrapped `OwnedFd`; the lock releases when
+//!   the guard drops (fd close).
+
+use std::os::fd::OwnedFd;
+
+use rustix::fs::{FlockOperation, Mode, OFlags, flock};
+
+use crate::storage::Storage;
+
+/// A shared (read) lock on the containers-storage layer store.
+///
+/// The lock is held for as long as this guard exists; it releases
+/// automatically when the guard is dropped (or when the process exits).
+///
+/// Acquire via [`Storage::lock_layers_shared`].
+///
+/// The lock is released by closing the fd, which happens automatically when
+/// this guard is dropped (the [`OwnedFd`] closes itself) or when the process
+/// exits. No explicit unlock is needed.
+#[derive(Debug)]
+pub struct LayerStoreLock(#[allow(dead_code)] OwnedFd);
+
+impl Storage {
+    /// Acquire a shared (read) flock on `overlay-layers/layers.lock`.
+    ///
+    /// Blocks until the lock is available.  A concurrent writer (e.g.
+    /// `podman rmi`) holds an exclusive lock while removing layers; we block
+    /// until it finishes.  Multiple readers may hold shared locks simultaneously.
+    ///
+    /// The lock is released when the returned [`LayerStoreLock`] is dropped.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `overlay-layers/layers.lock` cannot be opened
+    /// or if the `flock` syscall fails.
+    pub fn lock_layers_shared(&self) -> crate::Result<LayerStoreLock> {
+        use crate::error::StorageError;
+
+        // Open (or create) the lock file with O_RDWR so that a freshly
+        // created file gets the right mode for a later root podman that would
+        // open it O_RDWR.  O_RDONLY|O_CREAT would create a 0-byte file with
+        // mode 0o644 but the file descriptor would be read-only, which is
+        // fine for flock but would break a privileged writer that re-opens it
+        // expecting read-write access.  O_RDWR is the safe choice here.
+        use std::os::fd::AsFd as _;
+        let root_fd = self.root_dir().as_fd();
+        let fd = rustix::fs::openat(
+            root_fd,
+            "overlay-layers/layers.lock",
+            OFlags::RDWR | OFlags::CLOEXEC | OFlags::CREATE,
+            Mode::from_bits_truncate(0o644),
+        )
+        .map_err(|e| StorageError::Io(std::io::Error::from(e)))?;
+
+        // Acquire a shared (read) lock; block if an exclusive (write) lock is
+        // held by another process.
+        flock(&fd, FlockOperation::LockShared)
+            .map_err(|e| StorageError::Io(std::io::Error::from(e)))?;
+
+        Ok(LayerStoreLock(fd))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use tempfile::TempDir;
+
+    use super::*;
+    use crate::storage::Storage;
+
+    fn create_test_storage(tmp: &TempDir) -> Storage {
+        for d in ["overlay", "overlay-layers", "overlay-images"] {
+            std::fs::create_dir_all(tmp.path().join(d)).unwrap();
+        }
+        Storage::open(tmp.path()).unwrap()
+    }
+
+    /// Acquiring a shared lock should succeed.
+    #[test]
+    fn test_shared_lock_succeeds() {
+        let tmp = TempDir::new().unwrap();
+        let storage = create_test_storage(&tmp);
+
+        let lock = storage.lock_layers_shared().expect("shared lock");
+        // Lock file must exist now.
+        assert!(tmp.path().join("overlay-layers/layers.lock").exists());
+        drop(lock); // releases
+    }
+
+    /// Two shared locks from the same process succeed simultaneously (shared
+    /// locking is not exclusive against other shared lockers).
+    #[test]
+    fn test_two_shared_locks() {
+        let tmp = TempDir::new().unwrap();
+        let storage = create_test_storage(&tmp);
+
+        let lock1 = storage.lock_layers_shared().expect("first shared lock");
+        let lock2 = storage.lock_layers_shared().expect("second shared lock");
+        drop(lock1);
+        drop(lock2);
+    }
+
+    /// A non-blocking exclusive lock attempt fails while a shared lock is held.
+    ///
+    /// This exercises the mutual-exclusion between our shared reader lock and
+    /// a concurrent exclusive writer (e.g. podman rmi).  We can't test the
+    /// blocking case in a unit test without spawning threads, but we can
+    /// assert that the non-blocking attempt fails with EWOULDBLOCK.
+    #[test]
+    fn test_exclusive_blocked_by_shared() {
+        let tmp = TempDir::new().unwrap();
+        let storage = create_test_storage(&tmp);
+
+        let _shared = storage.lock_layers_shared().expect("shared lock");
+
+        // Open the lock file and try a non-blocking exclusive lock — must fail.
+        use std::os::fd::AsFd as _;
+        let root_fd = storage.root_dir().as_fd();
+        let fd = rustix::fs::openat(
+            root_fd,
+            "overlay-layers/layers.lock",
+            OFlags::RDWR | OFlags::CLOEXEC | OFlags::CREATE,
+            Mode::from_bits_truncate(0o644),
+        )
+        .expect("open lock file");
+
+        let result = flock(&fd, FlockOperation::NonBlockingLockExclusive);
+        assert!(
+            result.is_err(),
+            "exclusive non-blocking lock must fail while a shared lock is held"
+        );
+    }
+}
diff --git a/crates/composefs-storage/src/tar_split.rs b/crates/composefs-storage/src/tar_split.rs
index 5f6a418c..704f2209 100644
--- a/crates/composefs-storage/src/tar_split.rs
+++ b/crates/composefs-storage/src/tar_split.rs
@@ -19,7 +19,7 @@
 //! - CRC64-ISO algorithm for checksums
 
 use std::io::{BufRead, BufReader, Read, Seek};
-use std::os::fd::OwnedFd;
+use std::os::fd::{AsFd as _, OwnedFd};
 
 use base64::prelude::*;
 use cap_std::fs::{Dir, File};
@@ -34,6 +34,48 @@ use crate::storage::Storage;
 /// CRC64-ISO implementation for verifying file checksums.
 const CRC64_ISO: Crc<u64> = Crc::<u64>::new(&CRC_64_GO_ISO);
 
+/// Overlay metacopy xattr names — checked in priority order.
+///
+/// Root mode uses `trusted.overlay.metacopy`; rootless mode (with the
+/// `userxattr` overlay mount option) uses `user.overlay.metacopy`.
+const OVERLAY_METACOPY_XATTRS: &[&str] = &["trusted.overlay.metacopy", "user.overlay.metacopy"];
+
+/// Returns `true` if `file` is an overlay metacopy stub.
+///
+/// A metacopy file is a zero-byte upper-layer stub whose content lives in a
+/// lower layer.  The kernel signals this by attaching a
+/// `trusted.overlay.metacopy` (or `user.overlay.metacopy` in rootless mode)
+/// extended attribute to the file.  Reading data from the stub via a plain
+/// file descriptor returns EOF immediately, so the producer must detect and
+/// skip it, falling back to the lower layer that holds the real data.
+///
+/// Returns `Err` only for unexpected I/O errors; `ENODATA` / `ENOATTR`
+/// (xattr not present) is treated as `Ok(false)`.
+fn is_overlay_metacopy(file: &File) -> Result<bool> {
+    let fd = file.as_fd();
+    for name in OVERLAY_METACOPY_XATTRS {
+        // Pass a 1-byte buffer: we only care whether the xattr exists, not
+        // its value.  The kernel returns Ok(n) when the value fits and ERANGE
+        // when the buffer is too small — both mean the xattr is present.
+        match rustix::fs::fgetxattr(fd, *name, &mut [0u8; 1]) {
+            Ok(_) => return Ok(true),
+            // ERANGE: buffer too small, but the xattr is present.
+            Err(rustix::io::Errno::RANGE) => return Ok(true),
+            // ENODATA / ENOATTR: xattr not present on this file.
+            Err(rustix::io::Errno::NODATA) => continue,
+            // ENOATTR is the BSD spelling; some libc glue maps it the same way.
+            #[cfg(not(target_os = "linux"))]
+            Err(rustix::io::Errno::NOATTR) => continue,
+            Err(e) => {
+                return Err(StorageError::Io(std::io::Error::from_raw_os_error(
+                    e.raw_os_error(),
+                )));
+            }
+        }
+    }
+    Ok(false)
+}
+
 /// Item returned from tar-split stream iteration.
 #[derive(Debug)]
 pub enum TarSplitItem {
@@ -58,6 +100,13 @@ pub enum TarSplitItem {
         /// This is the path as recorded in the original tar archive
         /// (e.g., "./etc/hosts").
         name: String,
+        /// Short link ID of the layer in whose diff directory this file was found.
+        ///
+        /// This is the value of `overlay/l/<link_id>` — the short symlink name
+        /// created by containers/storage that points at `../<layer-id>/diff`.
+        /// The consumer uses it to build filenames of the form
+        /// `l/<link_id>/<relpath>` relative to the `overlay/` directory.
+        link_id: String,
     },
 }
 
@@ -292,6 +341,19 @@ pub struct TarSplitFdStream {
 
     /// Entry counter for debugging and error messages.
     entry_count: usize,
+
+    /// Short link IDs for each layer in the chain.
+    ///
+    /// Index 0 is the target layer's own link ID; subsequent entries are the
+    /// ancestor layers in the same depth-first order produced by
+    /// [`Layer::layer_chain`].  Each link ID corresponds to a symlink under
+    /// `overlay/l/<link_id>` that points to the layer's diff directory.
+    chain_link_ids: Vec<String>,
+
+    /// Layer IDs parallel to `chain_link_ids`.
+    ///
+    /// `chain_ids[i]` is the layer ID whose link ID is `chain_link_ids[i]`.
+    chain_ids: Vec<String>,
 }
 
 impl TarSplitFdStream {
@@ -319,28 +381,65 @@ impl TarSplitFdStream {
         let gz_decoder = GzDecoder::new(file);
         let reader = BufReader::new(gz_decoder);
 
-        // Open the layer for on-demand file lookups
+        // Build the ordered chain up front.
+        // layer_chain() consumes a Layer, so open a fresh one for that purpose;
+        // then open another fresh copy for the on-demand lookups below.
+        let chain_layer = Layer::open(storage, layer.id())?;
+        let chain = chain_layer.layer_chain(storage)?;
+
+        let mut chain_link_ids = Vec::with_capacity(chain.len());
+        let mut chain_ids = Vec::with_capacity(chain.len());
+        for l in &chain {
+            chain_link_ids.push(l.link_id().to_string());
+            chain_ids.push(l.id().to_string());
+        }
+
+        // Open the layer for on-demand file lookups (kept for the existing
+        // parent-search helpers that still operate on Layer objects).
         let layer = Layer::open(storage, layer.id())?;
 
         // Clone storage root dir for on-demand parent layer access
         let storage_root = storage.root_dir().try_clone()?;
 
+        // `chain` is no longer needed.
+        drop(chain);
+
         Ok(Self {
             layer,
             storage_root,
             reader,
             entry_count: 0,
+            chain_link_ids,
+            chain_ids,
         })
     }
 
     /// Open a file in the layer chain, trying current layer first then parents.
-    fn open_file_in_chain(&self, path: &str) -> Result<cap_std::fs::File> {
+    ///
+    /// Returns the opened file together with the link ID of the layer in whose
+    /// diff directory the file was found (i.e. `overlay/l/<link_id>` points
+    /// to that layer's diff dir).
+    ///
+    /// Overlay metacopy stubs (files whose content lives in a lower layer,
+    /// signalled by a `trusted.overlay.metacopy` or `user.overlay.metacopy`
+    /// xattr) are transparently skipped: finding one in a layer is treated as
+    /// "not found here" and the search continues down the parent chain until
+    /// the layer that contains the real file data is found.
+    fn open_file_in_chain(&self, path: &str) -> Result<(cap_std::fs::File, String)> {
         // Normalize path (remove leading ./)
         let normalized_path = path.strip_prefix("./").unwrap_or(path);
 
-        // Try to open in current layer first
+        // Try to open in current layer first (chain index 0 = own diff dir).
+        // Skip overlay metacopy stubs — they are zero-byte upper-layer files
+        // whose actual content lives in a lower layer.
         match self.layer.diff_dir().open(normalized_path) {
-            Ok(file) => return Ok(file),
+            Ok(file) if !is_overlay_metacopy(&file)? => {
+                let link_id = self.chain_link_ids[0].clone();
+                return Ok((file, link_id));
+            }
+            Ok(_) => {
+                // File exists but is a metacopy stub; fall through to parents.
+            }
             Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                 // Continue to search parent layers
             }
@@ -352,12 +451,15 @@ impl TarSplitFdStream {
     }
 
     /// Recursively search parent layers for a file.
+    ///
+    /// Returns the opened file together with the link ID of the layer in whose
+    /// diff directory the file was found.
     fn search_parent_layers(
         &self,
         current_layer: &Layer,
         path: &str,
         depth: usize,
-    ) -> Result<cap_std::fs::File> {
+    ) -> Result<(cap_std::fs::File, String)> {
         const MAX_DEPTH: usize = 500;
 
         if depth >= MAX_DEPTH {
@@ -377,11 +479,14 @@ impl TarSplitFdStream {
 
             // Try to open file directly in parent's diff directory
             match self.open_file_in_layer(&parent_id, path) {
-                Ok(file) => return Ok(file),
+                Ok(file) => {
+                    let found_link_id = self.link_id_for(&parent_id)?;
+                    return Ok((file, found_link_id));
+                }
                 Err(StorageError::Io(e)) if e.kind() == std::io::ErrorKind::NotFound => {
                     // File not in this parent, recursively search its parents
                     match self.search_by_layer_id(&parent_id, path, depth + 1) {
-                        Ok(file) => return Ok(file),
+                        Ok(result) => return Ok(result),
                         Err(StorageError::TarSplitError(_)) => continue, // File not found in this branch, try next parent
                         Err(e) => return Err(e),
                     }
@@ -397,12 +502,15 @@ impl TarSplitFdStream {
     }
 
     /// Search for a file starting from a layer ID.
+    ///
+    /// Returns the opened file together with the link ID of the layer in whose
+    /// diff directory the file was found.
     fn search_by_layer_id(
         &self,
         layer_id: &str,
         path: &str,
         depth: usize,
-    ) -> Result<cap_std::fs::File> {
+    ) -> Result<(cap_std::fs::File, String)> {
         const MAX_DEPTH: usize = 500;
 
         if depth >= MAX_DEPTH {
@@ -414,7 +522,10 @@ impl TarSplitFdStream {
 
         // Try to open file in this layer
         match self.open_file_in_layer(layer_id, path) {
-            Ok(file) => return Ok(file),
+            Ok(file) => {
+                let found_link_id = self.link_id_for(layer_id)?;
+                return Ok((file, found_link_id));
+            }
             Err(StorageError::Io(e)) if e.kind() == std::io::ErrorKind::NotFound => {
                 // File not found, check parents
             }
@@ -428,7 +539,7 @@ impl TarSplitFdStream {
         for link_id in parent_links {
             let parent_id = self.resolve_link_direct(&link_id)?;
             match self.search_by_layer_id(&parent_id, path, depth + 1) {
-                Ok(file) => return Ok(file),
+                Ok(result) => return Ok(result),
                 Err(StorageError::TarSplitError(_)) => continue, // File not found in this branch, try next parent
                 Err(e) => return Err(e),
             }
@@ -466,11 +577,22 @@ impl TarSplitFdStream {
     }
 
     /// Open a file in a specific layer's diff directory.
+    ///
+    /// Returns `Err(StorageError::Io(NotFound))` both when the file is absent
+    /// and when it is present but is an overlay metacopy stub, so that callers
+    /// uniformly fall back to the next lower layer in either case.
     fn open_file_in_layer(&self, layer_id: &str, path: &str) -> Result<cap_std::fs::File> {
         let overlay_dir = self.storage_root.open_dir("overlay")?;
         let layer_dir = overlay_dir.open_dir(layer_id)?;
         let diff_dir = layer_dir.open_dir("diff")?;
-        diff_dir.open(path).map_err(StorageError::Io)
+        let file = diff_dir.open(path).map_err(StorageError::Io)?;
+        if is_overlay_metacopy(&file)? {
+            return Err(StorageError::Io(std::io::Error::new(
+                std::io::ErrorKind::NotFound,
+                "overlay metacopy stub — real data is in a lower layer",
+            )));
+        }
+        Ok(file)
     }
 
     /// Read parent link IDs from a layer's lower file.
@@ -490,6 +612,37 @@ impl TarSplitFdStream {
         }
     }
 
+    /// Look up the link ID for a layer by its layer ID.
+    ///
+    /// Returns the link ID (i.e. the short name under `overlay/l/`) for the
+    /// layer identified by `layer_id`.  This should always succeed because the
+    /// on-demand search and the precomputed chain are built from the same
+    /// storage; a miss means the store mutated under us or the image is
+    /// malformed, so we fail closed.
+    fn link_id_for(&self, layer_id: &str) -> Result<String> {
+        self.chain_ids
+            .iter()
+            .position(|id| id == layer_id)
+            .map(|idx| self.chain_link_ids[idx].clone())
+            .ok_or_else(|| {
+                StorageError::TarSplitError(format!(
+                    "resolved layer {} is not in the precomputed chain",
+                    layer_id
+                ))
+            })
+    }
+
+    /// Return the ordered link-ID chain for this stream.
+    ///
+    /// Index 0 is the target layer's own link ID; subsequent entries are
+    /// ancestor layers in the same depth-first order produced by
+    /// [`Layer::layer_chain`].  The `link_id` field on every
+    /// [`TarSplitItem::FileContent`] yielded by [`Self::next`] is one of the
+    /// values in this slice.
+    pub fn chain_link_ids(&self) -> &[String] {
+        &self.chain_link_ids
+    }
+
     /// Verify CRC64-ISO checksum of a file.
     fn verify_crc64(
         &self,
@@ -604,7 +757,7 @@ impl TarSplitFdStream {
                                     )
                                 })?;
 
-                                let mut file = self.open_file_in_chain(path)?;
+                                let (mut file, link_id) = self.open_file_in_chain(path)?;
 
                                 // Verify CRC64 if provided
                                 if let Some(ref crc64_b64) = crc64 {
@@ -621,6 +774,7 @@ impl TarSplitFdStream {
                                     fd: owned_fd,
                                     size: file_size,
                                     name: path.clone(),
+                                    link_id,
                                 }));
                             }
                             // Empty file or directory - header already in preceding Segment
@@ -645,7 +799,211 @@ impl TarSplitFdStream {
 
 #[cfg(test)]
 mod tests {
+    use std::io::Write as _;
+
+    use flate2::Compression;
+    use flate2::write::GzEncoder;
+
     use super::*;
+    use crate::storage::Storage;
+
+    // ---------------------------------------------------------------------------
+    // Helper: build a minimal two-layer mock storage on disk.
+    //
+    // Layout created:
+    //   <root>/
+    //     overlay/
+    //       l/
+    //         CHILDLINKID000000000000000  ->  ../../overlay/child-layer-001/diff
+    //         PARENTLINKID0000000000000  ->  ../../overlay/parent-layer-001/diff
+    //       child-layer-001/
+    //         diff/
+    //           etc/
+    //             child.txt          ("child content")
+    //         link                   ("CHILDLINKID000000000000000")
+    //         lower                  ("l/PARENTLINKID0000000000000")
+    //       parent-layer-001/
+    //         diff/
+    //           etc/
+    //             parent.txt         ("parent content")
+    //         link                   ("PARENTLINKID0000000000000")
+    //     overlay-layers/
+    //       child-layer-001.tar-split.gz   (two File entries, no CRC)
+    //     overlay-images/   (empty, required by Storage::open)
+    //
+    // The tar-split for child-layer-001 references:
+    //   - "./etc/child.txt"  (size 14, found in child diff → dirfd_index 0)
+    //   - "./etc/parent.txt" (size 15, found in parent diff → dirfd_index 1)
+    // ---------------------------------------------------------------------------
+    fn create_two_layer_mock(root: &std::path::Path) -> Storage {
+        // Required top-level dirs
+        for d in ["overlay", "overlay-layers", "overlay-images"] {
+            std::fs::create_dir_all(root.join(d)).unwrap();
+        }
+
+        // --- child layer ---
+        let child_id = "child-layer-001";
+        let child_link = "CHILDLINKID000000000000000";
+        let child_diff = root.join("overlay").join(child_id).join("diff");
+        std::fs::create_dir_all(child_diff.join("etc")).unwrap();
+        std::fs::write(child_diff.join("etc/child.txt"), b"child content!").unwrap(); // 14 bytes
+        std::fs::write(root.join("overlay").join(child_id).join("link"), child_link).unwrap();
+
+        // --- parent layer ---
+        let parent_id = "parent-layer-001";
+        let parent_link = "PARENTLINKID000000000000000";
+        let parent_diff = root.join("overlay").join(parent_id).join("diff");
+        std::fs::create_dir_all(parent_diff.join("etc")).unwrap();
+        std::fs::write(parent_diff.join("etc/parent.txt"), b"parent content!").unwrap(); // 15 bytes
+        std::fs::write(
+            root.join("overlay").join(parent_id).join("link"),
+            parent_link,
+        )
+        .unwrap();
+
+        // child's lower file: references parent via its link
+        std::fs::write(
+            root.join("overlay").join(child_id).join("lower"),
+            format!("l/{}", parent_link),
+        )
+        .unwrap();
+
+        // overlay/l/ symlinks: target is relative from overlay/l/ to overlay/<id>/diff
+        // i.e. "../../overlay/<id>/diff" won't work since overlay/l is already inside overlay/
+        // The actual format used by containers/storage is: "../<layer-id>/diff"
+        let l_dir = root.join("overlay").join("l");
+        std::fs::create_dir_all(&l_dir).unwrap();
+        std::os::unix::fs::symlink(format!("../{}/diff", child_id), l_dir.join(child_link))
+            .unwrap();
+        std::os::unix::fs::symlink(format!("../{}/diff", parent_id), l_dir.join(parent_link))
+            .unwrap();
+
+        // --- tar-split for child layer ---
+        // Two File entries (no CRC, so no checksum is verified).
+        // We also need at least one Segment entry so the reader has something
+        // to return for the Segment path; we emit a minimal one.
+        let ndjson = concat!(
+            r#"{"type":1,"name":"./etc/child.txt","size":14}"#,
+            "\n",
+            r#"{"type":1,"name":"./etc/parent.txt","size":15}"#,
+            "\n",
+        );
+
+        let gz_path = root
+            .join("overlay-layers")
+            .join(format!("{}.tar-split.gz", child_id));
+        let gz_file = std::fs::File::create(&gz_path).unwrap();
+        let mut encoder = GzEncoder::new(gz_file, Compression::default());
+        encoder.write_all(ndjson.as_bytes()).unwrap();
+        encoder.finish().unwrap();
+
+        Storage::open(root).unwrap()
+    }
+
+    #[test]
+    fn test_link_id_two_layer_chain() {
+        let tmp = tempfile::tempdir().unwrap();
+        let storage = create_two_layer_mock(tmp.path());
+        let layer = Layer::open(&storage, "child-layer-001").unwrap();
+        let mut stream = TarSplitFdStream::new(&storage, &layer).unwrap();
+
+        // chain_link_ids should have exactly 2 entries (child + parent)
+        assert_eq!(
+            stream.chain_link_ids().len(),
+            2,
+            "expected chain length 2 (child + parent)"
+        );
+
+        let child_link = "CHILDLINKID000000000000000";
+        let parent_link = "PARENTLINKID000000000000000";
+
+        // First item: ./etc/child.txt — present only in child diff
+        match stream.next().unwrap().expect("expected FileContent item") {
+            TarSplitItem::FileContent { name, link_id, .. } => {
+                assert_eq!(name, "./etc/child.txt");
+                assert_eq!(
+                    link_id, child_link,
+                    "./etc/child.txt should be found in the child layer"
+                );
+            }
+            TarSplitItem::Segment(_) => panic!("expected FileContent, got Segment"),
+        }
+
+        // Second item: ./etc/parent.txt — present only in parent diff
+        match stream.next().unwrap().expect("expected FileContent item") {
+            TarSplitItem::FileContent { name, link_id, .. } => {
+                assert_eq!(name, "./etc/parent.txt");
+                assert_eq!(
+                    link_id, parent_link,
+                    "./etc/parent.txt should be found in the parent layer"
+                );
+            }
+            TarSplitItem::Segment(_) => panic!("expected FileContent, got Segment"),
+        }
+
+        // Stream should be exhausted
+        assert!(stream.next().unwrap().is_none());
+    }
+
+    #[test]
+    fn test_link_id_single_layer() {
+        let tmp = tempfile::tempdir().unwrap();
+        let root = tmp.path();
+
+        // Minimal single-layer storage (no lower file → no parents)
+        for d in ["overlay", "overlay-layers", "overlay-images"] {
+            std::fs::create_dir_all(root.join(d)).unwrap();
+        }
+
+        let layer_id = "base-layer-001";
+        let link_id = "BASELINKID0000000000000000";
+        let diff = root.join("overlay").join(layer_id).join("diff");
+        std::fs::create_dir_all(&diff).unwrap();
+        std::fs::write(diff.join("hello.txt"), b"hello").unwrap(); // 5 bytes
+        std::fs::write(root.join("overlay").join(layer_id).join("link"), link_id).unwrap();
+
+        // overlay/l/ symlink (not strictly needed for single layer, but good hygiene)
+        let l_dir = root.join("overlay").join("l");
+        std::fs::create_dir_all(&l_dir).unwrap();
+        std::os::unix::fs::symlink(format!("../{}/diff", layer_id), l_dir.join(link_id)).unwrap();
+
+        // tar-split with one file entry
+        let ndjson = concat!(r#"{"type":1,"name":"./hello.txt","size":5}"#, "\n");
+        let gz_path = root
+            .join("overlay-layers")
+            .join(format!("{}.tar-split.gz", layer_id));
+        let gz_file = std::fs::File::create(&gz_path).unwrap();
+        let mut encoder = GzEncoder::new(gz_file, Compression::default());
+        encoder.write_all(ndjson.as_bytes()).unwrap();
+        encoder.finish().unwrap();
+
+        let storage = Storage::open(root).unwrap();
+        let layer = Layer::open(&storage, layer_id).unwrap();
+        let mut stream = TarSplitFdStream::new(&storage, &layer).unwrap();
+
+        assert_eq!(
+            stream.chain_link_ids().len(),
+            1,
+            "single layer → chain len 1"
+        );
+
+        match stream.next().unwrap().expect("expected item") {
+            TarSplitItem::FileContent {
+                name,
+                link_id: found_link_id,
+                ..
+            } => {
+                assert_eq!(name, "./hello.txt");
+                assert_eq!(
+                    found_link_id, link_id,
+                    "base layer file must have its own link_id"
+                );
+            }
+            TarSplitItem::Segment(_) => panic!("expected FileContent"),
+        }
+
+        assert!(stream.next().unwrap().is_none());
+    }
 
     #[test]
     fn test_tar_header_type_checks() {
@@ -708,4 +1066,139 @@ mod tests {
         let result = TarSplitEntry::from_raw(raw);
         assert!(result.is_err());
     }
+
+    /// Build a two-layer mock where the child layer holds a metacopy stub for
+    /// `shared.txt` and the real content lives only in the parent layer.
+    ///
+    /// Uses `/var/tmp` as the temp-dir root because `user.*` extended
+    /// attributes are supported there (tmpfs, no `nosuid`/`nouser_xattr`
+    /// mount restrictions).  If `/var/tmp` is unavailable or the filesystem
+    /// does not support `user.*` xattrs, the test is skipped.
+    ///
+    /// Layout:
+    /// ```text
+    /// <root>/overlay/
+    ///   child-layer-001/diff/shared.txt   — 0-byte metacopy stub
+    ///                                       (user.overlay.metacopy xattr set)
+    ///   parent-layer-001/diff/shared.txt  — "real content" (12 bytes)
+    /// ```
+    #[test]
+    fn test_metacopy_stub_falls_back_to_parent() {
+        // Use /var/tmp so user.* xattrs are available on tmpfs.
+        let tmp = match tempfile::Builder::new().tempdir_in("/var/tmp") {
+            Ok(d) => d,
+            Err(_) => {
+                // /var/tmp unavailable — skip rather than fail.
+                return;
+            }
+        };
+        let root =
+            cap_std::fs::Dir::open_ambient_dir(tmp.path(), cap_std::ambient_authority()).unwrap();
+
+        // Required top-level dirs.
+        for d in ["overlay", "overlay-layers", "overlay-images"] {
+            root.create_dir_all(d).unwrap();
+        }
+
+        let child_id = "child-layer-001";
+        let child_link = "CHILDLINKID000000000000000";
+        let parent_id = "parent-layer-001";
+        let parent_link = "PARENTLINKID000000000000000";
+
+        // Parent layer: real file content.
+        root.create_dir_all(format!("overlay/{parent_id}/diff"))
+            .unwrap();
+        root.write(
+            format!("overlay/{parent_id}/diff/shared.txt"),
+            b"real content", // 12 bytes
+        )
+        .unwrap();
+        root.write(format!("overlay/{parent_id}/link"), parent_link)
+            .unwrap();
+
+        // Child layer: zero-byte metacopy stub with user.overlay.metacopy xattr.
+        root.create_dir_all(format!("overlay/{child_id}/diff"))
+            .unwrap();
+        root.write(format!("overlay/{child_id}/diff/shared.txt"), b"")
+            .unwrap(); // 0 bytes — the stub
+
+        // Set the metacopy xattr.  If the filesystem refuses user.* xattrs,
+        // bail out with a skip rather than a spurious failure.
+        let stub_file = root
+            .open(format!("overlay/{child_id}/diff/shared.txt"))
+            .unwrap();
+        match rustix::fs::fsetxattr(
+            stub_file.as_fd(),
+            "user.overlay.metacopy",
+            b"",
+            rustix::fs::XattrFlags::CREATE,
+        ) {
+            Ok(()) => {}
+            Err(rustix::io::Errno::OPNOTSUPP) => {
+                // Filesystem does not support xattrs — skip.
+                return;
+            }
+            Err(e) => panic!("unexpected fsetxattr error: {e}"),
+        }
+
+        root.write(format!("overlay/{child_id}/link"), child_link)
+            .unwrap();
+        root.write(
+            format!("overlay/{child_id}/lower"),
+            format!("l/{parent_link}"),
+        )
+        .unwrap();
+
+        // overlay/l/ symlinks.
+        root.create_dir_all("overlay/l").unwrap();
+        root.symlink(
+            format!("../{child_id}/diff"),
+            format!("overlay/l/{child_link}"),
+        )
+        .unwrap();
+        root.symlink(
+            format!("../{parent_id}/diff"),
+            format!("overlay/l/{parent_link}"),
+        )
+        .unwrap();
+
+        // tar-split: one file entry referencing shared.txt (size 12).
+        let ndjson = concat!(r#"{"type":1,"name":"./shared.txt","size":12}"#, "\n");
+        let gz_file = root
+            .open_with(
+                format!("overlay-layers/{child_id}.tar-split.gz"),
+                cap_std::fs::OpenOptions::new()
+                    .write(true)
+                    .create(true)
+                    .truncate(true),
+            )
+            .unwrap();
+        let mut encoder = GzEncoder::new(gz_file.into_std(), Compression::default());
+        encoder.write_all(ndjson.as_bytes()).unwrap();
+        encoder.finish().unwrap();
+
+        let storage = Storage::open(tmp.path()).unwrap();
+        let layer = Layer::open(&storage, child_id).unwrap();
+        let mut stream = TarSplitFdStream::new(&storage, &layer).unwrap();
+
+        assert_eq!(
+            stream.chain_link_ids().len(),
+            2,
+            "expected chain length 2 (child + parent)"
+        );
+
+        match stream.next().unwrap().expect("expected FileContent item") {
+            TarSplitItem::FileContent { name, link_id, .. } => {
+                assert_eq!(name, "./shared.txt");
+                assert_eq!(
+                    link_id, parent_link,
+                    "./shared.txt metacopy stub in child must be skipped; \
+                     real data is at parent (link_id={parent_link})"
+                );
+            }
+            TarSplitItem::Segment(_) => panic!("expected FileContent, got Segment"),
+        }
+
+        assert!(stream.next().unwrap().is_none());
+    }
 }
diff --git a/crates/composefs-storage/src/userns_helper.rs b/crates/composefs-storage/src/userns_helper.rs
index 0b50a606..55174258 100644
--- a/crates/composefs-storage/src/userns_helper.rs
+++ b/crates/composefs-storage/src/userns_helper.rs
@@ -3,8 +3,8 @@
 //! This module provides a mechanism for unprivileged processes to access
 //! containers-storage content that has restrictive permissions. It works by
 //! spawning a helper process inside a user namespace (via `podman unshare`)
-//! that can read any file, and communicating with it via JSON-RPC over a
-//! Unix socket with fd-passing.
+//! that can read any file, and communicating with it via the zlink
+//! `org.composefs.Oci` service (`CstorLayerService`) over a Unix socket.
 //!
 //! # Why This Is Needed
 //!
@@ -30,9 +30,8 @@
 //! │       │      /proc/self/exe         │
 //! │       │      (child's stdin=socket) │
 //! │       │                             │
-//! │  proxy.stream_layer() ───────────►  │
-//! │       │                             │
-//! │  ◄─── receives OwnedFd via SCM_RIGHTS│
+//! │  proxy.connection()  ──────────────►│
+//! │       │   (OciProxy/CstorLayerSvc)  │
 //! └─────────────────────────────────────┘
 //! ```
 //!
@@ -49,152 +48,19 @@
 //! // Normal application code continues here...
 //! ```
 
-use std::os::fd::AsFd;
+use std::os::fd::AsFd as _;
 use std::os::unix::io::OwnedFd;
 use std::os::unix::net::UnixStream as StdUnixStream;
-use std::path::Path;
 use std::process::{Child, Command, Stdio};
 
-use base64::prelude::*;
-use jsonrpc_fdpass::transport::UnixSocketTransport;
-use jsonrpc_fdpass::{JsonRpcMessage, JsonRpcRequest, JsonRpcResponse, MessageWithFds};
 use rustix::io::dup;
 use rustix::process::{Signal, set_parent_process_death_signal};
-use serde::{Deserialize, Serialize};
-use tokio::net::UnixStream as TokioUnixStream;
 
-use crate::layer::Layer;
-use crate::storage::Storage;
-use crate::tar_split::{TarSplitFdStream, TarSplitItem};
 use crate::userns::can_bypass_file_permissions;
 
 /// Environment variable that indicates this process is a userns helper.
 const HELPER_ENV: &str = "__CSTORAGE_USERNS_HELPER";
 
-/// JSON-RPC 2.0 error codes.
-///
-/// These codes follow the JSON-RPC 2.0 specification:
-/// - Standard errors: -32700 to -32600
-/// - Server errors: -32099 to -32000 (implementation-defined)
-mod error_codes {
-    /// Invalid params - the params passed to a method are invalid.
-    pub const INVALID_PARAMS: i32 = -32602;
-
-    /// Method not found - the requested method does not exist.
-    pub const METHOD_NOT_FOUND: i32 = -32601;
-
-    /// Resource not found - the requested resource (image, layer, etc.) was not found.
-    pub const RESOURCE_NOT_FOUND: i32 = -32000;
-
-    /// Internal error - a server-side error occurred (I/O, storage access, etc.).
-    pub const INTERNAL_ERROR: i32 = -32003;
-}
-
-/// JSON-RPC method names.
-mod methods {
-    /// Open a file and return its fd.
-    pub const OPEN_FILE: &str = "userns.openFile";
-    /// Shutdown the helper process.
-    pub const SHUTDOWN: &str = "userns.shutdown";
-    /// List images in storage.
-    pub const LIST_IMAGES: &str = "userns.listImages";
-    /// Get image metadata.
-    pub const GET_IMAGE: &str = "userns.getImage";
-    /// Stream layer as tar-split entries with fds.
-    pub const STREAM_LAYER: &str = "userns.streamLayer";
-}
-
-/// Parameters for the open_file method.
-#[derive(Debug, Serialize, Deserialize)]
-pub struct OpenFileParams {
-    /// Path to open.
-    pub path: String,
-}
-
-/// Result for the open_file method.
-#[derive(Debug, Serialize, Deserialize)]
-pub struct OpenFileResult {
-    /// True if successful (fd is passed out-of-band).
-    pub success: bool,
-}
-
-/// Parameters for list_images method.
-#[derive(Debug, Serialize, Deserialize)]
-pub struct ListImagesParams {
-    /// Storage root path.
-    pub storage_path: String,
-}
-
-/// Image info returned by list_images.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ImageInfo {
-    /// Image ID.
-    pub id: String,
-    /// Image names/tags.
-    pub names: Vec<String>,
-}
-
-/// Result for list_images method.
-#[derive(Debug, Serialize, Deserialize)]
-pub struct ListImagesResult {
-    /// List of images.
-    pub images: Vec<ImageInfo>,
-}
-
-/// Parameters for get_image method.
-#[derive(Debug, Serialize, Deserialize)]
-pub struct GetImageParams {
-    /// Storage root path.
-    pub storage_path: String,
-    /// Image ID or name.
-    pub image_ref: String,
-}
-
-/// Result for get_image method.
-#[derive(Debug, Serialize, Deserialize)]
-pub struct GetImageResult {
-    /// Image ID.
-    pub id: String,
-    /// Image names.
-    pub names: Vec<String>,
-    /// Layer diff IDs (sha256:...).
-    pub layer_diff_ids: Vec<oci_spec::image::Digest>,
-    /// Storage layer IDs (internal IDs used by containers-storage).
-    pub storage_layer_ids: Vec<String>,
-}
-
-/// Parameters for stream_layer method.
-#[derive(Debug, Serialize, Deserialize)]
-pub struct StreamLayerParams {
-    /// Storage root path.
-    pub storage_path: String,
-    /// Layer ID (storage layer ID, not diff ID).
-    pub layer_id: String,
-}
-
-/// Streaming notification for a segment.
-#[derive(Debug, Serialize, Deserialize)]
-pub struct StreamSegmentNotification {
-    /// Base64-encoded segment data.
-    pub data: String,
-}
-
-/// Streaming notification for a file (fd is passed out-of-band).
-#[derive(Debug, Serialize, Deserialize)]
-pub struct StreamFileNotification {
-    /// File path in the tar.
-    pub name: String,
-    /// File size.
-    pub size: u64,
-}
-
-/// Result for stream_layer method (sent after all notifications).
-#[derive(Debug, Serialize, Deserialize)]
-pub struct StreamLayerResult {
-    /// Number of items streamed.
-    pub items_sent: usize,
-}
-
 /// Error type for userns helper operations.
 #[derive(Debug, thiserror::Error)]
 pub enum HelperError {
@@ -202,890 +68,219 @@ pub enum HelperError {
     #[error("failed to create socket: {0}")]
     Socket(#[source] std::io::Error),
 
+    /// `podman` binary was not found on `PATH`.
+    ///
+    /// The userns helper requires `podman unshare` to set up a user
+    /// namespace.  Install `podman` and ensure it is on `PATH`.
+    #[error("podman not found on PATH (required for user-namespace helper): {0}")]
+    PodmanNotFound(#[source] std::io::Error),
+
     /// Failed to spawn helper process.
     #[error("failed to spawn helper process: {0}")]
     Spawn(#[source] std::io::Error),
 
-    /// IPC error.
+    /// IPC / transport error.
     #[error("IPC error: {0}")]
     Ipc(String),
 
-    /// Helper returned an error.
-    #[error("helper error: {0}")]
-    HelperError(String),
-
     /// I/O error.
     #[error("I/O error: {0}")]
     Io(#[from] std::io::Error),
-
-    /// JSON-RPC error from the helper.
-    #[error("RPC error: code={code}, message={message}")]
-    RpcError {
-        /// JSON-RPC error code.
-        code: i32,
-        /// Error message.
-        message: String,
-    },
 }
 
-/// Check if this process was spawned as a userns helper and run the helper loop if so.
+/// Check if this process was spawned as a userns helper and serve the
+/// `org.composefs.Oci` (`CstorLayerService`) zlink service if so.
 ///
-/// This function **must** be called early in `main()`, before any other cstorage
-/// operations. If this process was spawned as a helper, this function will:
+/// This function **must** be called early in `main()`, before any other
+/// `composefs_storage` operations.  If the `__CSTORAGE_USERNS_HELPER`
+/// environment variable is set this function will:
 ///
-/// 1. Read from stdin (which is a Unix socket from the parent)
-/// 2. Serve JSON-RPC requests for file operations  
-/// 3. Exit when the parent closes the connection
+/// 1. Set a parent-death signal so the helper exits when the parent dies.
+/// 2. Duplicate `stdin` into an owned [`StdUnixStream`].
+/// 3. Call [`crate::cstor_service::serve_on_socket_blocking`] to serve the
+///    `org.composefs.Oci` zlink service until the parent closes the connection.
+/// 4. Exit the process (0 on success, 1 on error).
 ///
-/// If this is not a helper process, this function returns immediately.
+/// If the environment variable is **not** set, this function returns
+/// immediately and the caller continues normal execution.
 //
 // Runs in a forked helper subprocess whose stdin is the IPC socket; there is
 // no logging facility or error channel back to the parent, so fatal
 // diagnostics are written to stderr before exiting.
 #[allow(clippy::print_stderr)]
 pub fn init_if_helper() {
-    // Check if we're a helper via environment variable
     if std::env::var(HELPER_ENV).is_err() {
-        return; // Not a helper, continue normal execution
+        return; // Not a helper — continue normal execution.
     }
 
-    // Ensure we exit if parent dies (avoids orphan helper processes)
+    // Set a death signal on our immediate parent's exit, as a best-effort net
+    // against orphaned helpers.
+    //
+    // The PRIMARY shutdown mechanism is explicit: `StorageProxy` (in the parent
+    // cfsctl process) `kill()`s this helper when the import finishes or when the
+    // proxy is dropped.  PDEATHSIG is only a fallback for abnormal parent death.
+    //
+    // CAVEAT: when spawned via `podman unshare <exe>`, the helper's PPID is the
+    // intermediate `podman` process, NOT cfsctl.  So PDEATHSIG actually fires on
+    // *podman's* death, not cfsctl's; it does not track cfsctl through the
+    // podman intermediary.  In the normal flow `podman unshare` waits on this
+    // helper, so cfsctl killing the helper (or podman) tears the whole chain
+    // down.  This signal is purely a backstop.
     if let Err(e) = set_parent_process_death_signal(Some(Signal::TERM)) {
-        eprintln!("cstorage helper: failed to set parent death signal: {}", e);
-        // Continue anyway - this is a nice-to-have, not critical
+        eprintln!("cstorage helper: failed to set parent death signal: {e}");
+        // Continue anyway — explicit kill() from the parent is the real guard.
     }
 
-    // We're a helper - stdin is our IPC socket.
-    // Use dup() to get a new owned fd from stdin (fd 0).
-    // This is safe because:
-    // 1. We were spawned with stdin set to a socket
-    // 2. dup() gives us a new fd that we own
-    // 3. We use std::io::stdin().as_fd() which is the safe way to get the fd
-    let stdin_fd = match dup(std::io::stdin().as_fd()) {
+    // stdin is our IPC socket.  dup() it so we hold an owned fd.
+    let stdin_fd: OwnedFd = match dup(std::io::stdin().as_fd()) {
         Ok(fd) => fd,
         Err(e) => {
-            eprintln!("cstorage helper: failed to dup stdin: {}", e);
+            eprintln!("cstorage helper: failed to dup stdin: {e}");
             std::process::exit(1);
         }
     };
     let std_socket = StdUnixStream::from(stdin_fd);
 
-    // Run the helper loop (never returns on success)
-    if let Err(e) = run_helper_loop_blocking(std_socket) {
-        eprintln!("cstorage helper: error in helper loop: {}", e);
+    if let Err(e) = crate::cstor_service::serve_on_socket_blocking(std_socket) {
+        eprintln!("cstorage helper: server error: {e}");
         std::process::exit(1);
     }
     std::process::exit(0);
 }
 
-/// Run the helper loop synchronously by creating a tokio runtime.
-fn run_helper_loop_blocking(std_socket: StdUnixStream) -> std::result::Result<(), HelperError> {
-    // Set non-blocking for tokio
-    std_socket.set_nonblocking(true)?;
-
-    // Create a tokio runtime for the helper
-    let rt = tokio::runtime::Builder::new_current_thread()
-        .enable_all()
-        .build()
-        .map_err(|e| HelperError::Ipc(format!("failed to create tokio runtime: {}", e)))?;
-
-    rt.block_on(run_helper_loop_async(std_socket))
-}
-
-/// Run the helper loop, serving requests from the parent.
-async fn run_helper_loop_async(std_socket: StdUnixStream) -> std::result::Result<(), HelperError> {
-    // Convert std socket to tokio socket
-    let tokio_socket = TokioUnixStream::from_std(std_socket)
-        .map_err(|e| HelperError::Ipc(format!("failed to convert socket: {}", e)))?;
-
-    let transport = UnixSocketTransport::new(tokio_socket);
-    let (mut sender, mut receiver) = transport.split();
-
-    tracing::debug!("userns helper: starting request loop");
-
-    loop {
-        let msg_with_fds = match receiver.receive().await {
-            Ok(m) => m,
-            Err(jsonrpc_fdpass::Error::ConnectionClosed) => {
-                tracing::debug!("userns helper: connection closed");
-                return Ok(());
-            }
-            Err(e) => {
-                return Err(HelperError::Ipc(format!(
-                    "failed to receive message: {}",
-                    e
-                )));
-            }
-        };
-
-        match msg_with_fds.message {
-            JsonRpcMessage::Request(request) => {
-                let id = request.id.clone();
-
-                // Handle stream_layer specially since it needs to send multiple messages
-                if request.method == methods::STREAM_LAYER {
-                    if let Err((code, msg)) = handle_stream_layer(&request, &mut sender).await {
-                        let error = jsonrpc_fdpass::JsonRpcError::owned(code, msg, None::<()>);
-                        let response = JsonRpcResponse::error(error, id);
-                        let message =
-                            MessageWithFds::new(JsonRpcMessage::Response(response), vec![]);
-                        sender.send(message).await.map_err(|e| {
-                            HelperError::Ipc(format!("failed to send error response: {}", e))
-                        })?;
-                    }
-                    // Success response is sent by handle_stream_layer
-                    continue;
-                }
-
-                let (result, fds) = handle_request(&request);
-
-                match result {
-                    Ok(response_value) => {
-                        let response = JsonRpcResponse::success(response_value, id);
-                        let message = MessageWithFds::new(JsonRpcMessage::Response(response), fds);
-                        sender.send(message).await.map_err(|e| {
-                            HelperError::Ipc(format!("failed to send response: {}", e))
-                        })?;
-                    }
-                    Err((code, message_str)) => {
-                        let error =
-                            jsonrpc_fdpass::JsonRpcError::owned(code, message_str, None::<()>);
-                        let response = JsonRpcResponse::error(error, id);
-                        let message =
-                            MessageWithFds::new(JsonRpcMessage::Response(response), vec![]);
-                        sender.send(message).await.map_err(|e| {
-                            HelperError::Ipc(format!("failed to send error response: {}", e))
-                        })?;
-                    }
-                }
-
-                // Check for shutdown request (handle after sending response)
-                if request.method == methods::SHUTDOWN {
-                    tracing::debug!("userns helper: received shutdown request");
-                    return Ok(());
-                }
-            }
-            JsonRpcMessage::Notification(notif) => {
-                if notif.method == methods::SHUTDOWN {
-                    tracing::debug!("userns helper: received shutdown notification");
-                    return Ok(());
-                }
-                // Ignore other notifications
-            }
-            JsonRpcMessage::Response(_) => {
-                // Unexpected response - ignore
-            }
-        }
-    }
-}
-
-/// Handle stream_layer request - sends multiple notifications with fds.
-async fn handle_stream_layer(
-    request: &JsonRpcRequest,
-    sender: &mut jsonrpc_fdpass::transport::Sender,
-) -> std::result::Result<(), (i32, String)> {
-    let params: StreamLayerParams = request
-        .params
-        .as_ref()
-        .and_then(|p| serde_json::from_value(p.clone()).ok())
-        .ok_or((
-            error_codes::INVALID_PARAMS,
-            "invalid params for streamLayer".to_string(),
-        ))?;
-
-    let storage = Storage::open(&params.storage_path).map_err(|e| {
-        (
-            error_codes::INTERNAL_ERROR,
-            format!("failed to open storage: {}", e),
-        )
-    })?;
-
-    let layer = Layer::open(&storage, &params.layer_id).map_err(|e| {
-        (
-            error_codes::RESOURCE_NOT_FOUND,
-            format!("layer not found: {}", e),
-        )
-    })?;
-
-    let mut stream = TarSplitFdStream::new(&storage, &layer).map_err(|e| {
-        (
-            error_codes::INTERNAL_ERROR,
-            format!("failed to create tar-split stream: {}", e),
-        )
-    })?;
-
-    let mut items_sent = 0usize;
-
-    // Stream all items as notifications
-    while let Some(item) = stream
-        .next()
-        .map_err(|e| (error_codes::INTERNAL_ERROR, format!("stream error: {}", e)))?
-    {
-        match item {
-            TarSplitItem::Segment(bytes) => {
-                // Send segment as base64-encoded notification
-                let params = StreamSegmentNotification {
-                    data: BASE64_STANDARD.encode(&bytes),
-                };
-                let notif = jsonrpc_fdpass::JsonRpcNotification::new(
-                    "stream.segment".to_string(),
-                    Some(serde_json::to_value(&params).unwrap()),
-                );
-                let message = MessageWithFds::new(JsonRpcMessage::Notification(notif), vec![]);
-                sender.send(message).await.map_err(|e| {
-                    (
-                        error_codes::INTERNAL_ERROR,
-                        format!("failed to send segment: {}", e),
-                    )
-                })?;
-                items_sent += 1;
-            }
-            TarSplitItem::FileContent { fd, size, name } => {
-                // Send file notification with fd
-                let params = StreamFileNotification { name, size };
-                let notif = jsonrpc_fdpass::JsonRpcNotification::new(
-                    "stream.file".to_string(),
-                    Some(serde_json::to_value(&params).unwrap()),
-                );
-                let message = MessageWithFds::new(JsonRpcMessage::Notification(notif), vec![fd]);
-                sender.send(message).await.map_err(|e| {
-                    (
-                        error_codes::INTERNAL_ERROR,
-                        format!("failed to send file: {}", e),
-                    )
-                })?;
-                items_sent += 1;
-            }
-        }
-    }
-
-    // Send success response
-    let result = StreamLayerResult { items_sent };
-    let response =
-        JsonRpcResponse::success(serde_json::to_value(result).unwrap(), request.id.clone());
-    let message = MessageWithFds::new(JsonRpcMessage::Response(response), vec![]);
-    sender.send(message).await.map_err(|e| {
-        (
-            error_codes::INTERNAL_ERROR,
-            format!("failed to send response: {}", e),
-        )
-    })?;
-
-    Ok(())
-}
-
-/// Handle a JSON-RPC request.
-fn handle_request(
-    request: &JsonRpcRequest,
-) -> (
-    std::result::Result<serde_json::Value, (i32, String)>,
-    Vec<OwnedFd>,
-) {
-    match request.method.as_str() {
-        methods::OPEN_FILE => {
-            let params: OpenFileParams = match request
-                .params
-                .as_ref()
-                .and_then(|p| serde_json::from_value(p.clone()).ok())
-            {
-                Some(p) => p,
-                None => {
-                    return (
-                        Err((
-                            error_codes::INVALID_PARAMS,
-                            "invalid params: missing 'path' field".to_string(),
-                        )),
-                        vec![],
-                    );
-                }
-            };
-
-            match std::fs::File::open(&params.path) {
-                Ok(file) => {
-                    let fd: OwnedFd = file.into();
-                    let result = OpenFileResult { success: true };
-                    (Ok(serde_json::to_value(result).unwrap()), vec![fd])
-                }
-                Err(e) => (
-                    Err((
-                        error_codes::INTERNAL_ERROR,
-                        format!("failed to open file: {}", e),
-                    )),
-                    vec![],
-                ),
-            }
-        }
-        methods::LIST_IMAGES => handle_list_images(request),
-        methods::GET_IMAGE => handle_get_image(request),
-        methods::SHUTDOWN => {
-            // Just return success - the loop will exit after sending the response
-            (Ok(serde_json::json!({"success": true})), vec![])
-        }
-        _ => (
-            Err((
-                error_codes::METHOD_NOT_FOUND,
-                format!("method not found: {}", request.method),
-            )),
-            vec![],
-        ),
-    }
-}
-
-/// Handle list_images request.
-fn handle_list_images(
-    request: &JsonRpcRequest,
-) -> (
-    std::result::Result<serde_json::Value, (i32, String)>,
-    Vec<OwnedFd>,
-) {
-    let params: ListImagesParams = match request
-        .params
-        .as_ref()
-        .and_then(|p| serde_json::from_value(p.clone()).ok())
-    {
-        Some(p) => p,
-        None => {
-            return (
-                Err((
-                    error_codes::INVALID_PARAMS,
-                    "invalid params for listImages".to_string(),
-                )),
-                vec![],
-            );
-        }
-    };
-
-    let storage = match Storage::open(&params.storage_path) {
-        Ok(s) => s,
-        Err(e) => {
-            return (
-                Err((
-                    error_codes::INTERNAL_ERROR,
-                    format!("failed to open storage: {}", e),
-                )),
-                vec![],
-            );
-        }
-    };
-
-    let images = match storage.list_images() {
-        Ok(imgs) => imgs,
-        Err(e) => {
-            return (
-                Err((
-                    error_codes::INTERNAL_ERROR,
-                    format!("failed to list images: {}", e),
-                )),
-                vec![],
-            );
-        }
-    };
-
-    let image_infos: Vec<ImageInfo> = images
-        .iter()
-        .map(|img| ImageInfo {
-            id: img.id().to_string(),
-            names: img.names(&storage).unwrap_or_default(),
-        })
-        .collect();
-
-    let result = ListImagesResult {
-        images: image_infos,
-    };
-    (Ok(serde_json::to_value(result).unwrap()), vec![])
-}
-
-/// Handle get_image request.
-fn handle_get_image(
-    request: &JsonRpcRequest,
-) -> (
-    std::result::Result<serde_json::Value, (i32, String)>,
-    Vec<OwnedFd>,
-) {
-    let params: GetImageParams = match request
-        .params
-        .as_ref()
-        .and_then(|p| serde_json::from_value(p.clone()).ok())
-    {
-        Some(p) => p,
-        None => {
-            return (
-                Err((
-                    error_codes::INVALID_PARAMS,
-                    "invalid params for getImage".to_string(),
-                )),
-                vec![],
-            );
-        }
-    };
-
-    let storage = match Storage::open(&params.storage_path) {
-        Ok(s) => s,
-        Err(e) => {
-            return (
-                Err((
-                    error_codes::INTERNAL_ERROR,
-                    format!("failed to open storage: {}", e),
-                )),
-                vec![],
-            );
-        }
-    };
-
-    // Try by ID first, then by name
-    let image = match crate::image::Image::open(&storage, &params.image_ref) {
-        Ok(img) => img,
-        Err(_) => match storage.find_image_by_name(&params.image_ref) {
-            Ok(img) => img,
-            Err(e) => {
-                return (
-                    Err((
-                        error_codes::RESOURCE_NOT_FOUND,
-                        format!("image not found: {}", e),
-                    )),
-                    vec![],
-                );
-            }
-        },
-    };
-
-    let config = match image.config() {
-        Ok(cfg) => cfg,
-        Err(e) => {
-            return (
-                Err((
-                    error_codes::INTERNAL_ERROR,
-                    format!("failed to read config: {}", e),
-                )),
-                vec![],
-            );
-        }
-    };
-
-    let diff_ids: Vec<oci_spec::image::Digest> = config
-        .rootfs()
-        .diff_ids()
-        .iter()
-        .map(|s| s.parse().expect("config diff_id should be valid digest"))
-        .collect();
-
-    let storage_layer_ids = match image.storage_layer_ids(std::slice::from_ref(&storage)) {
-        Ok(ids) => ids,
-        Err(e) => {
-            return (
-                Err((
-                    error_codes::INTERNAL_ERROR,
-                    format!("failed to get storage layer IDs: {}", e),
-                )),
-                vec![],
-            );
-        }
-    };
-
-    let result = GetImageResult {
-        id: image.id().to_string(),
-        names: image.names(&storage).unwrap_or_default(),
-        layer_diff_ids: diff_ids,
-        storage_layer_ids,
-    };
-    (Ok(serde_json::to_value(result).unwrap()), vec![])
-}
-
-/// Proxy for accessing files via the userns helper process.
+/// Proxy for accessing storage content via the userns helper process.
+///
+/// When the caller cannot bypass file permissions, `StorageProxy::spawn()`
+/// starts a helper process inside a user namespace using `podman unshare` and
+/// returns a connected [`zlink::unix::Connection`] the caller can use with the
+/// [`composefs_oci::varlink_types::OciProxy`] trait to stream layers.
 ///
-/// This spawns a helper process (via `podman unshare`) that runs inside a
-/// user namespace and can read files with restrictive permissions. File
-/// descriptors are passed back via SCM_RIGHTS.
+/// # Dependency on `podman`
+///
+/// This type requires the `podman` binary to be present on `PATH`.  If
+/// `podman` is not found, [`StorageProxy::spawn`] returns
+/// [`HelperError::PodmanNotFound`].
+///
+/// # Liveness
+///
+/// The helper subprocess is shut down explicitly: [`StorageProxy::shutdown`]
+/// (and the `Drop` impl) `kill()` the child once the caller is done.  A
+/// best-effort `PR_SET_PDEATHSIG` in the helper is a fallback for abnormal
+/// parent death (see `init_if_helper`).
+///
+/// Dropping this struct kills the child process.
 pub struct StorageProxy {
-    child: Child,
-    sender: jsonrpc_fdpass::transport::Sender,
-    receiver: jsonrpc_fdpass::transport::Receiver,
-    next_id: u64,
+    /// The spawned helper child process.  Wrapped in `Option` so that
+    /// `shutdown` can take ownership and call `wait()` without fighting the
+    /// `Drop` impl.
+    child: Option<Child>,
+    conn: zlink::unix::Connection,
 }
 
 impl std::fmt::Debug for StorageProxy {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         f.debug_struct("StorageProxy")
-            .field("child_pid", &self.child.id())
+            .field("child_pid", &self.child.as_ref().map(|c| c.id()))
             .finish_non_exhaustive()
     }
 }
 
 impl StorageProxy {
-    /// Spawn a userns helper process.
+    /// Spawn a userns helper process if required.
     ///
-    /// If the current process can already bypass file permissions (running as
-    /// root or has CAP_DAC_OVERRIDE), this returns `Ok(None)` since no helper
-    /// is needed.
-    pub async fn spawn() -> std::result::Result<Option<Self>, HelperError> {
-        // Check if we even need a helper
+    /// Returns `Ok(None)` when the current process can already bypass file
+    /// permissions (running as root, or has `CAP_DAC_OVERRIDE`), since no
+    /// helper is needed in that case.
+    pub async fn spawn() -> Result<Option<Self>, HelperError> {
         if can_bypass_file_permissions() {
             return Ok(None);
         }
-
         Self::spawn_helper().await.map(Some)
     }
 
-    /// Spawn the helper unconditionally.
-    async fn spawn_helper() -> std::result::Result<Self, HelperError> {
+    /// Spawn the helper unconditionally using `/proc/self/exe`.
+    async fn spawn_helper() -> Result<Self, HelperError> {
         let exe = std::fs::read_link("/proc/self/exe").map_err(HelperError::Io)?;
         Self::spawn_helper_with_binary(exe).await
     }
 
-    /// Spawn the helper with a specific binary path.
+    /// Spawn the helper with an explicit binary path.
     ///
-    /// This is used when the default /proc/self/exe is not suitable,
-    /// such as when running from a test harness.
-    async fn spawn_helper_with_binary(
-        exe: std::path::PathBuf,
-    ) -> std::result::Result<Self, HelperError> {
-        // Create a socket pair - one end for us, one for the child's stdin
+    /// Useful when `/proc/self/exe` is not the right choice (e.g. test
+    /// harnesses that run under a wrapper binary).
+    async fn spawn_helper_with_binary(exe: std::path::PathBuf) -> Result<Self, HelperError> {
+        // Create a socket pair — one end becomes the child's stdin, the other
+        // stays in the parent for the zlink connection.
         let (parent_sock, child_sock) = StdUnixStream::pair().map_err(HelperError::Socket)?;
 
-        // Spawn via podman unshare, with child_sock as the child's stdin.
-        // We use `env` to set the HELPER_ENV because podman unshare doesn't
-        // propagate the parent's environment to the inner command.
+        // Spawn via `podman unshare`; set HELPER_ENV because podman unshare
+        // does not automatically forward the parent's environment.
+        //
+        // `podman` must be on PATH — see [`HelperError::PodmanNotFound`].
         let child = Command::new("podman")
             .arg("unshare")
             .arg("env")
-            .arg(format!("{}=1", HELPER_ENV))
+            .arg(format!("{HELPER_ENV}=1"))
             .arg(&exe)
             .stdin(Stdio::from(OwnedFd::from(child_sock)))
             .stdout(Stdio::inherit())
             .stderr(Stdio::inherit())
             .spawn()
-            .map_err(HelperError::Spawn)?;
+            .map_err(|e| {
+                if e.kind() == std::io::ErrorKind::NotFound {
+                    HelperError::PodmanNotFound(e)
+                } else {
+                    HelperError::Spawn(e)
+                }
+            })?;
 
-        // Convert our socket to async
+        // Wrap the parent socket end as a zlink Connection.
         parent_sock.set_nonblocking(true)?;
-        let tokio_socket = TokioUnixStream::from_std(parent_sock)
-            .map_err(|e| HelperError::Ipc(format!("failed to convert socket: {}", e)))?;
-
-        let transport = UnixSocketTransport::new(tokio_socket);
-        let (sender, receiver) = transport.split();
+        let tok = tokio::net::UnixStream::from_std(parent_sock)
+            .map_err(|e| HelperError::Ipc(format!("failed to convert socket: {e}")))?;
+        let zs = zlink::unix::Stream::from(tok);
+        let conn = zlink::Connection::from(zs);
 
         Ok(Self {
-            child,
-            sender,
-            receiver,
-            next_id: 1,
+            child: Some(child),
+            conn,
         })
     }
 
-    /// Open a file via the helper, returning its fd.
-    ///
-    /// # Arguments
-    ///
-    /// * `path` - The path to open (should be absolute)
+    /// Return a mutable reference to the underlying zlink connection.
     ///
-    /// # Returns
-    ///
-    /// The opened file descriptor, which can be used for reading.
-    pub async fn open_file(
-        &mut self,
-        path: impl AsRef<Path>,
-    ) -> std::result::Result<OwnedFd, HelperError> {
-        let params = OpenFileParams {
-            path: path.as_ref().to_string_lossy().to_string(),
-        };
-
-        let id = self.next_id;
-        self.next_id += 1;
-
-        let request = JsonRpcRequest::new(
-            methods::OPEN_FILE.to_string(),
-            Some(serde_json::to_value(&params).unwrap()),
-            serde_json::Value::Number(id.into()),
-        );
-
-        let message = MessageWithFds::new(JsonRpcMessage::Request(request), vec![]);
-        self.sender
-            .send(message)
-            .await
-            .map_err(|e| HelperError::Ipc(format!("failed to send request: {}", e)))?;
-
-        // Receive response
-        let response = self
-            .receiver
-            .receive()
-            .await
-            .map_err(|e| HelperError::Ipc(format!("failed to receive response: {}", e)))?;
-
-        match response.message {
-            JsonRpcMessage::Response(resp) => {
-                if let Some(error) = resp.error {
-                    return Err(HelperError::RpcError {
-                        code: error.code(),
-                        message: error.message().to_string(),
-                    });
-                }
-
-                // The fd should be in the response
-                if response.file_descriptors.is_empty() {
-                    return Err(HelperError::Ipc(
-                        "response missing file descriptor".to_string(),
-                    ));
-                }
-
-                Ok(response.file_descriptors.into_iter().next().unwrap())
-            }
-            other => Err(HelperError::Ipc(format!(
-                "unexpected message type: {:?}",
-                other
-            ))),
-        }
-    }
-
-    /// Shutdown the helper process gracefully.
-    pub async fn shutdown(mut self) -> std::result::Result<(), HelperError> {
-        let id = self.next_id;
-
-        let request = JsonRpcRequest::new(
-            methods::SHUTDOWN.to_string(),
-            None,
-            serde_json::Value::Number(id.into()),
-        );
-
-        let message = MessageWithFds::new(JsonRpcMessage::Request(request), vec![]);
-        // Ignore send errors - the child may have already exited
-        let _ = self.sender.send(message).await;
-
-        // Wait for the child to exit
-        let _ = self.child.wait();
-
-        Ok(())
-    }
-
-    /// List images in storage via the helper.
-    pub async fn list_images(
-        &mut self,
-        storage_path: &str,
-    ) -> std::result::Result<Vec<ImageInfo>, HelperError> {
-        let params = ListImagesParams {
-            storage_path: storage_path.to_string(),
-        };
-        let result: ListImagesResult = self.call(methods::LIST_IMAGES, &params).await?;
-        Ok(result.images)
+    /// Callers use this with the `OciProxy` trait (from
+    /// `composefs_oci::varlink_types`) to drive `GetLayer` calls over the
+    /// helper connection.
+    pub fn connection(&mut self) -> &mut zlink::unix::Connection {
+        &mut self.conn
     }
 
-    /// Get image information via the helper.
-    pub async fn get_image(
-        &mut self,
-        storage_path: &str,
-        image_ref: &str,
-    ) -> std::result::Result<GetImageResult, HelperError> {
-        let params = GetImageParams {
-            storage_path: storage_path.to_string(),
-            image_ref: image_ref.to_string(),
-        };
-        self.call(methods::GET_IMAGE, &params).await
-    }
-
-    /// Start streaming a layer's tar-split content.
-    ///
-    /// Returns a stream that yields `ProxiedTarSplitItem`s. The helper sends
-    /// notifications with file descriptors for each file in the layer.
-    pub async fn stream_layer(
-        &mut self,
-        storage_path: &str,
-        layer_id: &str,
-    ) -> std::result::Result<ProxiedLayerStream<'_>, HelperError> {
-        let params = StreamLayerParams {
-            storage_path: storage_path.to_string(),
-            layer_id: layer_id.to_string(),
-        };
-
-        let id = self.next_id;
-        self.next_id += 1;
-
-        let request = JsonRpcRequest::new(
-            methods::STREAM_LAYER.to_string(),
-            Some(serde_json::to_value(&params).unwrap()),
-            serde_json::Value::Number(id.into()),
-        );
-
-        let message = MessageWithFds::new(JsonRpcMessage::Request(request), vec![]);
-        self.sender
-            .send(message)
-            .await
-            .map_err(|e| HelperError::Ipc(format!("failed to send stream_layer request: {}", e)))?;
-
-        Ok(ProxiedLayerStream {
-            receiver: &mut self.receiver,
-            request_id: id,
-            finished: false,
-        })
-    }
-
-    /// Make an RPC call and parse the response.
-    async fn call<P: Serialize, R: for<'de> Deserialize<'de>>(
-        &mut self,
-        method: &str,
-        params: &P,
-    ) -> std::result::Result<R, HelperError> {
-        let id = self.next_id;
-        self.next_id += 1;
-
-        let request = JsonRpcRequest::new(
-            method.to_string(),
-            Some(serde_json::to_value(params).unwrap()),
-            serde_json::Value::Number(id.into()),
-        );
-
-        let message = MessageWithFds::new(JsonRpcMessage::Request(request), vec![]);
-        self.sender
-            .send(message)
-            .await
-            .map_err(|e| HelperError::Ipc(format!("failed to send request: {}", e)))?;
-
-        // Receive response
-        let response = self
-            .receiver
-            .receive()
-            .await
-            .map_err(|e| HelperError::Ipc(format!("failed to receive response: {}", e)))?;
-
-        match response.message {
-            JsonRpcMessage::Response(resp) => {
-                if let Some(error) = resp.error {
-                    return Err(HelperError::RpcError {
-                        code: error.code(),
-                        message: error.message().to_string(),
-                    });
-                }
-
-                let result = resp
-                    .result
-                    .ok_or_else(|| HelperError::Ipc("response missing result".to_string()))?;
-
-                serde_json::from_value(result)
-                    .map_err(|e| HelperError::Ipc(format!("failed to parse result: {}", e)))
-            }
-            other => Err(HelperError::Ipc(format!(
-                "unexpected message type: {:?}",
-                other
-            ))),
-        }
-    }
-}
-
-/// Item received from a proxied layer stream.
-#[derive(Debug)]
-pub enum ProxiedTarSplitItem {
-    /// Raw segment bytes (tar header/padding).
-    Segment(Vec<u8>),
-    /// File content with metadata and fd.
-    FileContent {
-        /// File descriptor for the content.
-        fd: OwnedFd,
-        /// File size.
-        size: u64,
-        /// File name/path.
-        name: String,
-    },
-}
-
-/// Stream of tar-split items received via the helper proxy.
-pub struct ProxiedLayerStream<'a> {
-    receiver: &'a mut jsonrpc_fdpass::transport::Receiver,
-    request_id: u64,
-    finished: bool,
-}
-
-impl std::fmt::Debug for ProxiedLayerStream<'_> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("ProxiedLayerStream")
-            .field("request_id", &self.request_id)
-            .field("finished", &self.finished)
-            .finish_non_exhaustive()
-    }
-}
-
-impl<'a> ProxiedLayerStream<'a> {
-    /// Get the next item from the stream.
+    /// Shut down the helper.
     ///
-    /// Returns `None` when the stream is complete.
-    pub async fn next(&mut self) -> std::result::Result<Option<ProxiedTarSplitItem>, HelperError> {
-        if self.finished {
-            return Ok(None);
-        }
-
-        let msg_with_fds = match self.receiver.receive().await {
-            Ok(m) => m,
-            Err(jsonrpc_fdpass::Error::ConnectionClosed) => {
-                self.finished = true;
-                return Ok(None);
-            }
-            Err(e) => {
-                return Err(HelperError::Ipc(format!("failed to receive: {}", e)));
-            }
-        };
-
-        let mut fds = msg_with_fds.file_descriptors;
-
-        match msg_with_fds.message {
-            JsonRpcMessage::Notification(notif) => {
-                let params = notif.params.unwrap_or(serde_json::Value::Null);
-
-                match notif.method.as_str() {
-                    "stream.segment" => {
-                        let seg: StreamSegmentNotification = serde_json::from_value(params)
-                            .map_err(|e| {
-                                HelperError::Ipc(format!("invalid segment params: {}", e))
-                            })?;
-
-                        let bytes = BASE64_STANDARD.decode(&seg.data).map_err(|e| {
-                            HelperError::Ipc(format!("failed to decode segment: {}", e))
-                        })?;
-
-                        Ok(Some(ProxiedTarSplitItem::Segment(bytes)))
-                    }
-                    "stream.file" => {
-                        let file: StreamFileNotification = serde_json::from_value(params)
-                            .map_err(|e| HelperError::Ipc(format!("invalid file params: {}", e)))?;
-
-                        if fds.is_empty() {
-                            return Err(HelperError::Ipc(
-                                "file notification missing fd".to_string(),
-                            ));
-                        }
-
-                        let fd = fds.remove(0);
-                        Ok(Some(ProxiedTarSplitItem::FileContent {
-                            fd,
-                            size: file.size,
-                            name: file.name,
-                        }))
-                    }
-                    other => Err(HelperError::Ipc(format!(
-                        "unknown notification method: {}",
-                        other
-                    ))),
-                }
-            }
-            JsonRpcMessage::Response(resp) => {
-                // Final response - stream is complete
-                self.finished = true;
-
-                if let Some(error) = resp.error {
-                    return Err(HelperError::RpcError {
-                        code: error.code(),
-                        message: error.message().to_string(),
-                    });
-                }
-
-                Ok(None)
-            }
-            JsonRpcMessage::Request(_) => Err(HelperError::Ipc(
-                "unexpected request from helper".to_string(),
-            )),
+    /// Kills the child process and waits for it to exit.  Dropping this
+    /// struct also kills the child (via `Drop`), but `shutdown` additionally
+    /// waits for the process to be reaped so no zombie is left behind.  This
+    /// explicit `kill()` is the helper's primary shutdown mechanism.
+    pub async fn shutdown(mut self) -> Result<(), HelperError> {
+        // Take the child out so Drop sees None and skips the redundant kill().
+        if let Some(mut child) = self.child.take() {
+            drop(self); // drop conn first
+            let _ = tokio::task::spawn_blocking(move || {
+                let _ = child.kill();
+                child.wait()
+            })
+            .await;
         }
+        Ok(())
     }
 }
 
 impl Drop for StorageProxy {
     fn drop(&mut self) {
-        // Try to kill the child if it's still running
-        let _ = self.child.kill();
+        // Best-effort: kill the child if it is still running.
+        if let Some(ref mut child) = self.child {
+            let _ = child.kill();
+        }
     }
 }
diff --git a/crates/composefs/src/splitstream.rs b/crates/composefs/src/splitstream.rs
index 2f827749..fdb65b90 100644
--- a/crates/composefs/src/splitstream.rs
+++ b/crates/composefs/src/splitstream.rs
@@ -1051,6 +1051,39 @@ impl<ObjectID: FsVerityHashValue> SplitStreamReader<ObjectID> {
         }
     }
 
+    /// Walk the stream one chunk at a time, invoking `callback` for each
+    /// inline span and each external object reference, in stream order.
+    ///
+    /// This is the reference-preserving analogue of [`Self::cat`]: where `cat`
+    /// resolves external objects by copying their bytes, this yields the
+    /// object id so callers can re-emit a reference instead.
+    ///
+    /// The callback receives a [`SplitStreamData`] value for each chunk.
+    /// Inline chunks carry their raw bytes; external chunks carry the object
+    /// identifier so the caller can look up the object by whatever means it
+    /// prefers (e.g. emitting a filename reference into a `splitdirfdstream`).
+    #[context("Walking splitstream chunks")]
+    pub fn for_each_chunk(
+        &mut self,
+        mut callback: impl FnMut(SplitStreamData<ObjectID>) -> Result<()>,
+    ) -> Result<()> {
+        let mut buffer = vec![];
+
+        loop {
+            match self.ensure_chunk(true, true, 0)? {
+                ChunkType::Eof => break Ok(()),
+                ChunkType::Inline => {
+                    read_into_vec(&mut self.decoder, &mut buffer, self.inline_bytes)?;
+                    self.inline_bytes = 0;
+                    callback(SplitStreamData::Inline(buffer[..].into()))?;
+                }
+                ChunkType::External(id) => {
+                    callback(SplitStreamData::External(id))?;
+                }
+            }
+        }
+    }
+
     /// Traverses the split stream and calls the callback for each object reference.
     ///
     /// This includes both references from the digest map and external references in the stream.