From 80893d4c2fdc0541e3330713a8a782f4950124a0 Mon Sep 17 00:00:00 2001 From: jebradbury39 Date: Thu, 6 Nov 2025 17:08:22 -0800 Subject: [PATCH 1/8] wip - debugging Signed-off-by: jebradbury39 --- src/archive.rs | 10 +- src/builder.rs | 491 +++++++++++++++++++++++++++++++++++++++---------- src/lib.rs | 2 +- tests/all.rs | 295 ++++++++++++++++++++++++++++- 4 files changed, 700 insertions(+), 98 deletions(-) diff --git a/src/archive.rs b/src/archive.rs index 36da7b6d..11b7af00 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -482,10 +482,11 @@ impl<'a> EntriesFields<'a> { let off = block.offset()?; let len = block.length()?; if len != 0 && (size - remaining) % BLOCK_SIZE != 0 { - return Err(other( + panic!("len={len}, size={size}, rem={remaining}"); + /*return Err(other( "previous block in sparse file was not \ aligned to 512-byte boundary", - )); + ));*/ } else if off < cur { return Err(other( "out of order or overlapping sparse \ @@ -498,6 +499,11 @@ impl<'a> EntriesFields<'a> { cur = off .checked_add(len) .ok_or_else(|| other("more bytes listed in sparse file than u64 can hold"))?; + + if len > remaining { + panic!("size={}, len={len}, remaining={remaining}", entry.size); + } + remaining = remaining.checked_sub(len).ok_or_else(|| { other( "sparse file consumed more data than the header \ diff --git a/src/builder.rs b/src/builder.rs index 88164c88..04748952 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -27,6 +27,140 @@ struct BuilderOptions { sparse: bool, } +/// Describes sparse-aware seek targets used by [`SeekSparse::seek_sparse`]. +/// +/// These variants mirror the Linux-style `SEEK_DATA` / `SEEK_HOLE` semantics +/// so callers can iterate over alternating data and hole segments inside a +/// sparse object. The offsets refer to the logical size of the data source. +pub enum SeekFromSparse { + /// Adjust the seekable-object offset to the next location in the seekable-object + /// greater than or equal to offset containing data. If offset + /// points to data, then the seekable-object offset is set to offset. + /// + /// If there is no more data to be found, this will result in an `UnexpectedEof`. + NextData(u64), + /// Adjust the seekable-object offset to the next hole in the seekable-object greater + /// than or equal to offset. If offset points into the middle + /// of a hole, then the seekable-object offset is set to offset. If there + /// is no hole past offset, then the seekable-object offset is adjusted to + /// the end of the seekable-object (i.e., there is an implicit hole at the + /// end of any seekable-object). + /// + /// If the offset is beyond the end of the seekable-object, this will result in an `UnexpectedEof`. + NextHole(u64), +} + +/// A helper trait for data sources that can describe sparse regions. +/// +/// Types implementing this trait allow [`Builder`] to skip reading empty +/// segments when constructing sparse archive entries via +/// [`Builder::append_sparse_data`]. +pub trait SeekSparse { + /// Seeks to the offset, and returns the new actual offset of the seekable-object + fn seek_sparse(&mut self, pos: SeekFromSparse) -> std::io::Result; + + /// Returns the logical size (size of holes + data) + fn logical_size(&self) -> u64; + + /// Returns the size of just the data + fn data_size(&self) -> u64; +} + +struct FileWithMetadata<'a> { + file: &'a mut fs::File, + stat: &'a fs::Metadata, +} + +impl<'a> Read for FileWithMetadata<'a> { + #[inline] + fn read(&mut self, buf: &mut [u8]) -> io::Result { + self.file.read(buf) + } +} + +impl<'a> Seek for FileWithMetadata<'a> { + #[inline] + fn seek(&mut self, pos: io::SeekFrom) -> io::Result { + self.file.seek(pos) + } +} + +#[cfg(any(target_os = "android", target_os = "freebsd", target_os = "linux"))] +impl<'a> SeekSparse for FileWithMetadata<'a> { + fn seek_sparse(&mut self, pos: SeekFromSparse) -> std::io::Result { + use std::os::unix::io::AsRawFd as _; + + fn lseek(file: &fs::File, offset: i64, whence: libc::c_int) -> Result { + #[cfg(any(target_os = "linux", target_os = "android"))] + let lseek = libc::lseek64; + #[cfg(not(any(target_os = "linux", target_os = "android")))] + let lseek = libc::lseek; + + match unsafe { lseek(file.as_raw_fd(), offset, whence) } { + -1 => Err(io::Error::last_os_error().raw_os_error().unwrap()), + off => Ok(off), + } + } + + // On most Unixes, we need to read `_PC_MIN_HOLE_SIZE` to see if the file + // system supports `SEEK_HOLE`. + // FreeBSD: https://man.freebsd.org/cgi/man.cgi?query=lseek&sektion=2&manpath=FreeBSD+14.1-STABLE + #[cfg(not(any(target_os = "linux", target_os = "android")))] + if unsafe { libc::fpathconf(file.as_raw_fd(), libc::_PC_MIN_HOLE_SIZE) } == -1 { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "SEEK_HOLE is not supported for this filesystem", + )); + } + + let (off, whence) = match pos { + SeekFromSparse::NextData(offset) => { + let offset: i64 = offset.try_into().map_err(|_error| { + io::Error::new( + io::ErrorKind::InvalidInput, + "seek data offset exceeds i64 max", + ) + })?; + (offset, libc::SEEK_DATA) + } + SeekFromSparse::NextHole(offset) => { + let offset: i64 = offset.try_into().map_err(|_error| { + io::Error::new( + io::ErrorKind::InvalidInput, + "seek hole offset exceeds i64 max", + ) + })?; + (offset, libc::SEEK_HOLE) + } + }; + + match lseek(self.file, off, whence) { + Ok(off) => off.try_into().map_err(|_error| { + io::Error::new( + io::ErrorKind::InvalidInput, + "seek sparse result offset is negative", + ) + }), + Err(libc::ENXIO) => Err(io::Error::new(io::ErrorKind::UnexpectedEof, "hit ENXIO")), + Err(err) => Err(io::Error::from_raw_os_error(err)), + } + } + + #[inline] + fn logical_size(&self) -> u64 { + use std::os::unix::fs::MetadataExt; + + self.stat.size() + } + + #[inline] + fn data_size(&self) -> u64 { + use std::os::unix::fs::MetadataExt; + + self.stat.blocks() + } +} + impl Builder { /// Create a new archive builder with the underlying object as the /// destination of all data written. The builder will use @@ -184,6 +318,108 @@ impl Builder { self.append(header, data) } + /// Adds a new sparse entry to this archive with the specified path. + /// + /// This function behaves like [`Self::append_data`], but it leverages the + /// [`SeekSparse`] implementation on the provided reader to avoid copying + /// data that resides in sparse regions. When the reader reports no sparse + /// blocks (or when sparse handling is disabled), this method falls back to + /// writing the entry as a regular file. + /// + /// As with other append methods, the header's size must match the logical + /// size of the entry, and the checksum will be recalculated after the path + /// metadata is prepared. Call [`Self::finish`] once all entries have been + /// written to complete the archive. + /// + /// # Errors + /// + /// Returns any I/O error encountered while preparing headers or copying + /// data from the sparse reader into the underlying writer. + /// + /// # Examples + /// + /// ``` + /// use std::io::{Cursor, Read, Seek, SeekFrom}; + /// use tar::{Builder, Header, SeekFromSparse, SeekSparse}; + /// + /// struct SparseCursor { + /// cursor: Cursor>, + /// hole_start: u64, + /// hole_len: u64, + /// } + /// + /// impl Read for SparseCursor { + /// fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + /// self.cursor.read(buf) + /// } + /// } + /// + /// impl Seek for SparseCursor { + /// fn seek(&mut self, pos: SeekFrom) -> std::io::Result { + /// self.cursor.seek(pos) + /// } + /// } + /// + /// impl SeekSparse for SparseCursor { + /// fn seek_sparse(&mut self, pos: SeekFromSparse) -> std::io::Result { + /// match pos { + /// SeekFromSparse::NextData(offset) => { + /// if offset < self.hole_start { + /// self.cursor.seek(SeekFrom::Start(offset)) + /// } else { + /// Err(std::io::ErrorKind::UnexpectedEof.into()) + /// } + /// } + /// SeekFromSparse::NextHole(offset) => { + /// if offset <= self.hole_start { + /// self.cursor.seek(SeekFrom::Start(self.hole_start)) + /// } else { + /// Err(std::io::ErrorKind::UnexpectedEof.into()) + /// } + /// } + /// } + /// } + /// + /// fn logical_size(&self) -> u64 { + /// self.cursor.get_ref().len() as u64 + self.hole_len + /// } + /// + /// fn data_size(&self) -> u64 { + /// self.cursor.get_ref().len() as u64 + /// } + /// } + /// + /// let mut header = Header::new_gnu(); + /// header.set_mode(0o644); + /// + /// let data = SparseCursor { + /// cursor: Cursor::new(b"hello world".to_vec()), + /// hole_start: 11, + /// hole_len: 13, + /// }; + /// + /// let mut ar = Builder::new(Vec::new()); + /// ar.append_sparse_data(&mut header, "file.txt", data).unwrap(); + /// let _archive = ar.into_inner().unwrap(); + /// ``` + pub fn append_sparse_data, R: Read + Seek + SeekSparse>( + &mut self, + header: &mut Header, + path: P, + mut data: R, + ) -> io::Result<()> { + if self.options.sparse { + prepare_header_path(self.get_mut(), header, path.as_ref())?; + header.set_size(data.logical_size()); + let sparse_entries = prepare_header_sparse_generic(&mut data, header)?; + header.set_cksum(); + + append_sparse(self.get_mut(), header, sparse_entries, &mut data) + } else { + self.append_data(header, path, data) + } + } + /// Adds a new entry to this archive and returns an [`EntryWriter`] for /// adding its contents. /// @@ -590,9 +826,41 @@ fn append(mut dst: &mut dyn Write, header: &Header, mut data: &mut dyn Read) -> Ok(()) } +fn append_sparse( + dst: &mut dyn Write, + header: &Header, + sparse_entries: Option, + data: &mut R, +) -> io::Result<()> { + dst.write_all(header.as_bytes())?; + + if let Some(sparse_entries) = sparse_entries { + append_extended_sparse_headers(dst, &sparse_entries)?; + for entry in sparse_entries.entries { + data.seek(io::SeekFrom::Start(entry.offset))?; + io::copy(&mut data.take(entry.num_bytes), dst)?; + } + pad_zeroes(dst, sparse_entries.on_disk_size)?; + } else { + let len = io::copy(data, dst)?; + pad_zeroes(dst, len)?; + } + Ok(()) +} + +#[inline] +fn calc_pad_zeros(len: u64) -> u64 { + let r = len % BLOCK_SIZE; + if r == 0 { + 0 + } else { + BLOCK_SIZE - r + } +} + fn pad_zeroes(dst: &mut dyn Write, len: u64) -> io::Result<()> { let buf = [0; BLOCK_SIZE as usize]; - let remaining = BLOCK_SIZE - (len % BLOCK_SIZE); + let remaining = calc_pad_zeros(len); if remaining < BLOCK_SIZE { dst.write_all(&buf[..remaining as usize])?; } @@ -807,6 +1075,28 @@ fn prepare_header_sparse( _ => return Ok(None), }; + println!("entries = {entries:#?}"); + + prepare_header_sparse_from_entries(header, &entries); + + Ok(Some(entries)) +} + +fn prepare_header_sparse_generic( + data: &mut R, + header: &mut Header, +) -> io::Result> { + let entries = match find_sparse_entries_seek(data)? { + Some(entries) => entries, + _ => return Ok(None), + }; + + prepare_header_sparse_from_entries(header, &entries); + + Ok(Some(entries)) +} + +fn prepare_header_sparse_from_entries(header: &mut Header, entries: &SparseEntries) { header.set_entry_type(EntryType::GNUSparse); header.set_size(entries.on_disk_size); @@ -822,8 +1112,6 @@ fn prepare_header_sparse( header_entry.set_length(entry.num_bytes); } gnu_header.set_is_extended(entries.entries.len() > gnu_header.sparse.len()); - - Ok(Some(entries)) } /// Write extra sparse headers into `dst` for those entries that did not fit in the main header. @@ -924,7 +1212,9 @@ impl SparseEntries { #[derive(Debug, Copy, Clone, PartialEq, Eq)] struct SparseEntry { + /// offset on seekable data offset: u64, + /// length on seekable data num_bytes: u64, } @@ -946,39 +1236,23 @@ fn find_sparse_entries( } #[cfg(any(target_os = "android", target_os = "freebsd", target_os = "linux"))] - find_sparse_entries_seek(file, stat) + find_sparse_entries_seek(&mut FileWithMetadata { file, stat }) } -/// Implementation of `find_sparse_entries` using `SEEK_HOLE` and `SEEK_DATA`. -#[cfg(any(target_os = "android", target_os = "freebsd", target_os = "linux"))] -fn find_sparse_entries_seek( - file: &mut fs::File, - stat: &fs::Metadata, +fn find_sparse_entries_seek( + data: &mut R, ) -> io::Result> { - use std::os::unix::fs::MetadataExt as _; - use std::os::unix::io::AsRawFd as _; + let data_size = data.logical_size(); - fn lseek(file: &fs::File, offset: i64, whence: libc::c_int) -> Result { - #[cfg(any(target_os = "linux", target_os = "android"))] - let lseek = libc::lseek64; - #[cfg(not(any(target_os = "linux", target_os = "android")))] - let lseek = libc::lseek; - - match unsafe { lseek(file.as_raw_fd(), offset, whence) } { - -1 => Err(io::Error::last_os_error().raw_os_error().unwrap()), - off => Ok(off), - } - } - - if stat.blocks() == 0 { - return Ok(if stat.size() == 0 { + if data.data_size() == 0 { + return Ok(if data.logical_size() == 0 { // Empty file. None } else { // Fully sparse file. Some(SparseEntries { entries: vec![SparseEntry { - offset: stat.size(), + offset: data.logical_size(), num_bytes: 0, }], on_disk_size: 0, @@ -986,122 +1260,153 @@ fn find_sparse_entries_seek( }); } - // On most Unixes, we need to read `_PC_MIN_HOLE_SIZE` to see if the file - // system supports `SEEK_HOLE`. - // FreeBSD: https://man.freebsd.org/cgi/man.cgi?query=lseek&sektion=2&manpath=FreeBSD+14.1-STABLE - #[cfg(not(any(target_os = "linux", target_os = "android")))] - if unsafe { libc::fpathconf(file.as_raw_fd(), libc::_PC_MIN_HOLE_SIZE) } == -1 { - return Ok(None); - } - - // Linux is the only UNIX-like without support for `_PC_MIN_HOLE_SIZE`, so - // instead we try to call `lseek` and see if it fails. - #[cfg(any(target_os = "linux", target_os = "android"))] - match lseek(file, 0, libc::SEEK_HOLE) { + // preliminary check (we have data since non-zero size, so the next hole should be findable) + match data.seek_sparse(SeekFromSparse::NextHole(0)) { Ok(_) => (), - Err(libc::ENXIO) => { - // The file is empty. Treat it as non-sparse. + Err(err) if err.kind() == io::ErrorKind::UnexpectedEof => { + // The data is empty (or the data is not sparse-compatible). Either way, treat it as non-sparse. return Ok(None); } - Err(_) => return Ok(None), + Err(err) => { + return Err(err); + } } - let mut entries = Vec::new(); - let mut on_disk_size = 0; - let mut off_s = 0; + // if a hole is smaller than BLOCK_SIZE, then we need to treat that just as zeros. We must also ensure that all + // SparseEntry items are aligned to BLOCK_SIZE. + + let mut entries: Vec = Vec::new(); + let mut on_disk_size: u64 = 0; + let mut next_hole_start_offset: u64 = 0; + let mut current_segment: Option = None; loop { + // off_s = next_hole_start_offset + // // off_s=0 │ off_s │ off_s // ↓ │ ↓ │ ↓ // | DATA |… │ ……………| HOLE | DATA |… │ …|×EOF× // ↑ │ ↑ ↑ │ // (a) │ (b) (c) (d) │ (e) - match lseek(file, off_s, libc::SEEK_DATA) { - Ok(0) if off_s == 0 => (), // (a) The file starts with data. - Ok(off) if off < off_s => { - // (b) Unlikely. - return Err(std::io::Error::new( - io::ErrorKind::Other, - "lseek(SEEK_DATA) went backwards", - )); - } - Ok(off) if off == off_s => { - // (c) The data at the same offset as the hole. - return Err(std::io::Error::new( - io::ErrorKind::Other, - "lseek(SEEK_DATA) did not advance. \ + let data_start_offset = + match data.seek_sparse(SeekFromSparse::NextData(next_hole_start_offset)) { + Ok(offset) if offset == 0 => offset, // (a) The file starts with data (pointing at the start of the data). + Ok(offset) if offset < next_hole_start_offset => { + // (b) Unlikely. + return Err(io::Error::new( + io::ErrorKind::Other, + "seek data went backwards", + )); + } + Ok(offset) if offset == next_hole_start_offset => { + // (c) The data at the same offset as the hole. + return Err(io::Error::new( + io::ErrorKind::Other, + "seek data did not advance. \ Did the file change while appending?", - )); + )); + } + Ok(offset) => offset, // (d) This is now pointing at the start of the next data. + Err(error) if error.kind() == io::ErrorKind::UnexpectedEof => break, // (e) Reached the end of the file. + Err(error) => return Err(error), + }; + + // check if we can save our current segment (can we pad our previous data with our most recent hole?) + /*if let Some(cur) = &mut current_segment { + let prev_hole_length = data_start_offset - next_hole_start_offset; + if calc_pad_zeros(cur.num_bytes) <= prev_hole_length { + cur.num_bytes += calc_pad_zeros(cur.num_bytes); + + on_disk_size += cur.num_bytes; + entries.push(*cur); + + current_segment = None; + } else { + // the hole is too small to actually be represented as sparse (just use zeros) + cur.num_bytes += prev_hole_length; } - Ok(off) => off_s = off, // (d) Jump to the next hole. - Err(libc::ENXIO) => break, // (e) Reached the end of the file. - Err(errno) => return Err(io::Error::from_raw_os_error(errno)), - }; + }*/ + // off_s = data_offset + // // off_s=0 │ off_s │ off_s // ↓ │ ↓ │ ↓ // | DATA |×EOF× │ ……………| DATA | HOLE |… │ …|×EOF× // ↑ │ ↑ ↑ │ // (a) │ (b) (c) (d) │ (e) - match lseek(file, off_s, libc::SEEK_HOLE) { - Ok(off_e) if off_s == 0 && (off_e as u64) == stat.size() => { + let hole_offset = match data.seek_sparse(SeekFromSparse::NextHole(data_start_offset)) + { + Ok(offset) if data_start_offset == 0 && offset == data_size => { // (a) The file is not sparse. - file.seek(io::SeekFrom::Start(0))?; + data.seek(io::SeekFrom::Start(0))?; return Ok(None); } - Ok(off_e) if off_e < off_s => { + Ok(offset) if offset < next_hole_start_offset => { // (b) Unlikely. - return Err(std::io::Error::new( + return Err(io::Error::new( io::ErrorKind::Other, - "lseek(SEEK_HOLE) went backwards", + "seek hole went backwards", )); } - Ok(off_e) if off_e == off_s => { + Ok(offset) if offset == data_start_offset => { // (c) The hole at the same offset as the data. - return Err(std::io::Error::new( + return Err(io::Error::new( io::ErrorKind::Other, - "lseek(SEEK_HOLE) did not advance. \ - Did the file change while appending?", + "seek hole did not advance. \ + Did the data change while appending?", )); } - Ok(off_e) => { + Ok(offset) => { // (d) Found a hole or reached the end of the file (implicit // zero-length hole). - entries.push(SparseEntry { - offset: off_s as u64, - num_bytes: off_e as u64 - off_s as u64, - }); - on_disk_size += off_e as u64 - off_s as u64; - off_s = off_e; + offset } - Err(libc::ENXIO) => { + Err(error) if error.kind() == io::ErrorKind::UnexpectedEof => { // (e) off_s was already beyond the end of the file. - return Err(std::io::Error::new( + return Err(io::Error::new( io::ErrorKind::Other, - "lseek(SEEK_HOLE) returned ENXIO. \ - Did the file change while appending?", + "seek hole returned UnexpectedEof. \ + Did the data change while appending?", )); } - Err(errno) => return Err(io::Error::from_raw_os_error(errno)), + Err(error) => return Err(error), + }; + + let data_len = hole_offset - data_start_offset; + + let cur = if let Some(mut cur) = current_segment { + cur.num_bytes += data_len; + cur + } else { + SparseEntry { offset: data_start_offset, num_bytes: data_len } }; + + // check if we are naturally block-aligned (likely file-based data) + if cur.num_bytes % BLOCK_SIZE == 0 { + on_disk_size += cur.num_bytes; + entries.push(cur); + + current_segment = None; + } else { + current_segment = Some(cur); + } + + next_hole_start_offset = hole_offset; } - if off_s as u64 > stat.size() { - return Err(std::io::Error::new( - io::ErrorKind::Other, - "lseek(SEEK_DATA) went beyond the end of the file. \ - Did the file change while appending?", - )); + if let Some(cur) = current_segment { + on_disk_size += cur.num_bytes; + entries.push(cur); } // Add a final zero-length entry. It is required if the file ends with a // hole, and redundant otherwise. However, we add it unconditionally to // mimic GNU tar behavior. entries.push(SparseEntry { - offset: stat.size(), + offset: data_size, num_bytes: 0, }); - file.seek(io::SeekFrom::Start(0))?; + data.seek(io::SeekFrom::Start(0))?; Ok(Some(SparseEntries { entries, diff --git a/src/lib.rs b/src/lib.rs index db890001..de71bab6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,7 +24,7 @@ use std::io::{Error, ErrorKind}; pub use crate::archive::{Archive, Entries}; -pub use crate::builder::{Builder, EntryWriter}; +pub use crate::builder::{Builder, EntryWriter, SeekFromSparse, SeekSparse}; pub use crate::entry::{Entry, Unpacked}; pub use crate::entry_type::EntryType; pub use crate::header::GnuExtSparseHeader; diff --git a/tests/all.rs b/tests/all.rs index 8f25021d..52c86364 100644 --- a/tests/all.rs +++ b/tests/all.rs @@ -11,7 +11,9 @@ use std::iter::repeat; use std::path::{Path, PathBuf}; use filetime::FileTime; -use tar::{Archive, Builder, Entries, Entry, EntryType, Header, HeaderMode}; +use tar::{ + Archive, Builder, Entries, Entry, EntryType, Header, HeaderMode, SeekFromSparse, SeekSparse, +}; use tempfile::{Builder as TempBuilder, TempDir}; macro_rules! tar { @@ -1377,7 +1379,296 @@ fn writing_sparse() { let expected = fs::read_to_string(&path).unwrap(); - assert!(s == expected, "path: {path:?}"); + if s != expected { + println!("{s}\nVS\n{expected}"); + } + + assert!(s == expected, "path: {path:?}, actual len = {}, expected len = {}", s.len(), expected.len()); + } + + assert!(entries.next().is_none()); +} + +#[derive(Clone, Debug)] +struct SparseSegments<'a> { + segments: &'a [SparseSegment], + logical_size: u64, + data_size: u64, + pos: u64, +} + +#[derive(Clone, Debug)] +struct SparseSegment { + offset: u64, + data: Vec, +} + +impl SparseSegment { + fn end(&self) -> u64 { + self.offset + self.data.len() as u64 + } +} + +impl<'a> SparseSegments<'a> { + // assuming that chunks are in order (this is fine for testing) + fn new(segments: &'a [SparseSegment]) -> SparseSegments<'a> { + let mut logical_size: u64 = 0; + let mut data_size: u64 = 0; + + for segment in segments { + logical_size = logical_size.max(segment.end()); + data_size += segment.data.len() as u64; + } + + SparseSegments { + segments, + logical_size, + data_size, + pos: 0, + } + } + + fn segment_containing(&self, position: u64) -> Option<&SparseSegment> { + self.segments + .iter() + .find(|segment| position >= segment.offset && position < segment.end()) + } + + fn next_segment_offset(&self, position: u64) -> u64 { + self.segments + .iter() + .find(|segment| position < segment.offset) + .map(|segment| segment.offset) + .unwrap_or(self.logical_size) + } +} + +impl<'a> Read for SparseSegments<'a> { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if self.pos >= self.logical_size { + return Ok(0); + } + + let mut written = 0usize; + + while written < buf.len() && self.pos < self.logical_size { + if let Some(segment) = self.segment_containing(self.pos) { + let seg_offset = (self.pos - segment.offset) as usize; + let available = segment.data.len() - seg_offset; + let to_copy = available.min(buf.len() - written); + buf[written..written + to_copy] + .copy_from_slice(&segment.data[seg_offset..seg_offset + to_copy]); + self.pos += to_copy as u64; + written += to_copy; + } else { + let next_offset = self.next_segment_offset(self.pos); + let hole_len = (next_offset - self.pos) as usize; + let to_fill = hole_len.min(buf.len() - written); + buf[written..written + to_fill].fill(0); + self.pos += to_fill as u64; + written += to_fill; + } + } + + Ok(written) + } +} + +impl<'a> Seek for SparseSegments<'a> { + fn seek(&mut self, pos: io::SeekFrom) -> io::Result { + let logical = self.logical_size as i128; + + let target = match pos { + io::SeekFrom::Start(offset) => offset as i128, + io::SeekFrom::Current(offset) => self.pos as i128 + offset as i128, + io::SeekFrom::End(offset) => logical + offset as i128, + }; + + if target < 0 || target > logical { + return Err(io::Error::new(io::ErrorKind::InvalidInput, "invalid seek")); + } + + self.pos = target as u64; + Ok(self.pos) + } +} + +impl<'a> SeekSparse for SparseSegments<'a> { + fn seek_sparse(&mut self, pos: SeekFromSparse) -> io::Result { + match pos { + SeekFromSparse::NextData(offset) => { + for segment in self.segments { + if segment.data.is_empty() { + // then this is actually just used to create a hole + // and does NOT count as data + continue; + } + + if offset < segment.offset { + return Ok(segment.offset); + } + if offset < segment.end() { + return Ok(offset); + } + } + Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "no more data segments", + )) + } + SeekFromSparse::NextHole(offset) => { + if offset > self.logical_size { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "offset beyond logical size", + )); + } + + if offset == self.logical_size { + return Ok(offset); + } + + for segment in self.segments { + if segment.data.is_empty() { + // then this is actually just used to create a hole + continue; + } + + if offset < segment.offset { + return Ok(offset); + } + if offset < segment.end() { + return Ok(segment.end()); + } + } + + Ok(offset) + } + } + } + + fn logical_size(&self) -> u64 { + self.logical_size + } + + fn data_size(&self) -> u64 { + self.data_size + } +} + +#[test] +fn writing_sparse_data() { + let mut ar = Builder::new(Vec::new()); + + fn expected_bytes(segments: &[SparseSegment]) -> Vec { + let logical_size = segments + .iter() + .map(|segment| segment.end()) + .max() + .unwrap_or(0); + + let mut bytes = vec![0u8; logical_size as usize]; + for segment in segments { + let start = segment.offset as usize; + let end = segment.end() as usize; + bytes[start..end].copy_from_slice(&segment.data); + } + + bytes + } + + let cases: Vec<(&str, Vec)> = vec![ + ("empty", vec![]), + ("full_sparse", vec![SparseSegment { + offset: 0x20_000, + data: Vec::new(), + }]), + ("_x", vec![SparseSegment { + offset: 0x20_000, + data: vec![b'a'; 0x1_000], + }]), + ("x_", vec![ + SparseSegment { + offset: 0, + data: vec![b'b'; 0x1_000], + }, + SparseSegment { + offset: 0x20_000, + data: Vec::new(), + }, + ]), + ("_x_x", vec![ + SparseSegment { + offset: 0x20_000, + data: vec![b'c'; 0x1_000], + }, + SparseSegment { + offset: 0x40_000, + data: vec![b'd'; 0x1_000], + }, + ]), + ("x_x_", vec![ + SparseSegment { + offset: 0, + data: vec![b'e'; 0x1_000], + }, + SparseSegment { + offset: 0x20_000, + data: vec![b'f'; 0x1_000], + }, + SparseSegment { + offset: 0x40_000, + data: Vec::new(), + }, + ]), + ("uneven", vec![ + SparseSegment { + offset: 0x20_333, + data: vec![b'u'; 0x555], + }, + SparseSegment { + offset: 0x40_777, + data: vec![b'v'; 0x999], + }, + ]), + ]; + + let mut expected = Vec::new(); + + for (name, segments) in &cases { + let expected_bytes = expected_bytes(segments); + let data = SparseSegments::new(segments); + + let mut header = Header::new_gnu(); + header.set_mode(0o644); + + ar.append_sparse_data(&mut header, name, data).expect(name); + expected.push(((*name).to_string(), expected_bytes)); + } + + ar.finish().unwrap(); + + let data = ar.into_inner().unwrap(); + + #[cfg(target_os = "linux")] + assert!(data.len() <= 37 * 1024); + #[cfg(target_os = "freebsd")] + assert!(data.len() <= 273 * 1024); + + let mut ar = Archive::new(&data[..]); + let mut entries = ar.entries().unwrap(); + + for (expected_name, expected_contents) in expected { + let mut entry = entries.next().unwrap().expect(&expected_name); + assert_eq!( + &*entry.header().path_bytes(), + expected_name.as_bytes(), + "path mismatch", + ); + + let mut contents = Vec::new(); + entry.read_to_end(&mut contents).unwrap(); + assert!(contents == expected_contents, "path: {expected_name}, actual len = {}, expected len = {}", contents.len(), expected_contents.len()); } assert!(entries.next().is_none()); From 9d665f0078aa34fe4c8607140985f3f7b04ab66d Mon Sep 17 00:00:00 2001 From: jebradbury39 Date: Fri, 7 Nov 2025 10:20:44 -0800 Subject: [PATCH 2/8] tests passing Signed-off-by: jebradbury39 --- src/archive.rs | 9 +-- src/builder.rs | 14 ++--- tests/all.rs | 153 +++++++++++++++++++++++++++++-------------------- 3 files changed, 100 insertions(+), 76 deletions(-) diff --git a/src/archive.rs b/src/archive.rs index 11b7af00..3e698e9c 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -482,11 +482,10 @@ impl<'a> EntriesFields<'a> { let off = block.offset()?; let len = block.length()?; if len != 0 && (size - remaining) % BLOCK_SIZE != 0 { - panic!("len={len}, size={size}, rem={remaining}"); - /*return Err(other( + return Err(other( "previous block in sparse file was not \ aligned to 512-byte boundary", - ));*/ + )); } else if off < cur { return Err(other( "out of order or overlapping sparse \ @@ -500,10 +499,6 @@ impl<'a> EntriesFields<'a> { .checked_add(len) .ok_or_else(|| other("more bytes listed in sparse file than u64 can hold"))?; - if len > remaining { - panic!("size={}, len={len}, remaining={remaining}", entry.size); - } - remaining = remaining.checked_sub(len).ok_or_else(|| { other( "sparse file consumed more data than the header \ diff --git a/src/builder.rs b/src/builder.rs index 04748952..e8a5412d 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -1075,8 +1075,6 @@ fn prepare_header_sparse( _ => return Ok(None), }; - println!("entries = {entries:#?}"); - prepare_header_sparse_from_entries(header, &entries); Ok(Some(entries)) @@ -1311,7 +1309,7 @@ fn find_sparse_entries_seek( }; // check if we can save our current segment (can we pad our previous data with our most recent hole?) - /*if let Some(cur) = &mut current_segment { + if let Some(cur) = &mut current_segment { let prev_hole_length = data_start_offset - next_hole_start_offset; if calc_pad_zeros(cur.num_bytes) <= prev_hole_length { cur.num_bytes += calc_pad_zeros(cur.num_bytes); @@ -1324,7 +1322,7 @@ fn find_sparse_entries_seek( // the hole is too small to actually be represented as sparse (just use zeros) cur.num_bytes += prev_hole_length; } - }*/ + } // off_s = data_offset // @@ -1333,8 +1331,7 @@ fn find_sparse_entries_seek( // | DATA |×EOF× │ ……………| DATA | HOLE |… │ …|×EOF× // ↑ │ ↑ ↑ │ // (a) │ (b) (c) (d) │ (e) - let hole_offset = match data.seek_sparse(SeekFromSparse::NextHole(data_start_offset)) - { + let hole_offset = match data.seek_sparse(SeekFromSparse::NextHole(data_start_offset)) { Ok(offset) if data_start_offset == 0 && offset == data_size => { // (a) The file is not sparse. data.seek(io::SeekFrom::Start(0))?; @@ -1377,7 +1374,10 @@ fn find_sparse_entries_seek( cur.num_bytes += data_len; cur } else { - SparseEntry { offset: data_start_offset, num_bytes: data_len } + SparseEntry { + offset: data_start_offset, + num_bytes: data_len, + } }; // check if we are naturally block-aligned (likely file-based data) diff --git a/tests/all.rs b/tests/all.rs index 52c86364..f76f5f81 100644 --- a/tests/all.rs +++ b/tests/all.rs @@ -1380,10 +1380,15 @@ fn writing_sparse() { let expected = fs::read_to_string(&path).unwrap(); if s != expected { - println!("{s}\nVS\n{expected}"); + println!("{s}\nVS\n{expected}"); } - assert!(s == expected, "path: {path:?}, actual len = {}, expected len = {}", s.len(), expected.len()); + assert!( + s == expected, + "path: {path:?}, actual len = {}, expected len = {}", + s.len(), + expected.len() + ); } assert!(entries.next().is_none()); @@ -1404,6 +1409,15 @@ struct SparseSegment { } impl SparseSegment { + #[allow(clippy::option_map_unit_fn)] + fn new(offset: u64, len: usize, fill_byte: u8) -> Self { + let mut data = vec![fill_byte; len]; + data.first_mut().map(|x| *x = b'['); + data.last_mut().map(|x| *x = b']'); + + Self { offset, data } + } + fn end(&self) -> u64 { self.offset + self.data.len() as u64 } @@ -1476,19 +1490,50 @@ impl<'a> Read for SparseSegments<'a> { impl<'a> Seek for SparseSegments<'a> { fn seek(&mut self, pos: io::SeekFrom) -> io::Result { - let logical = self.logical_size as i128; - let target = match pos { - io::SeekFrom::Start(offset) => offset as i128, - io::SeekFrom::Current(offset) => self.pos as i128 + offset as i128, - io::SeekFrom::End(offset) => logical + offset as i128, + io::SeekFrom::Start(offset) => offset, + io::SeekFrom::Current(offset) => { + if offset >= 0 { + self.pos.checked_add(offset as u64).ok_or_else(|| { + io::Error::new(io::ErrorKind::InvalidInput, "invalid seek past u64 max") + })? + } else { + let abs = (-offset) as u64; + self.pos.checked_sub(abs).ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + "invalid seek before start of sparse segments", + ) + })? + } + } + io::SeekFrom::End(offset) => { + if offset >= 0 { + self.logical_size + .checked_add(offset as u64) + .ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + "invalid seek past end of sparse segments", + ) + })? + } else { + let abs = (-offset) as u64; + self.logical_size.checked_sub(abs).ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + "invalid seek before start of sparse segments", + ) + })? + } + } }; - if target < 0 || target > logical { + if target > self.logical_size { return Err(io::Error::new(io::ErrorKind::InvalidInput, "invalid seek")); } - self.pos = target as u64; + self.pos = target; Ok(self.pos) } } @@ -1579,58 +1624,37 @@ fn writing_sparse_data() { let cases: Vec<(&str, Vec)> = vec![ ("empty", vec![]), - ("full_sparse", vec![SparseSegment { - offset: 0x20_000, - data: Vec::new(), - }]), - ("_x", vec![SparseSegment { - offset: 0x20_000, - data: vec![b'a'; 0x1_000], - }]), - ("x_", vec![ - SparseSegment { - offset: 0, - data: vec![b'b'; 0x1_000], - }, - SparseSegment { - offset: 0x20_000, - data: Vec::new(), - }, - ]), - ("_x_x", vec![ - SparseSegment { - offset: 0x20_000, - data: vec![b'c'; 0x1_000], - }, - SparseSegment { - offset: 0x40_000, - data: vec![b'd'; 0x1_000], - }, - ]), - ("x_x_", vec![ - SparseSegment { - offset: 0, - data: vec![b'e'; 0x1_000], - }, - SparseSegment { - offset: 0x20_000, - data: vec![b'f'; 0x1_000], - }, - SparseSegment { - offset: 0x40_000, - data: Vec::new(), - }, - ]), - ("uneven", vec![ - SparseSegment { - offset: 0x20_333, - data: vec![b'u'; 0x555], - }, - SparseSegment { - offset: 0x40_777, - data: vec![b'v'; 0x999], - }, - ]), + ("full_sparse", vec![SparseSegment::new(0x20_000, 0, b'a')]), + ("_x", vec![SparseSegment::new(0x20_000, 0x1_000, b'a')]), + ( + "x_", + vec![ + SparseSegment::new(0, 0x1_000, b'b'), + SparseSegment::new(0x20_000, 0, b'b'), + ], + ), + ( + "_x_x", + vec![ + SparseSegment::new(0x20_000, 0x1_000, b'c'), + SparseSegment::new(0x40_000, 0x1_000, b'd'), + ], + ), + ( + "x_x_", + vec![ + SparseSegment::new(0, 0x1_000, b'e'), + SparseSegment::new(0x20_000, 0x1_000, b'f'), + SparseSegment::new(0x40_000, 0, b'f'), + ], + ), + ( + "uneven", + vec![ + SparseSegment::new(0x20_333, 0x555, b'u'), + SparseSegment::new(0x40_777, 0x999, b'v'), + ], + ), ]; let mut expected = Vec::new(); @@ -1668,7 +1692,12 @@ fn writing_sparse_data() { let mut contents = Vec::new(); entry.read_to_end(&mut contents).unwrap(); - assert!(contents == expected_contents, "path: {expected_name}, actual len = {}, expected len = {}", contents.len(), expected_contents.len()); + assert!( + contents == expected_contents, + "path: {expected_name}, actual len = {}, expected len = {}", + contents.len(), + expected_contents.len() + ); } assert!(entries.next().is_none()); From 621fa91b7181519137038e92fd65569df9c828b8 Mon Sep 17 00:00:00 2001 From: jebradbury39 Date: Fri, 7 Nov 2025 10:30:03 -0800 Subject: [PATCH 3/8] fix tests Signed-off-by: jebradbury39 --- tests/all.rs | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/tests/all.rs b/tests/all.rs index f76f5f81..ac8d78c5 100644 --- a/tests/all.rs +++ b/tests/all.rs @@ -1605,23 +1605,6 @@ impl<'a> SeekSparse for SparseSegments<'a> { fn writing_sparse_data() { let mut ar = Builder::new(Vec::new()); - fn expected_bytes(segments: &[SparseSegment]) -> Vec { - let logical_size = segments - .iter() - .map(|segment| segment.end()) - .max() - .unwrap_or(0); - - let mut bytes = vec![0u8; logical_size as usize]; - for segment in segments { - let start = segment.offset as usize; - let end = segment.end() as usize; - bytes[start..end].copy_from_slice(&segment.data); - } - - bytes - } - let cases: Vec<(&str, Vec)> = vec![ ("empty", vec![]), ("full_sparse", vec![SparseSegment::new(0x20_000, 0, b'a')]), @@ -1660,14 +1643,16 @@ fn writing_sparse_data() { let mut expected = Vec::new(); for (name, segments) in &cases { - let expected_bytes = expected_bytes(segments); - let data = SparseSegments::new(segments); + let mut data = SparseSegments::new(segments); + let mut expected_bytes = Vec::new(); + data.read_to_end(&mut expected_bytes).unwrap(); + data.seek(io::SeekFrom::Start(0)).unwrap(); let mut header = Header::new_gnu(); header.set_mode(0o644); ar.append_sparse_data(&mut header, name, data).expect(name); - expected.push(((*name).to_string(), expected_bytes)); + expected.push((*name, expected_bytes)); } ar.finish().unwrap(); @@ -1683,7 +1668,7 @@ fn writing_sparse_data() { let mut entries = ar.entries().unwrap(); for (expected_name, expected_contents) in expected { - let mut entry = entries.next().unwrap().expect(&expected_name); + let mut entry = entries.next().unwrap().expect(expected_name); assert_eq!( &*entry.header().path_bytes(), expected_name.as_bytes(), From 2c9b7344ce7c3a800f8ebf4d343b3581608505ff Mon Sep 17 00:00:00 2001 From: jebradbury39 Date: Fri, 7 Nov 2025 10:39:21 -0800 Subject: [PATCH 4/8] update docs Signed-off-by: jebradbury39 --- src/archive.rs | 1 - src/builder.rs | 217 ++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 189 insertions(+), 29 deletions(-) diff --git a/src/archive.rs b/src/archive.rs index 3e698e9c..36da7b6d 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -498,7 +498,6 @@ impl<'a> EntriesFields<'a> { cur = off .checked_add(len) .ok_or_else(|| other("more bytes listed in sparse file than u64 can hold"))?; - remaining = remaining.checked_sub(len).ok_or_else(|| { other( "sparse file consumed more data than the header \ diff --git a/src/builder.rs b/src/builder.rs index e8a5412d..41c2ca47 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -339,64 +339,225 @@ impl Builder { /// # Examples /// /// ``` - /// use std::io::{Cursor, Read, Seek, SeekFrom}; + /// use std::io::{self, Read, Seek, SeekFrom}; /// use tar::{Builder, Header, SeekFromSparse, SeekSparse}; /// - /// struct SparseCursor { - /// cursor: Cursor>, - /// hole_start: u64, - /// hole_len: u64, + /// #[derive(Clone, Debug)] + /// struct SparseSegments<'a> { + /// logical_size: u64, + /// segments: &'a [SparseSegment], + /// data_size: u64, + /// pos: u64, /// } /// - /// impl Read for SparseCursor { - /// fn read(&mut self, buf: &mut [u8]) -> std::io::Result { - /// self.cursor.read(buf) + /// #[derive(Clone, Debug)] + /// struct SparseSegment { + /// offset: u64, + /// data: Vec, + /// } + /// + /// impl SparseSegment { + /// #[allow(clippy::option_map_unit_fn)] + /// fn new(offset: u64, len: usize, fill_byte: u8) -> Self { + /// let mut data = vec![fill_byte; len]; + /// data.first_mut().map(|x| *x = b'['); + /// data.last_mut().map(|x| *x = b']'); + /// + /// Self { offset, data } + /// } + /// + /// fn end(&self) -> u64 { + /// self.offset + self.data.len() as u64 /// } /// } /// - /// impl Seek for SparseCursor { - /// fn seek(&mut self, pos: SeekFrom) -> std::io::Result { - /// self.cursor.seek(pos) + /// impl<'a> SparseSegments<'a> { + /// // assuming that chunks are in order (this is fine for testing) + /// fn new(segments: &'a [SparseSegment]) -> SparseSegments<'a> { + /// let mut logical_size: u64 = 0; + /// let mut data_size: u64 = 0; + /// + /// for segment in segments { + /// logical_size = logical_size.max(segment.end()); + /// data_size += segment.data.len() as u64; + /// } + /// + /// SparseSegments { + /// segments, + /// logical_size, + /// data_size, + /// pos: 0, + /// } + /// } + /// + /// fn segment_containing(&self, position: u64) -> Option<&SparseSegment> { + /// self.segments + /// .iter() + /// .find(|segment| position >= segment.offset && position < segment.end()) + /// } + /// + /// fn next_segment_offset(&self, position: u64) -> u64 { + /// self.segments + /// .iter() + /// .find(|segment| position < segment.offset) + /// .map(|segment| segment.offset) + /// .unwrap_or(self.logical_size) + /// } + /// } + /// + /// impl<'a> Read for SparseSegments<'a> { + /// fn read(&mut self, buf: &mut [u8]) -> io::Result { + /// if self.pos >= self.logical_size { + /// return Ok(0); + /// } + /// + /// let mut written = 0usize; + /// + /// while written < buf.len() && self.pos < self.logical_size { + /// if let Some(segment) = self.segment_containing(self.pos) { + /// let seg_offset = (self.pos - segment.offset) as usize; + /// let available = segment.data.len() - seg_offset; + /// let to_copy = available.min(buf.len() - written); + /// buf[written..written + to_copy] + /// .copy_from_slice(&segment.data[seg_offset..seg_offset + to_copy]); + /// self.pos += to_copy as u64; + /// written += to_copy; + /// } else { + /// let next_offset = self.next_segment_offset(self.pos); + /// let hole_len = (next_offset - self.pos) as usize; + /// let to_fill = hole_len.min(buf.len() - written); + /// buf[written..written + to_fill].fill(0); + /// self.pos += to_fill as u64; + /// written += to_fill; + /// } + /// } + /// + /// Ok(written) /// } /// } /// - /// impl SeekSparse for SparseCursor { - /// fn seek_sparse(&mut self, pos: SeekFromSparse) -> std::io::Result { + /// impl<'a> Seek for SparseSegments<'a> { + /// fn seek(&mut self, pos: io::SeekFrom) -> io::Result { + /// let target = match pos { + /// io::SeekFrom::Start(offset) => offset, + /// io::SeekFrom::Current(offset) => { + /// if offset >= 0 { + /// self.pos.checked_add(offset as u64).ok_or_else(|| { + /// io::Error::new(io::ErrorKind::InvalidInput, "invalid seek past u64 max") + /// })? + /// } else { + /// let abs = (-offset) as u64; + /// self.pos.checked_sub(abs).ok_or_else(|| { + /// io::Error::new( + /// io::ErrorKind::InvalidInput, + /// "invalid seek before start of sparse segments", + /// ) + /// })? + /// } + /// } + /// io::SeekFrom::End(offset) => { + /// if offset >= 0 { + /// self.logical_size + /// .checked_add(offset as u64) + /// .ok_or_else(|| { + /// io::Error::new( + /// io::ErrorKind::InvalidInput, + /// "invalid seek past end of sparse segments", + /// ) + /// })? + /// } else { + /// let abs = (-offset) as u64; + /// self.logical_size.checked_sub(abs).ok_or_else(|| { + /// io::Error::new( + /// io::ErrorKind::InvalidInput, + /// "invalid seek before start of sparse segments", + /// ) + /// })? + /// } + /// } + /// }; + /// + /// if target > self.logical_size { + /// return Err(io::Error::new(io::ErrorKind::InvalidInput, "invalid seek")); + /// } + /// + /// self.pos = target; + /// Ok(self.pos) + /// } + /// } + /// + /// impl<'a> SeekSparse for SparseSegments<'a> { + /// fn seek_sparse(&mut self, pos: SeekFromSparse) -> io::Result { /// match pos { /// SeekFromSparse::NextData(offset) => { - /// if offset < self.hole_start { - /// self.cursor.seek(SeekFrom::Start(offset)) - /// } else { - /// Err(std::io::ErrorKind::UnexpectedEof.into()) + /// for segment in self.segments { + /// if segment.data.is_empty() { + /// // then this is actually just used to create a hole + /// // and does NOT count as data + /// continue; + /// } + /// + /// if offset < segment.offset { + /// return Ok(segment.offset); + /// } + /// if offset < segment.end() { + /// return Ok(offset); + /// } /// } + /// Err(io::Error::new( + /// io::ErrorKind::UnexpectedEof, + /// "no more data segments", + /// )) /// } /// SeekFromSparse::NextHole(offset) => { - /// if offset <= self.hole_start { - /// self.cursor.seek(SeekFrom::Start(self.hole_start)) - /// } else { - /// Err(std::io::ErrorKind::UnexpectedEof.into()) + /// if offset > self.logical_size { + /// return Err(io::Error::new( + /// io::ErrorKind::UnexpectedEof, + /// "offset beyond logical size", + /// )); + /// } + /// + /// if offset == self.logical_size { + /// return Ok(offset); + /// } + /// + /// for segment in self.segments { + /// if segment.data.is_empty() { + /// // then this is actually just used to create a hole + /// continue; + /// } + /// + /// if offset < segment.offset { + /// return Ok(offset); + /// } + /// if offset < segment.end() { + /// return Ok(segment.end()); + /// } /// } + /// + /// Ok(offset) /// } /// } /// } /// /// fn logical_size(&self) -> u64 { - /// self.cursor.get_ref().len() as u64 + self.hole_len + /// self.logical_size /// } /// /// fn data_size(&self) -> u64 { - /// self.cursor.get_ref().len() as u64 + /// self.data_size /// } /// } /// /// let mut header = Header::new_gnu(); /// header.set_mode(0o644); /// - /// let data = SparseCursor { - /// cursor: Cursor::new(b"hello world".to_vec()), - /// hole_start: 11, - /// hole_len: 13, - /// }; + /// let segments = vec![ + /// SparseSegment::new(0, 10, b'a'), + /// SparseSegment::new(20, 5, b'b'), + /// ]; + /// + /// let data = SparseSegments::new(&segments); /// /// let mut ar = Builder::new(Vec::new()); /// ar.append_sparse_data(&mut header, "file.txt", data).unwrap(); From 3f4d61da53712ef2f00ba834df88a6cc80a5ad35 Mon Sep 17 00:00:00 2001 From: jebradbury39 Date: Fri, 7 Nov 2025 10:43:44 -0800 Subject: [PATCH 5/8] update Signed-off-by: jebradbury39 --- src/builder.rs | 21 ++++++++++++--------- tests/all.rs | 2 +- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/builder.rs b/src/builder.rs index 41c2ca47..c2348f9c 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -326,9 +326,12 @@ impl Builder { /// blocks (or when sparse handling is disabled), this method falls back to /// writing the entry as a regular file. /// - /// As with other append methods, the header's size must match the logical - /// size of the entry, and the checksum will be recalculated after the path - /// metadata is prepared. Call [`Self::finish`] once all entries have been + /// The header's size will be set to the logical size of the entry + /// (as provided by [`SeekSparse::logical_size`]), + /// and the checksum will be recalculated after the path + /// metadata is prepared. Other header attributes (like mode) should be set + /// prior to passing the header into this function. + /// Call [`Self::finish`] once all entries have been /// written to complete the archive. /// /// # Errors @@ -560,24 +563,24 @@ impl Builder { /// let data = SparseSegments::new(&segments); /// /// let mut ar = Builder::new(Vec::new()); - /// ar.append_sparse_data(&mut header, "file.txt", data).unwrap(); + /// ar.append_sparse_data(header, "file.txt", data).unwrap(); /// let _archive = ar.into_inner().unwrap(); /// ``` pub fn append_sparse_data, R: Read + Seek + SeekSparse>( &mut self, - header: &mut Header, + mut header: Header, path: P, mut data: R, ) -> io::Result<()> { if self.options.sparse { - prepare_header_path(self.get_mut(), header, path.as_ref())?; + prepare_header_path(self.get_mut(), &mut header, path.as_ref())?; header.set_size(data.logical_size()); - let sparse_entries = prepare_header_sparse_generic(&mut data, header)?; + let sparse_entries = prepare_header_sparse_generic(&mut data, &mut header)?; header.set_cksum(); - append_sparse(self.get_mut(), header, sparse_entries, &mut data) + append_sparse(self.get_mut(), &header, sparse_entries, &mut data) } else { - self.append_data(header, path, data) + self.append_data(&mut header, path, data) } } diff --git a/tests/all.rs b/tests/all.rs index ac8d78c5..fdc7fb94 100644 --- a/tests/all.rs +++ b/tests/all.rs @@ -1651,7 +1651,7 @@ fn writing_sparse_data() { let mut header = Header::new_gnu(); header.set_mode(0o644); - ar.append_sparse_data(&mut header, name, data).expect(name); + ar.append_sparse_data(header, name, data).expect(name); expected.push((*name, expected_bytes)); } From f101f1665a1d86553e5083e18f47c4129c083f8f Mon Sep 17 00:00:00 2001 From: jebradbury39 Date: Fri, 7 Nov 2025 10:50:21 -0800 Subject: [PATCH 6/8] update Signed-off-by: jebradbury39 --- src/builder.rs | 14 ++++++++++---- tests/all.rs | 4 ---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/builder.rs b/src/builder.rs index c2348f9c..9f3bde28 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -1024,10 +1024,8 @@ fn calc_pad_zeros(len: u64) -> u64 { fn pad_zeroes(dst: &mut dyn Write, len: u64) -> io::Result<()> { let buf = [0; BLOCK_SIZE as usize]; - let remaining = calc_pad_zeros(len); - if remaining < BLOCK_SIZE { - dst.write_all(&buf[..remaining as usize])?; - } + let num_zeros = calc_pad_zeros(len); + dst.write_all(&buf[..num_zeros as usize])?; Ok(()) } @@ -1557,6 +1555,14 @@ fn find_sparse_entries_seek( next_hole_start_offset = hole_offset; } + if next_hole_start_offset > data_size { + return Err(io::Error::new( + io::ErrorKind::Other, + "seek data went beyond the end of the file. \ + Did the file change while appending?", + )); + } + if let Some(cur) = current_segment { on_disk_size += cur.num_bytes; entries.push(cur); diff --git a/tests/all.rs b/tests/all.rs index fdc7fb94..7b1c4989 100644 --- a/tests/all.rs +++ b/tests/all.rs @@ -1379,10 +1379,6 @@ fn writing_sparse() { let expected = fs::read_to_string(&path).unwrap(); - if s != expected { - println!("{s}\nVS\n{expected}"); - } - assert!( s == expected, "path: {path:?}, actual len = {}, expected len = {}", From 820f81eaf15f35bace3cbe298978adc2c708c200 Mon Sep 17 00:00:00 2001 From: jebradbury39 Date: Fri, 7 Nov 2025 11:17:58 -0800 Subject: [PATCH 7/8] add cfg checks Signed-off-by: jebradbury39 --- src/builder.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/builder.rs b/src/builder.rs index 9f3bde28..1109e72b 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -66,11 +66,13 @@ pub trait SeekSparse { fn data_size(&self) -> u64; } +#[cfg(any(target_os = "android", target_os = "freebsd", target_os = "linux"))] struct FileWithMetadata<'a> { file: &'a mut fs::File, stat: &'a fs::Metadata, } +#[cfg(any(target_os = "android", target_os = "freebsd", target_os = "linux"))] impl<'a> Read for FileWithMetadata<'a> { #[inline] fn read(&mut self, buf: &mut [u8]) -> io::Result { @@ -78,6 +80,7 @@ impl<'a> Read for FileWithMetadata<'a> { } } +#[cfg(any(target_os = "android", target_os = "freebsd", target_os = "linux"))] impl<'a> Seek for FileWithMetadata<'a> { #[inline] fn seek(&mut self, pos: io::SeekFrom) -> io::Result { From 0387be7dce65fc067a4e9df1256265a8bec3c61c Mon Sep 17 00:00:00 2001 From: jebradbury39 Date: Tue, 16 Jun 2026 14:23:30 -0700 Subject: [PATCH 8/8] fix Signed-off-by: jebradbury39 --- src/builder.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/builder.rs b/src/builder.rs index d1c62819..901e0fa8 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -614,7 +614,8 @@ impl Builder { mut data: R, ) -> io::Result<()> { if self.options.sparse { - prepare_header_path(self.get_mut(), &mut header, path.as_ref())?; + let allow_absolute = self.options.preserve_absolute; + prepare_header_path(self.get_mut(), &mut header, path.as_ref(), allow_absolute)?; header.set_size(data.logical_size()); let sparse_entries = prepare_header_sparse_generic(&mut data, &mut header)?; header.set_cksum();