diff --git a/src/builder.rs b/src/builder.rs index d7e756b1..901e0fa8 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -28,6 +28,143 @@ struct BuilderOptions { sparse: bool, } +/// Describes sparse-aware seek targets used by [`SeekSparse::seek_sparse`]. +/// +/// These variants mirror the Linux-style `SEEK_DATA` / `SEEK_HOLE` semantics +/// so callers can iterate over alternating data and hole segments inside a +/// sparse object. The offsets refer to the logical size of the data source. +pub enum SeekFromSparse { + /// Adjust the seekable-object offset to the next location in the seekable-object + /// greater than or equal to offset containing data. If offset + /// points to data, then the seekable-object offset is set to offset. + /// + /// If there is no more data to be found, this will result in an `UnexpectedEof`. + NextData(u64), + /// Adjust the seekable-object offset to the next hole in the seekable-object greater + /// than or equal to offset. If offset points into the middle + /// of a hole, then the seekable-object offset is set to offset. If there + /// is no hole past offset, then the seekable-object offset is adjusted to + /// the end of the seekable-object (i.e., there is an implicit hole at the + /// end of any seekable-object). + /// + /// If the offset is beyond the end of the seekable-object, this will result in an `UnexpectedEof`. + NextHole(u64), +} + +/// A helper trait for data sources that can describe sparse regions. +/// +/// Types implementing this trait allow [`Builder`] to skip reading empty +/// segments when constructing sparse archive entries via +/// [`Builder::append_sparse_data`]. +pub trait SeekSparse { + /// Seeks to the offset, and returns the new actual offset of the seekable-object + fn seek_sparse(&mut self, pos: SeekFromSparse) -> std::io::Result; + + /// Returns the logical size (size of holes + data) + fn logical_size(&self) -> u64; + + /// Returns the size of just the data + fn data_size(&self) -> u64; +} + +#[cfg(any(target_os = "android", target_os = "freebsd", target_os = "linux"))] +struct FileWithMetadata<'a> { + file: &'a mut fs::File, + stat: &'a fs::Metadata, +} + +#[cfg(any(target_os = "android", target_os = "freebsd", target_os = "linux"))] +impl<'a> Read for FileWithMetadata<'a> { + #[inline] + fn read(&mut self, buf: &mut [u8]) -> io::Result { + self.file.read(buf) + } +} + +#[cfg(any(target_os = "android", target_os = "freebsd", target_os = "linux"))] +impl<'a> Seek for FileWithMetadata<'a> { + #[inline] + fn seek(&mut self, pos: io::SeekFrom) -> io::Result { + self.file.seek(pos) + } +} + +#[cfg(any(target_os = "android", target_os = "freebsd", target_os = "linux"))] +impl<'a> SeekSparse for FileWithMetadata<'a> { + fn seek_sparse(&mut self, pos: SeekFromSparse) -> std::io::Result { + use std::os::unix::io::AsRawFd as _; + + fn lseek(file: &fs::File, offset: i64, whence: libc::c_int) -> Result { + #[cfg(any(target_os = "linux", target_os = "android"))] + let lseek = libc::lseek64; + #[cfg(not(any(target_os = "linux", target_os = "android")))] + let lseek = libc::lseek; + + match unsafe { lseek(file.as_raw_fd(), offset, whence) } { + -1 => Err(io::Error::last_os_error().raw_os_error().unwrap()), + off => Ok(off), + } + } + + // On most Unixes, we need to read `_PC_MIN_HOLE_SIZE` to see if the file + // system supports `SEEK_HOLE`. + // FreeBSD: https://man.freebsd.org/cgi/man.cgi?query=lseek&sektion=2&manpath=FreeBSD+14.1-STABLE + #[cfg(not(any(target_os = "linux", target_os = "android")))] + if unsafe { libc::fpathconf(file.as_raw_fd(), libc::_PC_MIN_HOLE_SIZE) } == -1 { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "SEEK_HOLE is not supported for this filesystem", + )); + } + + let (off, whence) = match pos { + SeekFromSparse::NextData(offset) => { + let offset: i64 = offset.try_into().map_err(|_error| { + io::Error::new( + io::ErrorKind::InvalidInput, + "seek data offset exceeds i64 max", + ) + })?; + (offset, libc::SEEK_DATA) + } + SeekFromSparse::NextHole(offset) => { + let offset: i64 = offset.try_into().map_err(|_error| { + io::Error::new( + io::ErrorKind::InvalidInput, + "seek hole offset exceeds i64 max", + ) + })?; + (offset, libc::SEEK_HOLE) + } + }; + + match lseek(self.file, off, whence) { + Ok(off) => off.try_into().map_err(|_error| { + io::Error::new( + io::ErrorKind::InvalidInput, + "seek sparse result offset is negative", + ) + }), + Err(libc::ENXIO) => Err(io::Error::new(io::ErrorKind::UnexpectedEof, "hit ENXIO")), + Err(err) => Err(io::Error::from_raw_os_error(err)), + } + } + + #[inline] + fn logical_size(&self) -> u64 { + use std::os::unix::fs::MetadataExt; + + self.stat.size() + } + + #[inline] + fn data_size(&self) -> u64 { + use std::os::unix::fs::MetadataExt; + + self.stat.blocks() + } +} + impl Builder { /// Create a new archive builder with the underlying object as the /// destination of all data written. The builder will use @@ -222,6 +359,273 @@ impl Builder { self.append(header, data) } + /// Adds a new sparse entry to this archive with the specified path. + /// + /// This function behaves like [`Self::append_data`], but it leverages the + /// [`SeekSparse`] implementation on the provided reader to avoid copying + /// data that resides in sparse regions. When the reader reports no sparse + /// blocks (or when sparse handling is disabled), this method falls back to + /// writing the entry as a regular file. + /// + /// The header's size will be set to the logical size of the entry + /// (as provided by [`SeekSparse::logical_size`]), + /// and the checksum will be recalculated after the path + /// metadata is prepared. Other header attributes (like mode) should be set + /// prior to passing the header into this function. + /// Call [`Self::finish`] once all entries have been + /// written to complete the archive. + /// + /// # Errors + /// + /// Returns any I/O error encountered while preparing headers or copying + /// data from the sparse reader into the underlying writer. + /// + /// # Examples + /// + /// ``` + /// use std::io::{self, Read, Seek, SeekFrom}; + /// use tar::{Builder, Header, SeekFromSparse, SeekSparse}; + /// + /// #[derive(Clone, Debug)] + /// struct SparseSegments<'a> { + /// logical_size: u64, + /// segments: &'a [SparseSegment], + /// data_size: u64, + /// pos: u64, + /// } + /// + /// #[derive(Clone, Debug)] + /// struct SparseSegment { + /// offset: u64, + /// data: Vec, + /// } + /// + /// impl SparseSegment { + /// #[allow(clippy::option_map_unit_fn)] + /// fn new(offset: u64, len: usize, fill_byte: u8) -> Self { + /// let mut data = vec![fill_byte; len]; + /// data.first_mut().map(|x| *x = b'['); + /// data.last_mut().map(|x| *x = b']'); + /// + /// Self { offset, data } + /// } + /// + /// fn end(&self) -> u64 { + /// self.offset + self.data.len() as u64 + /// } + /// } + /// + /// impl<'a> SparseSegments<'a> { + /// // assuming that chunks are in order (this is fine for testing) + /// fn new(segments: &'a [SparseSegment]) -> SparseSegments<'a> { + /// let mut logical_size: u64 = 0; + /// let mut data_size: u64 = 0; + /// + /// for segment in segments { + /// logical_size = logical_size.max(segment.end()); + /// data_size += segment.data.len() as u64; + /// } + /// + /// SparseSegments { + /// segments, + /// logical_size, + /// data_size, + /// pos: 0, + /// } + /// } + /// + /// fn segment_containing(&self, position: u64) -> Option<&SparseSegment> { + /// self.segments + /// .iter() + /// .find(|segment| position >= segment.offset && position < segment.end()) + /// } + /// + /// fn next_segment_offset(&self, position: u64) -> u64 { + /// self.segments + /// .iter() + /// .find(|segment| position < segment.offset) + /// .map(|segment| segment.offset) + /// .unwrap_or(self.logical_size) + /// } + /// } + /// + /// impl<'a> Read for SparseSegments<'a> { + /// fn read(&mut self, buf: &mut [u8]) -> io::Result { + /// if self.pos >= self.logical_size { + /// return Ok(0); + /// } + /// + /// let mut written = 0usize; + /// + /// while written < buf.len() && self.pos < self.logical_size { + /// if let Some(segment) = self.segment_containing(self.pos) { + /// let seg_offset = (self.pos - segment.offset) as usize; + /// let available = segment.data.len() - seg_offset; + /// let to_copy = available.min(buf.len() - written); + /// buf[written..written + to_copy] + /// .copy_from_slice(&segment.data[seg_offset..seg_offset + to_copy]); + /// self.pos += to_copy as u64; + /// written += to_copy; + /// } else { + /// let next_offset = self.next_segment_offset(self.pos); + /// let hole_len = (next_offset - self.pos) as usize; + /// let to_fill = hole_len.min(buf.len() - written); + /// buf[written..written + to_fill].fill(0); + /// self.pos += to_fill as u64; + /// written += to_fill; + /// } + /// } + /// + /// Ok(written) + /// } + /// } + /// + /// impl<'a> Seek for SparseSegments<'a> { + /// fn seek(&mut self, pos: io::SeekFrom) -> io::Result { + /// let target = match pos { + /// io::SeekFrom::Start(offset) => offset, + /// io::SeekFrom::Current(offset) => { + /// if offset >= 0 { + /// self.pos.checked_add(offset as u64).ok_or_else(|| { + /// io::Error::new(io::ErrorKind::InvalidInput, "invalid seek past u64 max") + /// })? + /// } else { + /// let abs = (-offset) as u64; + /// self.pos.checked_sub(abs).ok_or_else(|| { + /// io::Error::new( + /// io::ErrorKind::InvalidInput, + /// "invalid seek before start of sparse segments", + /// ) + /// })? + /// } + /// } + /// io::SeekFrom::End(offset) => { + /// if offset >= 0 { + /// self.logical_size + /// .checked_add(offset as u64) + /// .ok_or_else(|| { + /// io::Error::new( + /// io::ErrorKind::InvalidInput, + /// "invalid seek past end of sparse segments", + /// ) + /// })? + /// } else { + /// let abs = (-offset) as u64; + /// self.logical_size.checked_sub(abs).ok_or_else(|| { + /// io::Error::new( + /// io::ErrorKind::InvalidInput, + /// "invalid seek before start of sparse segments", + /// ) + /// })? + /// } + /// } + /// }; + /// + /// if target > self.logical_size { + /// return Err(io::Error::new(io::ErrorKind::InvalidInput, "invalid seek")); + /// } + /// + /// self.pos = target; + /// Ok(self.pos) + /// } + /// } + /// + /// impl<'a> SeekSparse for SparseSegments<'a> { + /// fn seek_sparse(&mut self, pos: SeekFromSparse) -> io::Result { + /// match pos { + /// SeekFromSparse::NextData(offset) => { + /// for segment in self.segments { + /// if segment.data.is_empty() { + /// // then this is actually just used to create a hole + /// // and does NOT count as data + /// continue; + /// } + /// + /// if offset < segment.offset { + /// return Ok(segment.offset); + /// } + /// if offset < segment.end() { + /// return Ok(offset); + /// } + /// } + /// Err(io::Error::new( + /// io::ErrorKind::UnexpectedEof, + /// "no more data segments", + /// )) + /// } + /// SeekFromSparse::NextHole(offset) => { + /// if offset > self.logical_size { + /// return Err(io::Error::new( + /// io::ErrorKind::UnexpectedEof, + /// "offset beyond logical size", + /// )); + /// } + /// + /// if offset == self.logical_size { + /// return Ok(offset); + /// } + /// + /// for segment in self.segments { + /// if segment.data.is_empty() { + /// // then this is actually just used to create a hole + /// continue; + /// } + /// + /// if offset < segment.offset { + /// return Ok(offset); + /// } + /// if offset < segment.end() { + /// return Ok(segment.end()); + /// } + /// } + /// + /// Ok(offset) + /// } + /// } + /// } + /// + /// fn logical_size(&self) -> u64 { + /// self.logical_size + /// } + /// + /// fn data_size(&self) -> u64 { + /// self.data_size + /// } + /// } + /// + /// let mut header = Header::new_gnu(); + /// header.set_mode(0o644); + /// + /// let segments = vec![ + /// SparseSegment::new(0, 10, b'a'), + /// SparseSegment::new(20, 5, b'b'), + /// ]; + /// + /// let data = SparseSegments::new(&segments); + /// + /// let mut ar = Builder::new(Vec::new()); + /// ar.append_sparse_data(header, "file.txt", data).unwrap(); + /// let _archive = ar.into_inner().unwrap(); + /// ``` + pub fn append_sparse_data, R: Read + Seek + SeekSparse>( + &mut self, + mut header: Header, + path: P, + mut data: R, + ) -> io::Result<()> { + if self.options.sparse { + let allow_absolute = self.options.preserve_absolute; + prepare_header_path(self.get_mut(), &mut header, path.as_ref(), allow_absolute)?; + header.set_size(data.logical_size()); + let sparse_entries = prepare_header_sparse_generic(&mut data, &mut header)?; + header.set_cksum(); + + append_sparse(self.get_mut(), &header, sparse_entries, &mut data) + } else { + self.append_data(&mut header, path, data) + } + } + /// Adds a new entry to this archive and returns an [`EntryWriter`] for /// adding its contents. /// @@ -673,12 +1077,42 @@ fn append(mut dst: &mut dyn Write, header: &Header, mut data: &mut dyn Read) -> Ok(()) } +fn append_sparse( + dst: &mut dyn Write, + header: &Header, + sparse_entries: Option, + data: &mut R, +) -> io::Result<()> { + dst.write_all(header.as_bytes())?; + + if let Some(sparse_entries) = sparse_entries { + append_extended_sparse_headers(dst, &sparse_entries)?; + for entry in sparse_entries.entries { + data.seek(io::SeekFrom::Start(entry.offset))?; + io::copy(&mut data.take(entry.num_bytes), dst)?; + } + pad_zeroes(dst, sparse_entries.on_disk_size)?; + } else { + let len = io::copy(data, dst)?; + pad_zeroes(dst, len)?; + } + Ok(()) +} + +#[inline] +fn calc_pad_zeros(len: u64) -> u64 { + let r = len % BLOCK_SIZE; + if r == 0 { + 0 + } else { + BLOCK_SIZE - r + } +} + fn pad_zeroes(dst: &mut dyn Write, len: u64) -> io::Result<()> { let buf = [0; BLOCK_SIZE as usize]; - let remaining = BLOCK_SIZE - (len % BLOCK_SIZE); - if remaining < BLOCK_SIZE { - dst.write_all(&buf[..remaining as usize])?; - } + let num_zeros = calc_pad_zeros(len); + dst.write_all(&buf[..num_zeros as usize])?; Ok(()) } @@ -923,6 +1357,26 @@ fn prepare_header_sparse( _ => return Ok(None), }; + prepare_header_sparse_from_entries(header, &entries); + + Ok(Some(entries)) +} + +fn prepare_header_sparse_generic( + data: &mut R, + header: &mut Header, +) -> io::Result> { + let entries = match find_sparse_entries_seek(data)? { + Some(entries) => entries, + _ => return Ok(None), + }; + + prepare_header_sparse_from_entries(header, &entries); + + Ok(Some(entries)) +} + +fn prepare_header_sparse_from_entries(header: &mut Header, entries: &SparseEntries) { header.set_entry_type(EntryType::GNUSparse); header.set_size(entries.on_disk_size); @@ -938,8 +1392,6 @@ fn prepare_header_sparse( header_entry.set_length(entry.num_bytes); } gnu_header.set_is_extended(entries.entries.len() > gnu_header.sparse.len()); - - Ok(Some(entries)) } /// Write extra sparse headers into `dst` for those entries that did not fit in the main header. @@ -1048,7 +1500,9 @@ impl SparseEntries { #[derive(Debug, Copy, Clone, PartialEq, Eq)] struct SparseEntry { + /// offset on seekable data offset: u64, + /// length on seekable data num_bytes: u64, } @@ -1070,39 +1524,23 @@ fn find_sparse_entries( } #[cfg(any(target_os = "android", target_os = "freebsd", target_os = "linux"))] - find_sparse_entries_seek(file, stat) + find_sparse_entries_seek(&mut FileWithMetadata { file, stat }) } -/// Implementation of `find_sparse_entries` using `SEEK_HOLE` and `SEEK_DATA`. -#[cfg(any(target_os = "android", target_os = "freebsd", target_os = "linux"))] -fn find_sparse_entries_seek( - file: &mut fs::File, - stat: &fs::Metadata, +fn find_sparse_entries_seek( + data: &mut R, ) -> io::Result> { - use std::os::unix::fs::MetadataExt as _; - use std::os::unix::io::AsRawFd as _; - - fn lseek(file: &fs::File, offset: i64, whence: libc::c_int) -> Result { - #[cfg(any(target_os = "linux", target_os = "android"))] - let lseek = libc::lseek64; - #[cfg(not(any(target_os = "linux", target_os = "android")))] - let lseek = libc::lseek; - - match unsafe { lseek(file.as_raw_fd(), offset, whence) } { - -1 => Err(io::Error::last_os_error().raw_os_error().unwrap()), - off => Ok(off), - } - } + let data_size = data.logical_size(); - if stat.blocks() == 0 { - return Ok(if stat.size() == 0 { + if data.data_size() == 0 { + return Ok(if data.logical_size() == 0 { // Empty file. None } else { // Fully sparse file. Some(SparseEntries { entries: vec![SparseEntry { - offset: stat.size(), + offset: data.logical_size(), num_bytes: 0, }], on_disk_size: 0, @@ -1110,122 +1548,163 @@ fn find_sparse_entries_seek( }); } - // On most Unixes, we need to read `_PC_MIN_HOLE_SIZE` to see if the file - // system supports `SEEK_HOLE`. - // FreeBSD: https://man.freebsd.org/cgi/man.cgi?query=lseek&sektion=2&manpath=FreeBSD+14.1-STABLE - #[cfg(not(any(target_os = "linux", target_os = "android")))] - if unsafe { libc::fpathconf(file.as_raw_fd(), libc::_PC_MIN_HOLE_SIZE) } == -1 { - return Ok(None); - } - - // Linux is the only UNIX-like without support for `_PC_MIN_HOLE_SIZE`, so - // instead we try to call `lseek` and see if it fails. - #[cfg(any(target_os = "linux", target_os = "android"))] - match lseek(file, 0, libc::SEEK_HOLE) { + // preliminary check (we have data since non-zero size, so the next hole should be findable) + match data.seek_sparse(SeekFromSparse::NextHole(0)) { Ok(_) => (), - Err(libc::ENXIO) => { - // The file is empty. Treat it as non-sparse. + Err(err) if err.kind() == io::ErrorKind::UnexpectedEof => { + // The data is empty (or the data is not sparse-compatible). Either way, treat it as non-sparse. return Ok(None); } - Err(_) => return Ok(None), + Err(err) => { + return Err(err); + } } - let mut entries = Vec::new(); - let mut on_disk_size = 0; - let mut off_s = 0; + // if a hole is smaller than BLOCK_SIZE, then we need to treat that just as zeros. We must also ensure that all + // SparseEntry items are aligned to BLOCK_SIZE. + + let mut entries: Vec = Vec::new(); + let mut on_disk_size: u64 = 0; + let mut next_hole_start_offset: u64 = 0; + let mut current_segment: Option = None; loop { + // off_s = next_hole_start_offset + // // off_s=0 │ off_s │ off_s // ↓ │ ↓ │ ↓ // | DATA |… │ ……………| HOLE | DATA |… │ …|×EOF× // ↑ │ ↑ ↑ │ // (a) │ (b) (c) (d) │ (e) - match lseek(file, off_s, libc::SEEK_DATA) { - Ok(0) if off_s == 0 => (), // (a) The file starts with data. - Ok(off) if off < off_s => { - // (b) Unlikely. - return Err(std::io::Error::new( - io::ErrorKind::Other, - "lseek(SEEK_DATA) went backwards", - )); - } - Ok(off) if off == off_s => { - // (c) The data at the same offset as the hole. - return Err(std::io::Error::new( - io::ErrorKind::Other, - "lseek(SEEK_DATA) did not advance. \ + let data_start_offset = + match data.seek_sparse(SeekFromSparse::NextData(next_hole_start_offset)) { + Ok(offset) if offset == 0 => offset, // (a) The file starts with data (pointing at the start of the data). + Ok(offset) if offset < next_hole_start_offset => { + // (b) Unlikely. + return Err(io::Error::new( + io::ErrorKind::Other, + "seek data went backwards", + )); + } + Ok(offset) if offset == next_hole_start_offset => { + // (c) The data at the same offset as the hole. + return Err(io::Error::new( + io::ErrorKind::Other, + "seek data did not advance. \ Did the file change while appending?", - )); + )); + } + Ok(offset) => offset, // (d) This is now pointing at the start of the next data. + Err(error) if error.kind() == io::ErrorKind::UnexpectedEof => break, // (e) Reached the end of the file. + Err(error) => return Err(error), + }; + + // check if we can save our current segment (can we pad our previous data with our most recent hole?) + if let Some(cur) = &mut current_segment { + let prev_hole_length = data_start_offset - next_hole_start_offset; + if calc_pad_zeros(cur.num_bytes) <= prev_hole_length { + cur.num_bytes += calc_pad_zeros(cur.num_bytes); + + on_disk_size += cur.num_bytes; + entries.push(*cur); + + current_segment = None; + } else { + // the hole is too small to actually be represented as sparse (just use zeros) + cur.num_bytes += prev_hole_length; } - Ok(off) => off_s = off, // (d) Jump to the next hole. - Err(libc::ENXIO) => break, // (e) Reached the end of the file. - Err(errno) => return Err(io::Error::from_raw_os_error(errno)), - }; + } + // off_s = data_offset + // // off_s=0 │ off_s │ off_s // ↓ │ ↓ │ ↓ // | DATA |×EOF× │ ……………| DATA | HOLE |… │ …|×EOF× // ↑ │ ↑ ↑ │ // (a) │ (b) (c) (d) │ (e) - match lseek(file, off_s, libc::SEEK_HOLE) { - Ok(off_e) if off_s == 0 && (off_e as u64) == stat.size() => { + let hole_offset = match data.seek_sparse(SeekFromSparse::NextHole(data_start_offset)) { + Ok(offset) if data_start_offset == 0 && offset == data_size => { // (a) The file is not sparse. - file.seek(io::SeekFrom::Start(0))?; + data.seek(io::SeekFrom::Start(0))?; return Ok(None); } - Ok(off_e) if off_e < off_s => { + Ok(offset) if offset < next_hole_start_offset => { // (b) Unlikely. - return Err(std::io::Error::new( + return Err(io::Error::new( io::ErrorKind::Other, - "lseek(SEEK_HOLE) went backwards", + "seek hole went backwards", )); } - Ok(off_e) if off_e == off_s => { + Ok(offset) if offset == data_start_offset => { // (c) The hole at the same offset as the data. - return Err(std::io::Error::new( + return Err(io::Error::new( io::ErrorKind::Other, - "lseek(SEEK_HOLE) did not advance. \ - Did the file change while appending?", + "seek hole did not advance. \ + Did the data change while appending?", )); } - Ok(off_e) => { + Ok(offset) => { // (d) Found a hole or reached the end of the file (implicit // zero-length hole). - entries.push(SparseEntry { - offset: off_s as u64, - num_bytes: off_e as u64 - off_s as u64, - }); - on_disk_size += off_e as u64 - off_s as u64; - off_s = off_e; + offset } - Err(libc::ENXIO) => { + Err(error) if error.kind() == io::ErrorKind::UnexpectedEof => { // (e) off_s was already beyond the end of the file. - return Err(std::io::Error::new( + return Err(io::Error::new( io::ErrorKind::Other, - "lseek(SEEK_HOLE) returned ENXIO. \ - Did the file change while appending?", + "seek hole returned UnexpectedEof. \ + Did the data change while appending?", )); } - Err(errno) => return Err(io::Error::from_raw_os_error(errno)), + Err(error) => return Err(error), }; + + let data_len = hole_offset - data_start_offset; + + let cur = if let Some(mut cur) = current_segment { + cur.num_bytes += data_len; + cur + } else { + SparseEntry { + offset: data_start_offset, + num_bytes: data_len, + } + }; + + // check if we are naturally block-aligned (likely file-based data) + if cur.num_bytes % BLOCK_SIZE == 0 { + on_disk_size += cur.num_bytes; + entries.push(cur); + + current_segment = None; + } else { + current_segment = Some(cur); + } + + next_hole_start_offset = hole_offset; } - if off_s as u64 > stat.size() { - return Err(std::io::Error::new( + if next_hole_start_offset > data_size { + return Err(io::Error::new( io::ErrorKind::Other, - "lseek(SEEK_DATA) went beyond the end of the file. \ + "seek data went beyond the end of the file. \ Did the file change while appending?", )); } + if let Some(cur) = current_segment { + on_disk_size += cur.num_bytes; + entries.push(cur); + } + // Add a final zero-length entry. It is required if the file ends with a // hole, and redundant otherwise. However, we add it unconditionally to // mimic GNU tar behavior. entries.push(SparseEntry { - offset: stat.size(), + offset: data_size, num_bytes: 0, }); - file.seek(io::SeekFrom::Start(0))?; + data.seek(io::SeekFrom::Start(0))?; Ok(Some(SparseEntries { entries, diff --git a/src/lib.rs b/src/lib.rs index 8a848f28..116eaf74 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -42,7 +42,7 @@ use std::io::{Error, ErrorKind}; pub use crate::archive::{Archive, Entries}; -pub use crate::builder::{Builder, EntryWriter}; +pub use crate::builder::{Builder, EntryWriter, SeekFromSparse, SeekSparse}; pub use crate::entry::{Entry, Unpacked}; pub use crate::entry_type::EntryType; pub use crate::header::GnuExtSparseHeader; diff --git a/tests/all.rs b/tests/all.rs index eaeb839f..606be8ed 100644 --- a/tests/all.rs +++ b/tests/all.rs @@ -14,7 +14,9 @@ use std::process::Command; use filetime::FileTime; use rand::rngs::SmallRng; use rand::{Rng, SeedableRng}; -use tar::{Archive, Builder, Entries, Entry, EntryType, Header, HeaderMode}; +use tar::{ + Archive, Builder, Entries, Entry, EntryType, Header, HeaderMode, SeekFromSparse, SeekSparse, +}; use tempfile::{Builder as TempBuilder, TempDir}; /// A reader wrapper that returns partial results from `read()` to exercise @@ -1500,7 +1502,306 @@ fn writing_sparse() { let expected = fs::read_to_string(&path).unwrap(); - assert!(s == expected, "path: {path:?}"); + assert!( + s == expected, + "path: {path:?}, actual len = {}, expected len = {}", + s.len(), + expected.len() + ); + } + + assert!(entries.next().is_none()); +} + +#[derive(Clone, Debug)] +struct SparseSegments<'a> { + segments: &'a [SparseSegment], + logical_size: u64, + data_size: u64, + pos: u64, +} + +#[derive(Clone, Debug)] +struct SparseSegment { + offset: u64, + data: Vec, +} + +impl SparseSegment { + #[allow(clippy::option_map_unit_fn)] + fn new(offset: u64, len: usize, fill_byte: u8) -> Self { + let mut data = vec![fill_byte; len]; + data.first_mut().map(|x| *x = b'['); + data.last_mut().map(|x| *x = b']'); + + Self { offset, data } + } + + fn end(&self) -> u64 { + self.offset + self.data.len() as u64 + } +} + +impl<'a> SparseSegments<'a> { + // assuming that chunks are in order (this is fine for testing) + fn new(segments: &'a [SparseSegment]) -> SparseSegments<'a> { + let mut logical_size: u64 = 0; + let mut data_size: u64 = 0; + + for segment in segments { + logical_size = logical_size.max(segment.end()); + data_size += segment.data.len() as u64; + } + + SparseSegments { + segments, + logical_size, + data_size, + pos: 0, + } + } + + fn segment_containing(&self, position: u64) -> Option<&SparseSegment> { + self.segments + .iter() + .find(|segment| position >= segment.offset && position < segment.end()) + } + + fn next_segment_offset(&self, position: u64) -> u64 { + self.segments + .iter() + .find(|segment| position < segment.offset) + .map(|segment| segment.offset) + .unwrap_or(self.logical_size) + } +} + +impl<'a> Read for SparseSegments<'a> { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if self.pos >= self.logical_size { + return Ok(0); + } + + let mut written = 0usize; + + while written < buf.len() && self.pos < self.logical_size { + if let Some(segment) = self.segment_containing(self.pos) { + let seg_offset = (self.pos - segment.offset) as usize; + let available = segment.data.len() - seg_offset; + let to_copy = available.min(buf.len() - written); + buf[written..written + to_copy] + .copy_from_slice(&segment.data[seg_offset..seg_offset + to_copy]); + self.pos += to_copy as u64; + written += to_copy; + } else { + let next_offset = self.next_segment_offset(self.pos); + let hole_len = (next_offset - self.pos) as usize; + let to_fill = hole_len.min(buf.len() - written); + buf[written..written + to_fill].fill(0); + self.pos += to_fill as u64; + written += to_fill; + } + } + + Ok(written) + } +} + +impl<'a> Seek for SparseSegments<'a> { + fn seek(&mut self, pos: io::SeekFrom) -> io::Result { + let target = match pos { + io::SeekFrom::Start(offset) => offset, + io::SeekFrom::Current(offset) => { + if offset >= 0 { + self.pos.checked_add(offset as u64).ok_or_else(|| { + io::Error::new(io::ErrorKind::InvalidInput, "invalid seek past u64 max") + })? + } else { + let abs = (-offset) as u64; + self.pos.checked_sub(abs).ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + "invalid seek before start of sparse segments", + ) + })? + } + } + io::SeekFrom::End(offset) => { + if offset >= 0 { + self.logical_size + .checked_add(offset as u64) + .ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + "invalid seek past end of sparse segments", + ) + })? + } else { + let abs = (-offset) as u64; + self.logical_size.checked_sub(abs).ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + "invalid seek before start of sparse segments", + ) + })? + } + } + }; + + if target > self.logical_size { + return Err(io::Error::new(io::ErrorKind::InvalidInput, "invalid seek")); + } + + self.pos = target; + Ok(self.pos) + } +} + +impl<'a> SeekSparse for SparseSegments<'a> { + fn seek_sparse(&mut self, pos: SeekFromSparse) -> io::Result { + match pos { + SeekFromSparse::NextData(offset) => { + for segment in self.segments { + if segment.data.is_empty() { + // then this is actually just used to create a hole + // and does NOT count as data + continue; + } + + if offset < segment.offset { + return Ok(segment.offset); + } + if offset < segment.end() { + return Ok(offset); + } + } + Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "no more data segments", + )) + } + SeekFromSparse::NextHole(offset) => { + if offset > self.logical_size { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "offset beyond logical size", + )); + } + + if offset == self.logical_size { + return Ok(offset); + } + + for segment in self.segments { + if segment.data.is_empty() { + // then this is actually just used to create a hole + continue; + } + + if offset < segment.offset { + return Ok(offset); + } + if offset < segment.end() { + return Ok(segment.end()); + } + } + + Ok(offset) + } + } + } + + fn logical_size(&self) -> u64 { + self.logical_size + } + + fn data_size(&self) -> u64 { + self.data_size + } +} + +#[test] +fn writing_sparse_data() { + let mut ar = Builder::new(Vec::new()); + + let cases: Vec<(&str, Vec)> = vec![ + ("empty", vec![]), + ("full_sparse", vec![SparseSegment::new(0x20_000, 0, b'a')]), + ("_x", vec![SparseSegment::new(0x20_000, 0x1_000, b'a')]), + ( + "x_", + vec![ + SparseSegment::new(0, 0x1_000, b'b'), + SparseSegment::new(0x20_000, 0, b'b'), + ], + ), + ( + "_x_x", + vec![ + SparseSegment::new(0x20_000, 0x1_000, b'c'), + SparseSegment::new(0x40_000, 0x1_000, b'd'), + ], + ), + ( + "x_x_", + vec![ + SparseSegment::new(0, 0x1_000, b'e'), + SparseSegment::new(0x20_000, 0x1_000, b'f'), + SparseSegment::new(0x40_000, 0, b'f'), + ], + ), + ( + "uneven", + vec![ + SparseSegment::new(0x20_333, 0x555, b'u'), + SparseSegment::new(0x40_777, 0x999, b'v'), + ], + ), + ]; + + let mut expected = Vec::new(); + + for (name, segments) in &cases { + let mut data = SparseSegments::new(segments); + let mut expected_bytes = Vec::new(); + data.read_to_end(&mut expected_bytes).unwrap(); + data.seek(io::SeekFrom::Start(0)).unwrap(); + + let mut header = Header::new_gnu(); + header.set_mode(0o644); + + ar.append_sparse_data(header, name, data).expect(name); + expected.push((*name, expected_bytes)); + } + + ar.finish().unwrap(); + + let data = ar.into_inner().unwrap(); + + #[cfg(target_os = "linux")] + assert!(data.len() <= 37 * 1024); + #[cfg(target_os = "freebsd")] + assert!(data.len() <= 273 * 1024); + + let mut ar = Archive::new(&data[..]); + let mut entries = ar.entries().unwrap(); + + for (expected_name, expected_contents) in expected { + let mut entry = entries.next().unwrap().expect(expected_name); + assert_eq!( + &*entry.header().path_bytes(), + expected_name.as_bytes(), + "path mismatch", + ); + + let mut contents = Vec::new(); + entry.read_to_end(&mut contents).unwrap(); + assert!( + contents == expected_contents, + "path: {expected_name}, actual len = {}, expected len = {}", + contents.len(), + expected_contents.len() + ); } assert!(entries.next().is_none());