From 2f5ea4fb484ea71df802bf4d2644c804dbdcac78 Mon Sep 17 00:00:00 2001 From: ncihnegn Date: Fri, 12 Aug 2022 02:25:37 -0700 Subject: [PATCH 1/5] Add support for PAX Format, Version 1.0 --- src/archive.rs | 91 +++++++++++++++++++++++++--------- src/entry.rs | 40 +++++++++++++++ src/header.rs | 10 +++- tests/all.rs | 13 +++++ tests/archives/pax_sparse.tar | Bin 0 -> 7168 bytes 5 files changed, 129 insertions(+), 25 deletions(-) create mode 100644 tests/archives/pax_sparse.tar diff --git a/src/archive.rs b/src/archive.rs index e875124a..9e2d4b83 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -9,9 +9,10 @@ use std::path::Path; use crate::entry::{EntryFields, EntryIo}; use crate::error::TarError; +use crate::header::{SparseEntry, BLOCK_SIZE}; use crate::other; use crate::pax::pax_extensions_size; -use crate::{Entry, GnuExtSparseHeader, GnuSparseHeader, Header}; +use crate::{Entry, GnuExtSparseHeader, Header}; /// A top-level representation of an archive file. /// @@ -256,6 +257,7 @@ impl<'a, R: Read> Iterator for Entries<'a, R> { } } +#[allow(unused_assignments)] // https://github.com/rust-lang/rust/issues/22630 impl<'a> EntriesFields<'a> { fn next_entry_raw( &mut self, @@ -277,14 +279,14 @@ impl<'a> EntriesFields<'a> { // Otherwise, check if we are ignoring zeros and continue, or break as if this is the // end of the archive. if !header.as_bytes().iter().all(|i| *i == 0) { - self.next += 512; + self.next += BLOCK_SIZE as u64; break; } if !self.archive.inner.ignore_zeros { return Ok(None); } - self.next += 512; + self.next += BLOCK_SIZE as u64; header_pos = self.next; } @@ -325,11 +327,11 @@ impl<'a> EntriesFields<'a> { // Store where the next entry is, rounding up by 512 bytes (the size of // a header); let size = size - .checked_add(511) + .checked_add(BLOCK_SIZE as u64 - 1) .ok_or_else(|| other("size overflow"))?; self.next = self .next - .checked_add(size & !(512 - 1)) + .checked_add(size & !(BLOCK_SIZE as u64 - 1)) .ok_or_else(|| other("size overflow"))?; Ok(Some(ret.into_entry())) @@ -394,26 +396,65 @@ impl<'a> EntriesFields<'a> { if let Some(pax_extensions_ref) = &pax_extensions { pax_size = pax_extensions_size(pax_extensions_ref); } + // Not an entry + // Keep pax_extensions for the next ustar header + processed -= 1; continue; } let mut fields = EntryFields::from(entry); + fields.pax_extensions = pax_extensions; + pax_extensions = None; // Reset pax_extensions after use + if is_recognized_header && fields.is_pax_sparse() { + gnu_longname = fields.pax_sparse_name(); + } fields.long_pathname = gnu_longname; fields.long_linkname = gnu_longlink; - fields.pax_extensions = pax_extensions; self.parse_sparse_header(&mut fields)?; return Ok(Some(fields.into_entry())); } } fn parse_sparse_header(&mut self, entry: &mut EntryFields<'a>) -> io::Result<()> { - if !entry.header.entry_type().is_gnu_sparse() { + if !entry.is_pax_sparse() && !entry.header.entry_type().is_gnu_sparse() { return Ok(()); } - let gnu = match entry.header.as_gnu() { - Some(gnu) => gnu, - None => return Err(other("sparse entry type listed but not GNU header")), - }; + let mut sparse_map = Vec::::new(); + let mut real_size = 0; + if entry.is_pax_sparse() { + real_size = entry.pax_sparse_realsize()?; + let mut num_bytes_read = 0; + let mut reader = io::BufReader::with_capacity(BLOCK_SIZE, &self.archive.inner); + let mut read_decimal_line = || -> io::Result { + let mut str = String::new(); + num_bytes_read += reader.read_line(&mut str)?; + str.strip_suffix("\n") + .and_then(|s| s.parse::().ok()) + .ok_or_else(|| other("failed to read a decimal line")) + }; + + let num_entries = read_decimal_line()?; + for _ in 0..num_entries { + let offset = read_decimal_line()?; + let size = read_decimal_line()?; + sparse_map.push(SparseEntry { offset, size }); + } + let rem = BLOCK_SIZE - (num_bytes_read % BLOCK_SIZE); + entry.size -= (num_bytes_read + rem) as u64; + } else if entry.header.entry_type().is_gnu_sparse() { + let gnu = match entry.header.as_gnu() { + Some(gnu) => gnu, + None => return Err(other("sparse entry type listed but not GNU header")), + }; + real_size = gnu.real_size()?; + for block in gnu.sparse.iter() { + if !block.is_empty() { + let offset = block.offset()?; + let size = block.length()?; + sparse_map.push(SparseEntry { offset, size }); + } + } + } // Sparse files are represented internally as a list of blocks that are // read. Blocks are either a bunch of 0's or they're data from the @@ -442,13 +483,10 @@ impl<'a> EntriesFields<'a> { let data = &mut entry.data; let reader = &self.archive.inner; let size = entry.size; - let mut add_block = |block: &GnuSparseHeader| -> io::Result<_> { - if block.is_empty() { - return Ok(()); - } - let off = block.offset()?; - let len = block.length()?; - if len != 0 && (size - remaining) % 512 != 0 { + let mut add_block = |block: &SparseEntry| -> io::Result<_> { + let off = block.offset; + let len = block.size; + if len != 0 && (size - remaining) % BLOCK_SIZE as u64 != 0 { return Err(other( "previous block in sparse file was not \ aligned to 512-byte boundary", @@ -474,10 +512,10 @@ impl<'a> EntriesFields<'a> { data.push(EntryIo::Data(reader.take(len))); Ok(()) }; - for block in gnu.sparse.iter() { - add_block(block)? + for block in sparse_map { + add_block(&block)? } - if gnu.is_extended() { + if entry.header.as_gnu().map(|gnu| gnu.is_extended()) == Some(true) { let mut ext = GnuExtSparseHeader::new(); ext.isextended[0] = 1; while ext.is_extended() { @@ -485,14 +523,19 @@ impl<'a> EntriesFields<'a> { return Err(other("failed to read extension")); } - self.next += 512; + self.next += BLOCK_SIZE as u64; for block in ext.sparse.iter() { - add_block(block)?; + if !block.is_empty() { + add_block(&SparseEntry { + offset: block.offset()?, + size: block.length()?, + })?; + } } } } } - if cur != gnu.real_size()? { + if cur != real_size { return Err(other( "mismatch in sparse file chunks and \ size in header", diff --git a/src/entry.rs b/src/entry.rs index cce39d45..b3cfc11d 100644 --- a/src/entry.rs +++ b/src/entry.rs @@ -285,6 +285,46 @@ impl<'a> EntryFields<'a> { self.read_to_end(&mut v).map(|_| v) } + pub fn is_pax_sparse(&mut self) -> bool { + if let Some(ref pax) = self.pax_extensions { + let mut extensions = PaxExtensions::new(pax).filter_map(|f| f.ok()); + return extensions + .find(|f| f.key_bytes() == b"GNU.sparse.major" && f.value_bytes() == b"1") + .is_some() + && extensions + .find(|f| f.key_bytes() == b"GNU.sparse.minor" && f.value_bytes() == b"0") + .is_some(); + } + false + } + + pub fn pax_sparse_name(&mut self) -> Option> { + if let Some(ref pax) = self.pax_extensions { + return PaxExtensions::new(pax) + .filter_map(|f| f.ok()) + .find(|f| f.key_bytes() == b"GNU.sparse.name") + .map(|f| f.value_bytes().to_vec()); + } + None + } + + pub fn pax_sparse_realsize(&mut self) -> io::Result { + if let Some(ref pax) = self.pax_extensions { + let pax = PaxExtensions::new(pax) + .filter_map(|f| f.ok()) + .find(|f| f.key_bytes() == b"GNU.sparse.realsize") + .map(|f| f.value_bytes()); + if let Some(field) = pax { + let str = + std::str::from_utf8(&field).map_err(|_| other("failed to read string"))?; + return str + .parse::() + .map_err(|_| other("failed to parse the real size")); + } + } + Err(other("PAX extension GNU.sparse.realsize not found")) + } + fn path(&self) -> io::Result> { bytes2path(self.path_bytes()) } diff --git a/src/header.rs b/src/header.rs index 7e507fc7..4e482ca7 100644 --- a/src/header.rs +++ b/src/header.rs @@ -16,11 +16,13 @@ use std::str; use crate::other; use crate::EntryType; +pub const BLOCK_SIZE: usize = 512; + /// Representation of the header of an entry in an archive #[repr(C)] #[allow(missing_docs)] pub struct Header { - bytes: [u8; 512], + bytes: [u8; BLOCK_SIZE], } /// Declares the information that should be included when filling a Header @@ -110,6 +112,12 @@ pub struct GnuHeader { pub pad: [u8; 17], } +/// Description of a spare entry. +pub struct SparseEntry { + pub offset: u64, + pub size: u64, +} + /// Description of the header of a spare entry. /// /// Specifies the offset/number of bytes of a chunk of data in octal. diff --git a/tests/all.rs b/tests/all.rs index fa38ef62..22436535 100644 --- a/tests/all.rs +++ b/tests/all.rs @@ -1089,6 +1089,19 @@ fn sparse_with_trailing() { assert_eq!(&s[0x100_000..], "1MB through\n"); } +#[test] +fn pax_sparse() { + let rdr = Cursor::new(tar!("pax_sparse.tar")); + let mut ar = Archive::new(rdr); + let td = t!(TempBuilder::new().prefix("tar-rs").tempdir()); + t!(ar.unpack(td.path())); + + let mut s = String::new(); + t!(t!(File::open(td.path().join("sparse_begin.txt"))).read_to_string(&mut s)); + assert_eq!(&s[..5], "test\n"); + assert!(s[5..].chars().all(|x| x == '\u{0}')); +} + #[test] fn path_separators() { let mut ar = Builder::new(Vec::new()); diff --git a/tests/archives/pax_sparse.tar b/tests/archives/pax_sparse.tar new file mode 100644 index 0000000000000000000000000000000000000000..d74bef7b57b53810f3a35c68fae58f9672941e42 GIT binary patch literal 7168 zcmeH{!A`?442FBoQ{)MP?WDL6nO0$6q9j&q zfA*hyKimfY8sb&|;bR}3p2J(ysx+ixF#`Z=j4bk+?YPp<9)NTyksN~44oi@LRvRef zuvi8&4}JE@$DCU1y4jMm+jcwi&a}9($dwpX@+CFvNdjQzmFL^@+jn`sORt8>p;U3Bxs~nS1$g z5dL+u3kv?^7s%3>(Ldt&DV*xxA(=wLIl%sR{R+)1%w1576#B$Xzo3S2 jDolV0FaajO1egF5U;<2l2`~XBzyz286JP>NU_k=k(iU$S literal 0 HcmV?d00001 From 95d44da74da2ba7e670a54f883775f8976c9cdb6 Mon Sep 17 00:00:00 2001 From: ncihnegn Date: Mon, 12 Sep 2022 14:08:49 -0700 Subject: [PATCH 2/5] Addressing comments --- src/archive.rs | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/archive.rs b/src/archive.rs index 9e2d4b83..9abf3259 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -257,7 +257,7 @@ impl<'a, R: Read> Iterator for Entries<'a, R> { } } -#[allow(unused_assignments)] // https://github.com/rust-lang/rust/issues/22630 +#[allow(unused_assignments)] impl<'a> EntriesFields<'a> { fn next_entry_raw( &mut self, @@ -404,11 +404,14 @@ impl<'a> EntriesFields<'a> { let mut fields = EntryFields::from(entry); fields.pax_extensions = pax_extensions; + // False positive: unused assignment + // https://github.com/rust-lang/rust/issues/22630 pax_extensions = None; // Reset pax_extensions after use - if is_recognized_header && fields.is_pax_sparse() { - gnu_longname = fields.pax_sparse_name(); - } - fields.long_pathname = gnu_longname; + fields.long_pathname = if is_recognized_header && fields.is_pax_sparse() { + fields.pax_sparse_name() + } else { + gnu_longname + }; fields.long_linkname = gnu_longlink; self.parse_sparse_header(&mut fields)?; return Ok(Some(fields.into_entry())); @@ -483,9 +486,7 @@ impl<'a> EntriesFields<'a> { let data = &mut entry.data; let reader = &self.archive.inner; let size = entry.size; - let mut add_block = |block: &SparseEntry| -> io::Result<_> { - let off = block.offset; - let len = block.size; + let mut add_block = |off: u64, len: u64| -> io::Result<_> { if len != 0 && (size - remaining) % BLOCK_SIZE as u64 != 0 { return Err(other( "previous block in sparse file was not \ @@ -513,7 +514,7 @@ impl<'a> EntriesFields<'a> { Ok(()) }; for block in sparse_map { - add_block(&block)? + add_block(block.offset, block.size)? } if entry.header.as_gnu().map(|gnu| gnu.is_extended()) == Some(true) { let mut ext = GnuExtSparseHeader::new(); @@ -526,10 +527,7 @@ impl<'a> EntriesFields<'a> { self.next += BLOCK_SIZE as u64; for block in ext.sparse.iter() { if !block.is_empty() { - add_block(&SparseEntry { - offset: block.offset()?, - size: block.length()?, - })?; + add_block(block.offset()?, block.length()?)?; } } } From 3340fc2e7ed13f75612510b35d7d2dc74a498ebf Mon Sep 17 00:00:00 2001 From: ncihnegn Date: Mon, 12 Sep 2022 14:44:02 -0700 Subject: [PATCH 3/5] Update comments --- src/archive.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/archive.rs b/src/archive.rs index 9abf3259..be73ae22 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -396,8 +396,8 @@ impl<'a> EntriesFields<'a> { if let Some(pax_extensions_ref) = &pax_extensions { pax_size = pax_extensions_size(pax_extensions_ref); } - // Not an entry - // Keep pax_extensions for the next ustar header + // This entry has two headers. + // Keep pax_extensions for the next ustar header. processed -= 1; continue; } From 827a3a080545093e60984756f576a5f19893b1bb Mon Sep 17 00:00:00 2001 From: ncihnegn Date: Sun, 9 Jun 2024 14:00:00 -0800 Subject: [PATCH 4/5] Fix typos --- src/archive.rs | 5 +---- src/header.rs | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/archive.rs b/src/archive.rs index 613cf0a0..242ec79c 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -12,7 +12,7 @@ use crate::error::TarError; use crate::header::{SparseEntry, BLOCK_SIZE}; use crate::other; use crate::pax::*; -use crate::{Entry, GnuExtSparseHeader, GnuSparseHeader, Header}; +use crate::{Entry, GnuExtSparseHeader, Header}; /// A top-level representation of an archive file. /// @@ -422,9 +422,6 @@ impl<'a> EntriesFields<'a> { )); } pax_extensions = Some(EntryFields::from(entry).read_all()?); - if let Some(pax_extensions_ref) = &pax_extensions { - pax_size = pax_extensions_size(pax_extensions_ref); - } // This entry has two headers. // Keep pax_extensions for the next ustar header. processed -= 1; diff --git a/src/header.rs b/src/header.rs index 4eacc061..ef691228 100644 --- a/src/header.rs +++ b/src/header.rs @@ -16,14 +16,14 @@ use std::str; use crate::other; use crate::EntryType; -pub const BLOCK_SIZE: usize = 51 +pub const BLOCK_SIZE: usize = 512; /// A deterministic, arbitrary, non-zero timestamp that use used as `mtime` /// of headers when [`HeaderMode::Deterministic`] is used. /// /// This value, chosen after careful deliberation, corresponds to _Jul 23, 2006_, /// which is the date of the first commit for what would become Rust. #[cfg(any(unix, windows))] -const DETERMINISTIC_TIMESTAMP: u64 = 115370408 +const DETERMINISTIC_TIMESTAMP: u64 = 1153704088; /// Representation of the header of an entry in an archive #[repr(C)] From 8ad8efd64bfb999015ff7944e2a7de52025054f5 Mon Sep 17 00:00:00 2001 From: ncihnegn Date: Fri, 28 Feb 2025 00:53:10 -0800 Subject: [PATCH 5/5] Address comments --- src/entry.rs | 6 ++++-- src/pax.rs | 13 +++++++++++++ tests/all.rs | 9 ++++++--- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/entry.rs b/src/entry.rs index b3cfc11d..e5e6ed2a 100644 --- a/src/entry.rs +++ b/src/entry.rs @@ -13,6 +13,7 @@ use crate::archive::ArchiveInner; use crate::error::TarError; use crate::header::bytes2path; use crate::other; +use crate::pax::{GNU_SPARSE_MAJOR_EXTENSION, GNU_SPARSE_MINOR_EXTENSION}; use crate::{Archive, Header, PaxExtensions}; /// A read-only view into an entry of an archive. @@ -285,14 +286,15 @@ impl<'a> EntryFields<'a> { self.read_to_end(&mut v).map(|_| v) } + /// Check if the tar file is using PAX sparse extensions. pub fn is_pax_sparse(&mut self) -> bool { if let Some(ref pax) = self.pax_extensions { let mut extensions = PaxExtensions::new(pax).filter_map(|f| f.ok()); return extensions - .find(|f| f.key_bytes() == b"GNU.sparse.major" && f.value_bytes() == b"1") + .find(|f| *f == GNU_SPARSE_MAJOR_EXTENSION) .is_some() && extensions - .find(|f| f.key_bytes() == b"GNU.sparse.minor" && f.value_bytes() == b"0") + .find(|f| *f == GNU_SPARSE_MINOR_EXTENSION) .is_some(); } false diff --git a/src/pax.rs b/src/pax.rs index 80ca3e9b..7c397df7 100644 --- a/src/pax.rs +++ b/src/pax.rs @@ -25,11 +25,24 @@ impl<'entry> PaxExtensions<'entry> { } /// A key/value pair corresponding to a pax extension. +#[derive(PartialEq)] pub struct PaxExtension<'entry> { key: &'entry [u8], value: &'entry [u8], } +/// Constant of the GNU sparse major extension. +pub const GNU_SPARSE_MAJOR_EXTENSION: PaxExtension<'_> = PaxExtension { + key: b"GNU.sparse.major", + value: b"1", +}; + +/// Constant of the GNU sparse minor extension. +pub const GNU_SPARSE_MINOR_EXTENSION: PaxExtension<'_> = PaxExtension { + key: b"GNU.sparse.minor", + value: b"0", +}; + pub fn pax_extensions_size(a: &[u8]) -> Option { for extension in PaxExtensions::new(a) { let current_extension = match extension { diff --git a/tests/all.rs b/tests/all.rs index 22436535..1d1e5e50 100644 --- a/tests/all.rs +++ b/tests/all.rs @@ -1093,11 +1093,14 @@ fn sparse_with_trailing() { fn pax_sparse() { let rdr = Cursor::new(tar!("pax_sparse.tar")); let mut ar = Archive::new(rdr); - let td = t!(TempBuilder::new().prefix("tar-rs").tempdir()); - t!(ar.unpack(td.path())); + let td = TempBuilder::new().prefix("tar-rs").tempdir().unwrap(); + ar.unpack(td.path()).unwrap(); let mut s = String::new(); - t!(t!(File::open(td.path().join("sparse_begin.txt"))).read_to_string(&mut s)); + File::open(td.path().join("sparse_begin.txt")) + .unwrap() + .read_to_string(&mut s) + .unwrap(); assert_eq!(&s[..5], "test\n"); assert!(s[5..].chars().all(|x| x == '\u{0}')); }