From b688724ab78b20e7608d24c1b66db7199307238f Mon Sep 17 00:00:00 2001 From: "Bjarni R. Einarsson" Date: Fri, 2 Dec 2022 12:47:29 +0000 Subject: [PATCH 01/15] Make it possible to disable encryption for individual files --- pyzipper/zipfile.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/pyzipper/zipfile.py b/pyzipper/zipfile.py index ae46a65..f8d7750 100644 --- a/pyzipper/zipfile.py +++ b/pyzipper/zipfile.py @@ -1685,6 +1685,7 @@ class ZipFile: fp = None # Set here since __del__ checks it _windows_illegal_name_trans_table = None zipinfo_cls = ZipInfo + zipinfo_cls_cleartext = ZipInfo zipextfile_cls = ZipExtFile zipwritefile_cls = _ZipWriteFile @@ -1963,7 +1964,7 @@ def read(self, name, pwd=None): with self.open(name, "r", pwd) as fp: return fp.read() - def open(self, name, mode="r", pwd=None, *, force_zip64=False): + def open(self, name, mode="r", pwd=None, *, force_zip64=False, encrypt=True): """Return file-like object for 'name'. name is a string for the file name within the ZIP file, or a ZipInfo @@ -1991,11 +1992,14 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False): raise TypeError("pwd: expected bytes, got %s" % type(pwd).__name__) # Make sure we have an info object - if isinstance(name, self.zipinfo_cls): + if isinstance(name, (self.zipinfo_cls, self.zipinfo_cls_cleartext)): # 'name' is already an info object zinfo = name elif mode == 'w': - zinfo = self.zipinfo_cls(name) + if encrypt: + zinfo = self.zipinfo_cls(name) + else: + zinfo = self.zipinfo_cls_cleartext(name) zinfo.compress_type = self.compression zinfo._compresslevel = self.compresslevel else: @@ -2003,7 +2007,8 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False): zinfo = self.getinfo(name) if mode == 'w': - return self._open_to_write(zinfo, force_zip64=force_zip64, pwd=pwd) + return self._open_to_write(zinfo, + force_zip64=force_zip64, pwd=pwd, encrypt=encrypt) if self._writing: raise ValueError("Can't read from the ZIP file while there " @@ -2023,7 +2028,7 @@ def _open_to_read(self, mode, zinfo, pwd): zef_file.close() raise e - def _open_to_write(self, zinfo, force_zip64=False, pwd=None): + def _open_to_write(self, zinfo, force_zip64=False, pwd=None, encrypt=True): if force_zip64 and not self._allowZip64: raise ValueError( "force_zip64 is True, but allowZip64 was False when opening " @@ -2043,7 +2048,7 @@ def _open_to_write(self, zinfo, force_zip64=False, pwd=None): zinfo.flag_bits = 0x00 encrypter = None - if pwd is not None or self.encryption is not None: + if (pwd is not None or self.encryption is not None) and encrypt: zinfo.flag_bits |= _MASK_ENCRYPTED encrypter = self.get_encrypter() encrypter.update_zipinfo(zinfo) @@ -2235,7 +2240,7 @@ def write(self, filename, arcname=None, shutil.copyfileobj(src, dest, 1024*8) def writestr(self, zinfo_or_arcname, data, - compress_type=None, compresslevel=None): + compress_type=None, compresslevel=None, encrypt=True): """Write a file into the archive. The contents is 'data', which may be either a 'str' or a 'bytes' instance; if it is a 'str', it is encoded as UTF-8 first. @@ -2273,7 +2278,7 @@ def writestr(self, zinfo_or_arcname, data, zinfo.file_size = len(data) # Uncompressed size with self._lock: - with self.open(zinfo, mode='w') as dest: + with self.open(zinfo, mode='w', encrypt=encrypt) as dest: dest.write(data) def __del__(self): From 51dc446326557b2ce411bc55bbaac3a5d9300291 Mon Sep 17 00:00:00 2001 From: "Bjarni R. Einarsson" Date: Tue, 16 May 2023 14:27:18 +0000 Subject: [PATCH 02/15] Filenames can be bytes --- pyzipper/zipfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyzipper/zipfile.py b/pyzipper/zipfile.py index f8d7750..ed7f227 100644 --- a/pyzipper/zipfile.py +++ b/pyzipper/zipfile.py @@ -1720,7 +1720,7 @@ def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, else: if isinstance(file, pathlib.PurePath): file = str(file) - if isinstance(file, str): + if isinstance(file, (str, bytes)): # No, it's a filename self._filePassed = 0 self.filename = file From 030467b2863b40ae51d1c79cbbbf8ccf01857113 Mon Sep 17 00:00:00 2001 From: "Bjarni R. Einarsson" Date: Tue, 5 Sep 2023 15:22:33 +0000 Subject: [PATCH 03/15] Create ZipFileRW class, for truly editable ZIP files --- pyzipper/zipfile.py | 191 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 190 insertions(+), 1 deletion(-) diff --git a/pyzipper/zipfile.py b/pyzipper/zipfile.py index ed7f227..2c1bb6b 100644 --- a/pyzipper/zipfile.py +++ b/pyzipper/zipfile.py @@ -36,7 +36,7 @@ __all__ = ["BadZipFile", "BadZipfile", "error", "ZIP_STORED", "ZIP_DEFLATED", "ZIP_BZIP2", "ZIP_LZMA", - "is_zipfile", "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile"] + "is_zipfile", "ZipInfo", "ZipFile", "ZipFileRW", "PyZipFile", "LargeZipFile"] class BadZipFile(Exception): @@ -2543,6 +2543,195 @@ def _compile(file, optimize=-1): return (fname, archivename) +class ZipFileRW(ZipFile): + DELETED_JUNK = b'DELETED' * 1024 + DELETED_FN_FMT = 'DELETED/%s' + + DELETE_IS_INSECURE = False # Set to True, to leave "deleted" data in + # the archive body - only deleting entries + # from the central directory. Setting to True + # also disables compaction. + + COMPACTING_THRESHOLD = 0.0 # Set to a ratio of allowable wasted space, + # to reduce the I/O overhead of deletion. + + def _raw_data_length(self, zinfo): + """ + This calculates how many bytes of the archive are dedicated to + this file/directory. This is guaranteed to never overlap with + any other files, even if headers are corrupt or our code buggy. + """ + offsets = [zi.header_offset for zi in self.filelist] + offsets.append(self.start_dir) + offsets.sort() + for i, o in enumerate(offsets): + if o == zinfo.header_offset: + return (offsets[i+1] - o) + return 0 + + def _overwrite_deleted_data(self, zinfo): + """ + Overwrite the data in the archive with junk (unless insecure deletion + is requested). If we are lucky and this is the last entry in the + archive, truncate it. + + Returns the (ofs, length) of the overwritten segment, False if we + truncated, or None if deletion is insecure or the entry not found. + """ + junk_bytes = self._raw_data_length(zinfo) + if not junk_bytes: + return None + + self._writing = True + + if self.DELETE_IS_INSECURE: + gap = None + else: + gap = (zinfo.header_offset, junk_bytes) + self._didModify = True + self.fp.seek(zinfo.header_offset) + while True: + if junk_bytes > len(self.DELETED_JUNK): + self.fp.write(self.DELETED_JUNK) + junk_bytes -= len(self.DELETED_JUNK) + else: + self.fp.write(self.DELETED_JUNK[:junk_bytes]) + break + self.fp.seek(self.start_dir) + + if (zinfo.header_offset + junk_bytes) == self.start_dir: + self._didModify = True + self.fp.seek(zinfo.header_offset) + self.fp.truncate(zinfo.header_offset) + self.start_dir = zinfo.header_offset + gap = False + + self._writing = False + return gap + + def _move(self, zinfo, new_pos): + """ + Move a file to a new location within the archive. Overlap between + the source and destination is only allowed if moving backwards. + """ + data_bytes = self._raw_data_length(zinfo) + _from = zinfo.header_offset + _to = new_pos + + if _from != _to: + if _from < _to < _from + data_bytes: + raise io.UnsupportedOperation( + 'Overlapping ranges are only safe when moving backwards') + + self._didModify = True + _bytes = data_bytes + while _bytes > 0: + self.fp.seek(_from) + data = self.fp.read(min(_bytes, 256*1024)) + self.fp.seek(_to) + self.fp.write(data) + _bytes -= len(data) + _from += len(data) + _to += len(data) + self.fp.seek(self.start_dir) + zinfo.header_offset = new_pos + return new_pos + data_bytes + + def delete(self, filename): + """ + Delete an entry from the archive. + """ + if not self._seekable: + raise io.UnsupportedOperation('Can only delete from seekable files') + + if hasattr(filename, 'filename'): + filename = filename.filename + + with self._lock: + zinfo = self.NameToInfo[filename] # May throw KeyError + del self.NameToInfo[filename] + gap = self._overwrite_deleted_data(zinfo) + if gap: + fn = self.DELETED_FN_FMT % gap[0] + gap_zinfo = self.zipinfo_cls(fn) + gap_zinfo.header_offset = gap[0] + gap_zinfo.compress_size = 0 + gap_zinfo.file_size = 0 + gap_zinfo.CRC = 0 + self.NameToInfo[fn] = gap_zinfo + self.filelist[self.filelist.index(zinfo)] = gap_zinfo + else: + self.filelist.remove(zinfo) + self._didModify = True + + def compact(self, threshold=0): + """ + Reclaime unused space after one or more deletions, by rewriting + the archive in-place. + + WARNING: This may leave the archive in a corrupt state, if + interrupted part-way through. + """ + if not self._seekable: + return + + def is_del(zi): + return ( + (zi.compress_size == zi.CRC == zi.file_size == 0) and + (zi.filename.startswith(self.DELETED_FN_FMT % ''))) + + with self._lock: + deleted = [zi.header_offset for zi in self.filelist if is_del(zi)] + if not deleted: + return + + # Generate a list of (offset, zipinfo) tuples which is in the + # same order as the data in the archive itself. This guarantees + # our _move() ops below will always succeed. + offsets = [(zi.header_offset, zi) + for i, zi in enumerate(self.filelist)] + offsets.sort() + + # If a threshold is set, calculate how much space is being + # wasted before we proceed. If below the threshold, abort! + if threshold: + total = 0.0 + wasted = 0.0 + for i, (ofs, zi) in enumerate(offsets[:-1]): + size = offsets[i+1][0] - ofs + total += size + if ofs in deleted: + wasted += size + if (wasted / total) < threshold: + return + + writing_to = 0 + keep = [] + self._writing = True + self._didModify = True + for ofs, zinfo in offsets: + if ofs in deleted: + # Remove it in case we get interrupted; it is + # probably about to be overwritten! + self.filelist.remove(zinfo) + else: + # Not deleted: move forward if necessary! + writing_to = self._move(zinfo, writing_to) + keep.append(zinfo) + + self.fp.truncate(writing_to) + self.fp.seek(writing_to) + self.start_dir = writing_to + self.filelist = keep + self.NameToInfo = dict((zi.filename, zi) for zi in keep) + self._writing = False + + def _write_end_record(self): + if not self.DELETE_IS_INSECURE: + self.compact(threshold=self.COMPACTING_THRESHOLD) + return super()._write_end_record() + + def main(args=None): import argparse From bbc77ea15a950fe90899b3a56703535a458bf9a0 Mon Sep 17 00:00:00 2001 From: "Bjarni R. Einarsson" Date: Tue, 5 Sep 2023 15:23:45 +0000 Subject: [PATCH 04/15] Base AESZipFile off ZipFileRW, instead of plain ZipFile --- pyzipper/zipfile_aes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyzipper/zipfile_aes.py b/pyzipper/zipfile_aes.py index ddfd84a..7939d99 100644 --- a/pyzipper/zipfile_aes.py +++ b/pyzipper/zipfile_aes.py @@ -13,6 +13,7 @@ BadZipFile, BaseZipDecrypter, ZipFile, + ZipFileRW, ZipInfo, ZipExtFile, ) @@ -328,7 +329,7 @@ def check_integrity(self): super().check_integrity() -class AESZipFile(ZipFile): +class AESZipFile(ZipFileRW): zipinfo_cls = AESZipInfo zipextfile_cls = AESZipExtFile From a06eb3e74bddc5608558e8070857f1686f3d6d69 Mon Sep 17 00:00:00 2001 From: "Bjarni R. Einarsson" Date: Wed, 6 Sep 2023 09:48:53 +0000 Subject: [PATCH 05/15] Document ZipFileRW API, make configurable on init --- pyzipper/zipfile.py | 111 +++++++++++++++++++++++++++++--------------- 1 file changed, 73 insertions(+), 38 deletions(-) diff --git a/pyzipper/zipfile.py b/pyzipper/zipfile.py index 2c1bb6b..e46ce09 100644 --- a/pyzipper/zipfile.py +++ b/pyzipper/zipfile.py @@ -2544,17 +2544,48 @@ def _compile(file, optimize=-1): class ZipFileRW(ZipFile): - DELETED_JUNK = b'DELETED' * 1024 + """ ZipFile subclass which can delete in-place from ZIP archives. + + z = ZipFileRW(file, mode="a", + insecure_delete=False, + compacting_threshold=0.0, + ... ) + + In addition to inherited ZipFile arguments, ZipFileRW settings are: + + insecure_delete: Set to True, to leave "deleted" data in the archive + body. In this mode, deleting only removes entries + from the central directory. This is faster and makes + it theoretically possible to restore deleted files. + When True, archive compaction is disabled. + + compacting_threshold: A ratio of allowable wasted space (0.5 = 50%). + Setting to a value above the default of 0 will + reduce disk I/O somewhat, at the expense of + wasting space. It will also leave "DELETED" + entries the directory until compacted. + """ + DELETED_JUNK = b'DELETED!' * 1024 DELETED_FN_FMT = 'DELETED/%s' - DELETE_IS_INSECURE = False # Set to True, to leave "deleted" data in - # the archive body - only deleting entries - # from the central directory. Setting to True - # also disables compaction. - + # Subclasses can override these to adjust defaults, which will make + # sense for apps that know what their use profile is. + INSECURE_DELETE = False COMPACTING_THRESHOLD = 0.0 # Set to a ratio of allowable wasted space, # to reduce the I/O overhead of deletion. + def __init__(self, *args, **kwargs): + """ + A subclass of ZipFile + """ + self.compacting_threshold = kwargs.pop( + 'compacting_threshold', self.COMPACTING_THRESHOLD) + self.insecure_delete = kwargs.pop( + 'insecure_delete', self.INSECURE_DELETE) + self.deleted_fn_fmt = kwargs.pop( + 'deleted_fn_fmt', self.DELETED_FN_FMT) + super().__init__(*args, **kwargs) + def _raw_data_length(self, zinfo): """ This calculates how many bytes of the archive are dedicated to @@ -2584,7 +2615,7 @@ def _overwrite_deleted_data(self, zinfo): self._writing = True - if self.DELETE_IS_INSECURE: + if self.insecure_delete: gap = None else: gap = (zinfo.header_offset, junk_bytes) @@ -2637,22 +2668,27 @@ def _move(self, zinfo, new_pos): zinfo.header_offset = new_pos return new_pos + data_bytes - def delete(self, filename): + def delete(self, zinfo_or_arcname): """ Delete an entry from the archive. + + By default (when insecure_delete=False) the archived data will be + overwritten with garbage. Space is reclaimed upon compaction. """ if not self._seekable: raise io.UnsupportedOperation('Can only delete from seekable files') - if hasattr(filename, 'filename'): - filename = filename.filename + if hasattr(zinfo_or_arcname, 'filename'): + arcname = zinfo_or_arcname.filename + else: + arcname = zinfo_or_arcname with self._lock: - zinfo = self.NameToInfo[filename] # May throw KeyError - del self.NameToInfo[filename] + zinfo = self.NameToInfo[arcname] # May throw KeyError + del self.NameToInfo[arcname] gap = self._overwrite_deleted_data(zinfo) if gap: - fn = self.DELETED_FN_FMT % gap[0] + fn = self.deleted_fn_fmt % gap[0] gap_zinfo = self.zipinfo_cls(fn) gap_zinfo.header_offset = gap[0] gap_zinfo.compress_size = 0 @@ -2664,10 +2700,11 @@ def delete(self, filename): self.filelist.remove(zinfo) self._didModify = True - def compact(self, threshold=0): + def compact(self, threshold=0.0): """ - Reclaime unused space after one or more deletions, by rewriting - the archive in-place. + Reclaim unused space after one or more deletions, by rewriting + the archive in-place. DELETED/... entries are removed from the + central directory. WARNING: This may leave the archive in a corrupt state, if interrupted part-way through. @@ -2705,30 +2742,28 @@ def is_del(zi): if (wasted / total) < threshold: return - writing_to = 0 - keep = [] - self._writing = True - self._didModify = True - for ofs, zinfo in offsets: - if ofs in deleted: - # Remove it in case we get interrupted; it is - # probably about to be overwritten! - self.filelist.remove(zinfo) - else: - # Not deleted: move forward if necessary! - writing_to = self._move(zinfo, writing_to) - keep.append(zinfo) - - self.fp.truncate(writing_to) - self.fp.seek(writing_to) - self.start_dir = writing_to - self.filelist = keep - self.NameToInfo = dict((zi.filename, zi) for zi in keep) - self._writing = False + try: + writing_to = 0 + self._writing = True + self._didModify = True + for ofs, zinfo in offsets: + if ofs in deleted: + # Remove! Probably about to be overwritten. + self.filelist.remove(zinfo) + else: + # Keep: Move forward in the archive, if necessary. + writing_to = self._move(zinfo, writing_to) + self.fp.truncate(writing_to) + self.start_dir = writing_to + finally: + # This cleanup happens even if we were interrupted. + self.fp.seek(self.start_dir) + self.NameToInfo = dict((i.filename, i) for i in self.filelist) + self._writing = False def _write_end_record(self): - if not self.DELETE_IS_INSECURE: - self.compact(threshold=self.COMPACTING_THRESHOLD) + if not self.insecure_delete: + self.compact(threshold=self.compacting_threshold) return super()._write_end_record() From ffc101d57e1fad4773c3ed6f4d2614697378e12d Mon Sep 17 00:00:00 2001 From: "Bjarni R. Einarsson" Date: Wed, 6 Sep 2023 09:49:23 +0000 Subject: [PATCH 06/15] Rework ZipFileRW._move() to be more interrupt-safe --- pyzipper/zipfile.py | 47 ++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/pyzipper/zipfile.py b/pyzipper/zipfile.py index e46ce09..d6c2c89 100644 --- a/pyzipper/zipfile.py +++ b/pyzipper/zipfile.py @@ -2646,26 +2646,41 @@ def _move(self, zinfo, new_pos): the source and destination is only allowed if moving backwards. """ data_bytes = self._raw_data_length(zinfo) - _from = zinfo.header_offset - _to = new_pos + src = zinfo.header_offset + dst = new_pos - if _from != _to: - if _from < _to < _from + data_bytes: + if src != dst: + if src < dst < src + data_bytes: raise io.UnsupportedOperation( - 'Overlapping ranges are only safe when moving backwards') + 'Overlapping moves are only safe when moving backwards') + + # Overlapping moves risk corrupting the archive if we are + # interrupted. We at least try to handle that! + overlap = (dst < src < dst + data_bytes) + interrupted = None self._didModify = True - _bytes = data_bytes - while _bytes > 0: - self.fp.seek(_from) - data = self.fp.read(min(_bytes, 256*1024)) - self.fp.seek(_to) - self.fp.write(data) - _bytes -= len(data) - _from += len(data) - _to += len(data) - self.fp.seek(self.start_dir) - zinfo.header_offset = new_pos + chunk_size = 256 * 1024 + try: + for ofs in range(0, data_bytes, chunk_size): + while True: + try: + self.fp.seek(src + ofs) + chunk = self.fp.read(min(chunk_size, data_bytes - ofs)) + self.fp.seek(dst + ofs) + self.fp.write(chunk) + break + except KeyboardInterrupt as e: + interrupted = e + if not overlap: + # Aborting part way through is fine, old data is + # intact. Respect user wishes and abort. + raise + zinfo.header_offset = new_pos + if interrupted is not None: + raise KeyboardInterrupt(interrupted) + finally: + self.fp.seek(self.start_dir) return new_pos + data_bytes def delete(self, zinfo_or_arcname): From 24a2801030050b573b1d9c3d53ad6cfb6d7a2b3d Mon Sep 17 00:00:00 2001 From: "Bjarni R. Einarsson" Date: Wed, 6 Sep 2023 11:02:43 +0000 Subject: [PATCH 07/15] Add tests for ZipFileRW --- pyzipper/zipfile.py | 2 +- test/test_zipfile.py | 91 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 90 insertions(+), 3 deletions(-) diff --git a/pyzipper/zipfile.py b/pyzipper/zipfile.py index d6c2c89..9054909 100644 --- a/pyzipper/zipfile.py +++ b/pyzipper/zipfile.py @@ -2777,7 +2777,7 @@ def is_del(zi): self._writing = False def _write_end_record(self): - if not self.insecure_delete: + if not self.insecure_delete and self.compacting_threshold is not False: self.compact(threshold=self.compacting_threshold) return super()._write_end_record() diff --git a/test/test_zipfile.py b/test/test_zipfile.py index e1a3bae..45d7c1d 100644 --- a/test/test_zipfile.py +++ b/test/test_zipfile.py @@ -1053,13 +1053,14 @@ class LzmaTestZip64InSmallFiles(AbstractTestZip64InSmallFiles, class AbstractWriterTests: + ZIPFILE_CLS = zipfile.ZipFile def tearDown(self): unlink(TESTFN2) def test_close_after_close(self): data = b'content' - with zipfile.ZipFile(TESTFN2, "w", self.compression) as zipf: + with self.ZIPFILE_CLS(TESTFN2, "w", self.compression) as zipf: w = zipf.open('test', 'w') w.write(data) w.close() @@ -1070,7 +1071,7 @@ def test_close_after_close(self): def test_write_after_close(self): data = b'content' - with zipfile.ZipFile(TESTFN2, "w", self.compression) as zipf: + with self.ZIPFILE_CLS(TESTFN2, "w", self.compression) as zipf: w = zipf.open('test', 'w') w.write(data) w.close() @@ -1094,6 +1095,92 @@ class LzmaWriterTests(AbstractWriterTests, unittest.TestCase): compression = zipfile.ZIP_LZMA +class ZipFileRWTests(AbstractWriterTests, unittest.TestCase): + compression = zipfile.ZIP_STORED + ZIPFILE_CLS = zipfile.ZipFileRW + + def test_delete_secure(self): + junk = self.ZIPFILE_CLS.DELETED_JUNK[:10] + data = b'content' + kwargs = { + 'compression': self.compression, + 'insecure_delete': False, # Secure deletion please! + 'compacting_threshold': False} # Disable auto-compaction + + with self.ZIPFILE_CLS(TESTFN2, "w", **kwargs) as zipf: + with zipf.open('test', 'w') as w: + w.write(data) + with zipf.open('test2', 'w') as w: + w.write(data * 2) + with zipf.open('test3', 'w') as w: + w.write(data * 3) + + with self.ZIPFILE_CLS(TESTFN2, "a", **kwargs) as zipf: + self.assertEqual(zipf.read('test'), data) + zipf.delete('test') + try: + zipf.read('test') + self.assertFalse('not reached') + except KeyError: + pass # Great, deletion worked! + with zipf.open('test4', 'w') as w: + w.write(data * 4) + + with open(TESTFN2, 'rb') as raw: + self.assertTrue(junk in raw.read()) + + with self.ZIPFILE_CLS(TESTFN2, "a", **kwargs) as zipf: + try: + zipf.read('test') + self.assertFalse('not reached') + except KeyError: + pass # Great, deletion worked! + self.assertEqual(zipf.read('test2'), data * 2) + self.assertEqual(zipf.read('test3'), data * 3) + self.assertEqual(zipf.read('test4'), data * 4) + zipf.compact() + zipf.delete('test4') + + with open(TESTFN2, 'rb') as raw: + # We compacted, and then deleted the LAST file in the archive + # (which auto-truncates), so the archive should have no junk. + self.assertTrue(junk not in raw.read()) + + with self.ZIPFILE_CLS(TESTFN2, "a", **kwargs) as zipf: + self.assertEqual(zipf.read('test2'), data * 2) + self.assertEqual(zipf.read('test3'), data * 3) + + def test_delete_insecure(self): + junk = self.ZIPFILE_CLS.DELETED_JUNK[:10] + data1 = b'insecure' + data2 = b'content' + kwargs = { + 'compression': self.compression, + 'insecure_delete': True} + + with self.ZIPFILE_CLS(TESTFN2, "w", **kwargs) as zipf: + with zipf.open('test1', 'w') as w: + w.write(data1) + with zipf.open('test2', 'w') as w: + w.write(data2) + + with self.ZIPFILE_CLS(TESTFN2, "a", **kwargs) as zipf: + self.assertEqual(zipf.read('test1'), data1) + self.assertEqual(zipf.read('test2'), data2) + zipf.delete('test1') + try: + zipf.read('test1') + self.assertFalse('not reached') + except KeyError: + pass # Great, deletion worked! + zipf.compact() # Shouldn't do much of anything + + with open(TESTFN2, 'rb') as raw: + raw_data = raw.read() + self.assertTrue(data1 in raw_data) # The raw data is still there + self.assertTrue(junk not in raw_data) + + @unittest.skipIf(sys.version_info[0:2] < (3, 5), 'Requires Python >= 3.5') class PyZipFileTests(unittest.TestCase): def assertCompiledIn(self, name, namelist): From 163a90b9078541b3a7b238dc0b775ef6f8284a01 Mon Sep 17 00:00:00 2001 From: "Bjarni R. Einarsson" Date: Wed, 6 Sep 2023 11:14:06 +0000 Subject: [PATCH 08/15] Add test for opening with a bytestring filename --- test/test_zipfile.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/test_zipfile.py b/test/test_zipfile.py index 45d7c1d..0ebc66a 100644 --- a/test/test_zipfile.py +++ b/test/test_zipfile.py @@ -152,6 +152,12 @@ def test_open(self): for f in get_files(self): self.zip_open_test(f, self.compression) + def test_open_with_bytes(self): + path = bytes(TESTFN2, 'utf-8') + self.zip_open_test(path, self.compression) + with zipfile.ZipFile(path, "r", self.compression) as zipfp: + self.assertIsInstance(zipfp.filename, bytes) + def test_open_with_pathlike(self): path = pathlib.Path(TESTFN2) self.zip_open_test(path, self.compression) From ac6b9b7ad61c748f16bf85d8332a5b6f8c091fa8 Mon Sep 17 00:00:00 2001 From: "Bjarni R. Einarsson" Date: Wed, 6 Sep 2023 11:45:58 +0000 Subject: [PATCH 09/15] Add encrypt= arg to write() --- pyzipper/zipfile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyzipper/zipfile.py b/pyzipper/zipfile.py index 9054909..7869462 100644 --- a/pyzipper/zipfile.py +++ b/pyzipper/zipfile.py @@ -2191,7 +2191,7 @@ def _writecheck(self, zinfo): " would require ZIP64 extensions") def write(self, filename, arcname=None, - compress_type=None, compresslevel=None): + compress_type=None, compresslevel=None, encrypt=True): """Put the bytes from filename into the archive under the name arcname.""" if not self.fp: @@ -2236,7 +2236,7 @@ def write(self, filename, arcname=None, self.fp.write(zinfo.FileHeader(False)) self.start_dir = self.fp.tell() else: - with open(filename, "rb") as src, self.open(zinfo, 'w') as dest: + with open(filename, "rb") as src, self.open(zinfo, 'w', encrypt=encrypt) as dest: shutil.copyfileobj(src, dest, 1024*8) def writestr(self, zinfo_or_arcname, data, From d983d8ed6f8ebf70e356a4ddf78958958e02bec8 Mon Sep 17 00:00:00 2001 From: "Bjarni R. Einarsson" Date: Wed, 6 Sep 2023 11:46:31 +0000 Subject: [PATCH 10/15] Document deletion and partial encryption features --- README.rst | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index d9784d8..b208a1f 100644 --- a/README.rst +++ b/README.rst @@ -11,10 +11,14 @@ pyzipper ======== A replacement for Python's ``zipfile`` that can read and write AES encrypted -zip files. Forked from Python 3.7's ``zipfile`` module, it features the same +zip files. Secure deletion of individual files from an existing ZIP archive +is also supported. + +Forked from Python 3.7's ``zipfile`` module, it features the same ``zipfile`` API from that time (most notably, lacking support for ``pathlib``-compatible wrappers that were introduced in Python 3.8). + Installation ------------ @@ -68,11 +72,67 @@ encryption kwargs: zf.setpassword(secret_password) my_secrets = zf.read('test.txt') + +Partial Encryption +------------------ + +It is possible to create archives which contain a mixture of encrypted +and unencrypted files. This can be useful (for example) to include +clear-text documentation or recovery instructions, within an otherwise +secure archive. + +To add a clear-text file to an otherwise encrypted archive, pass +``encrypt=False`` to ``open()``, ``write()`` or ``writestr()``. + +.. code-block:: python + + import pyzipper + + secret_password = b'lost art of keeping a secret' + + with pyzipper.AESZipFile('new_test.zip', + 'w', + compression=pyzipper.ZIP_LZMA) as zf: + zf.setpassword(secret_password) + zf.setencryption(pyzipper.WZ_AES, nbits=128) + zf.writestr('test.txt', "What ever you do, don't tell anyone!") + zf.writestr('README.txt', "Secrets enclosed!", encrypt=False) + + +Deletion +-------- + +Deletion of individual files from within an existing archive is supported +via the ``ZipFile.delete(filename)`` method. + +To replace an existing file, delete it first and then add a new one with +the same name. + +Note that archives must be opened in with ``mode="a"`` (append), to allow +modifications. Deleting from archives opened for reading is not supported, +and opening with ``mode="w"`` or ``mode="x"`` will replace the entire ZIP +archive with an empty one (so there will be nothing to delete). + +The algorithm used for deletion defaults to secure behavior, where data is +immediately overwritten with junk, then removed from the central directory +index, and finally (upon close) the archive is rewritten to reclaim space. + +Secure deletion can be disabled by passing ``insecure_delete=True`` to the +``ZipFile`` or ``AESZipFie`` constructors. + +It is possible to adjust how frequently the archive is rewritten, by passing +``compacting_threshold=X`` to the constructor, where X is a float between 0 +and 1, representing how much of the archive can be "wasted space" before +triggering a compaction. Manual compaction is also available using the +``compact()`` method. + + Documentation ------------- Official Python ZipFile documentation is available here: https://docs.python.org/3/library/zipfile.html + Credits ------- From 93292d4b9bd2ad5f03921b7b567eed61788ae72f Mon Sep 17 00:00:00 2001 From: "Bjarni R. Einarsson" Date: Wed, 6 Sep 2023 13:24:25 +0000 Subject: [PATCH 11/15] Add test for partial encryption --- test/test_zipfile_aes.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/test/test_zipfile_aes.py b/test/test_zipfile_aes.py index 2240ca3..ba5140c 100644 --- a/test/test_zipfile_aes.py +++ b/test/test_zipfile_aes.py @@ -555,7 +555,7 @@ def set_pwd_if_needed(self, zipfp): if self.encryption and self.pwd: zipfp.setpassword(self.pwd) - def make_test_archive(self, f, compression): + def make_test_archive(self, f, compression, partial_enc=False): # Create the ZIP archive with zipfile_aes.AESZipFile(f, "w", compression) as zipfp: if self.encryption: @@ -570,15 +570,25 @@ def make_test_archive(self, f, compression): **encryption_kwargs ) zipfp.write(TESTFN, "another.name") - zipfp.write(TESTFN, TESTFN) + zipfp.write(TESTFN, TESTFN, encrypt=(not partial_enc)) - def zip_test(self, f, compression): - self.make_test_archive(f, compression) + def zip_test(self, f, compression, partial_enc=False): + self.make_test_archive(f, compression, partial_enc=partial_enc) # Read the ZIP archive with zipfile_aes.AESZipFile(f, "r", compression) as zipfp: - self.set_pwd_if_needed(zipfp) - testdata = zipfp.read(TESTFN) + if partial_enc: + testdata = zipfp.read(TESTFN) # Unencrypted: succeeds + try: + unreadable = zipfp.read('another.name') # Encrypted: fails + self.assertFalse('not reached') + except RuntimeError: + pass # Good! Reading encrypted data failed. + self.set_pwd_if_needed(zipfp) + else: + self.set_pwd_if_needed(zipfp) + testdata = zipfp.read(TESTFN) + self.assertEqual(len(testdata), len(self.data)) self.assertEqual(testdata, self.data) self.assertEqual(zipfp.read("another.name"), self.data) @@ -587,6 +597,10 @@ def test_read(self): for f in get_files(self): self.zip_test(f, self.compression) + def test_partial_encryption(self): + for f in get_files(self): + self.zip_test(f, self.compression, partial_enc=True) + def zip_open_test(self, f, compression): self.make_test_archive(f, compression) From 57b70c8a8388d2e067a0678d2350c6f977da73d6 Mon Sep 17 00:00:00 2001 From: "Bjarni R. Einarsson" Date: Wed, 6 Sep 2023 13:24:44 +0000 Subject: [PATCH 12/15] Copy ZipFileRW tests to test_zipfile2 --- test/test_zipfile2.py | 86 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/test/test_zipfile2.py b/test/test_zipfile2.py index 37c626a..50bf518 100644 --- a/test/test_zipfile2.py +++ b/test/test_zipfile2.py @@ -847,6 +847,92 @@ class LzmaWriterTests(AbstractWriterTests, unittest.TestCase): compression = zipfile.ZIP_LZMA +class ZipFileRWTests(AbstractWriterTests, unittest.TestCase): + compression = zipfile.ZIP_STORED + ZIPFILE_CLS = zipfile_aes.AESZipFile + + def test_delete_secure(self): + junk = self.ZIPFILE_CLS.DELETED_JUNK[:10] + data = b'content' + kwargs = { + 'compression': self.compression, + 'insecure_delete': False, # Secure deletion please! + 'compacting_threshold': False} # Disable auto-compaction + + with self.ZIPFILE_CLS(TESTFN2, "w", **kwargs) as zipf: + with zipf.open('test', 'w') as w: + w.write(data) + with zipf.open('test2', 'w') as w: + w.write(data * 2) + with zipf.open('test3', 'w') as w: + w.write(data * 3) + + with self.ZIPFILE_CLS(TESTFN2, "a", **kwargs) as zipf: + self.assertEqual(zipf.read('test'), data) + zipf.delete('test') + try: + zipf.read('test') + self.assertFalse('not reached') + except KeyError: + pass # Great, deletion worked! + with zipf.open('test4', 'w') as w: + w.write(data * 4) + + with open(TESTFN2, 'rb') as raw: + self.assertTrue(junk in raw.read()) + + with self.ZIPFILE_CLS(TESTFN2, "a", **kwargs) as zipf: + try: + zipf.read('test') + self.assertFalse('not reached') + except KeyError: + pass # Great, deletion worked! + self.assertEqual(zipf.read('test2'), data * 2) + self.assertEqual(zipf.read('test3'), data * 3) + self.assertEqual(zipf.read('test4'), data * 4) + zipf.compact() + zipf.delete('test4') + + with open(TESTFN2, 'rb') as raw: + # We compacted, and then deleted the LAST file in the archive + # (which auto-truncates), so the archive should have no junk. + self.assertTrue(junk not in raw.read()) + + with self.ZIPFILE_CLS(TESTFN2, "a", **kwargs) as zipf: + self.assertEqual(zipf.read('test2'), data * 2) + self.assertEqual(zipf.read('test3'), data * 3) + + def test_delete_insecure(self): + junk = self.ZIPFILE_CLS.DELETED_JUNK[:10] + data1 = b'insecure' + data2 = b'content' + kwargs = { + 'compression': self.compression, + 'insecure_delete': True} + + with self.ZIPFILE_CLS(TESTFN2, "w", **kwargs) as zipf: + with zipf.open('test1', 'w') as w: + w.write(data1) + with zipf.open('test2', 'w') as w: + w.write(data2) + + with self.ZIPFILE_CLS(TESTFN2, "a", **kwargs) as zipf: + self.assertEqual(zipf.read('test1'), data1) + self.assertEqual(zipf.read('test2'), data2) + zipf.delete('test1') + try: + zipf.read('test1') + self.assertFalse('not reached') + except KeyError: + pass # Great, deletion worked! + zipf.compact() # Shouldn't do much of anything + + with open(TESTFN2, 'rb') as raw: + raw_data = raw.read() + self.assertTrue(data1 in raw_data) # The raw data is still there + self.assertTrue(junk not in raw_data) + + @unittest.skipIf(sys.version_info[0:2] < (3, 5), 'Requires Python >= 3.5') class PyZipFileTests(unittest.TestCase): def assertCompiledIn(self, name, namelist): From 0b981e072a47175d28b102eadbb69ca3ad40f05c Mon Sep 17 00:00:00 2001 From: "Bjarni R. Einarsson" Date: Wed, 6 Sep 2023 13:39:33 +0000 Subject: [PATCH 13/15] Make flake8 happy with my changes --- pyzipper/zipfile.py | 9 +++++---- pyzipper/zipfile_aes.py | 1 - test/test_zipfile_aes.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pyzipper/zipfile.py b/pyzipper/zipfile.py index 7869462..f7e29e4 100644 --- a/pyzipper/zipfile.py +++ b/pyzipper/zipfile.py @@ -2008,7 +2008,9 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False, encrypt=True): if mode == 'w': return self._open_to_write(zinfo, - force_zip64=force_zip64, pwd=pwd, encrypt=encrypt) + force_zip64=force_zip64, + pwd=pwd, + encrypt=encrypt) if self._writing: raise ValueError("Can't read from the ZIP file while there " @@ -2565,14 +2567,13 @@ class ZipFileRW(ZipFile): wasting space. It will also leave "DELETED" entries the directory until compacted. """ - DELETED_JUNK = b'DELETED!' * 1024 + DELETED_JUNK = b'DELETED!' * 1024 DELETED_FN_FMT = 'DELETED/%s' # Subclasses can override these to adjust defaults, which will make # sense for apps that know what their use profile is. INSECURE_DELETE = False - COMPACTING_THRESHOLD = 0.0 # Set to a ratio of allowable wasted space, - # to reduce the I/O overhead of deletion. + COMPACTING_THRESHOLD = 0.0 def __init__(self, *args, **kwargs): """ diff --git a/pyzipper/zipfile_aes.py b/pyzipper/zipfile_aes.py index 7939d99..ca4d68a 100644 --- a/pyzipper/zipfile_aes.py +++ b/pyzipper/zipfile_aes.py @@ -12,7 +12,6 @@ ZIP_LZMA, BadZipFile, BaseZipDecrypter, - ZipFile, ZipFileRW, ZipInfo, ZipExtFile, diff --git a/test/test_zipfile_aes.py b/test/test_zipfile_aes.py index ba5140c..8df7664 100644 --- a/test/test_zipfile_aes.py +++ b/test/test_zipfile_aes.py @@ -580,7 +580,7 @@ def zip_test(self, f, compression, partial_enc=False): if partial_enc: testdata = zipfp.read(TESTFN) # Unencrypted: succeeds try: - unreadable = zipfp.read('another.name') # Encrypted: fails + zipfp.read('another.name') # Encrypted: should fail self.assertFalse('not reached') except RuntimeError: pass # Good! Reading encrypted data failed. @@ -751,7 +751,7 @@ def zip_test(self, f, compression): directory = fp.getvalue() lines = directory.splitlines() - self.assertEqual(len(lines), 4) # Number of files + header + self.assertEqual(len(lines), 4) # Number of files + header self.assertIn('File Name', lines[0]) self.assertIn('Modified', lines[0]) From 266bfafea683eda3a1e0111bae8d529a092c61c0 Mon Sep 17 00:00:00 2001 From: "Bjarni R. Einarsson" Date: Fri, 17 Jan 2025 05:07:13 +0000 Subject: [PATCH 14/15] Be more careful not to lose the directory when appending/updating --- pyzipper/zipfile.py | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/pyzipper/zipfile.py b/pyzipper/zipfile.py index f7e29e4..bd77ec7 100644 --- a/pyzipper/zipfile.py +++ b/pyzipper/zipfile.py @@ -1711,6 +1711,7 @@ def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, self.encryption_kwargs = None self._comment = b'' self._strict_timestamps = strict_timestamps + self.careful = False # Check if we were passed a file-like object # os.PathLike and os.fspath were added in python 3.6 @@ -1765,6 +1766,7 @@ def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, except (AttributeError, OSError): self._seekable = False elif mode == 'a': + self.careful = True try: # See if file is a zip file self._RealGetContents() @@ -2194,8 +2196,9 @@ def _writecheck(self, zinfo): def write(self, filename, arcname=None, compress_type=None, compresslevel=None, encrypt=True): - """Put the bytes from filename into the archive under the name - arcname.""" + """Put the bytes from filename into the archive under the name + arcname.""" + with self._lock: if not self.fp: raise ValueError( "Attempt to write to ZIP archive that was already closed") @@ -2221,7 +2224,8 @@ def write(self, filename, arcname=None, else: zinfo._compresslevel = self.compresslevel - if zinfo.is_dir(): + try: + if zinfo.is_dir(): with self._lock: if self._seekable: self.fp.seek(self.start_dir) @@ -2237,17 +2241,23 @@ def write(self, filename, arcname=None, self.NameToInfo[zinfo.filename] = zinfo self.fp.write(zinfo.FileHeader(False)) self.start_dir = self.fp.tell() - else: + else: with open(filename, "rb") as src, self.open(zinfo, 'w', encrypt=encrypt) as dest: shutil.copyfileobj(src, dest, 1024*8) + finally: + if self._seekable and self.careful: + self.fp.seek(self.start_dir) + self._write_end_record() + self.fp.seek(self.start_dir) def writestr(self, zinfo_or_arcname, data, compress_type=None, compresslevel=None, encrypt=True): - """Write a file into the archive. The contents is 'data', which - may be either a 'str' or a 'bytes' instance; if it is a 'str', - it is encoded as UTF-8 first. - 'zinfo_or_arcname' is either a ZipInfo instance or - the name of the file in the archive.""" + """Write a file into the archive. The contents is 'data', which + may be either a 'str' or a 'bytes' instance; if it is a 'str', + it is encoded as UTF-8 first. + 'zinfo_or_arcname' is either a ZipInfo instance or + the name of the file in the archive.""" + with self._lock: if isinstance(data, str): data = data.encode("utf-8") if not isinstance(zinfo_or_arcname, self.zipinfo_cls): @@ -2279,9 +2289,14 @@ def writestr(self, zinfo_or_arcname, data, zinfo._compresslevel = compresslevel zinfo.file_size = len(data) # Uncompressed size - with self._lock: + try: with self.open(zinfo, mode='w', encrypt=encrypt) as dest: dest.write(data) + finally: + if self._seekable and self.careful: + self.fp.seek(self.start_dir) + self._write_end_record() + self.fp.seek(self.start_dir) def __del__(self): """Call the "close()" method in case the user forgot.""" @@ -2301,9 +2316,12 @@ def close(self): try: if self.mode in ('w', 'x', 'a') and self._didModify: # write ending records with self._lock: + try: if self._seekable: self.fp.seek(self.start_dir) self._write_end_record() + except ValueError: + pass finally: fp = self.fp self.fp = None From 0e36f24084fb2c06564127d41c33c35cdfba2a92 Mon Sep 17 00:00:00 2001 From: "Bjarni R. Einarsson" Date: Sat, 3 May 2025 15:08:01 +0000 Subject: [PATCH 15/15] Avoid bug where very small files fail to decrypt --- pyzipper/zipfile_aes.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pyzipper/zipfile_aes.py b/pyzipper/zipfile_aes.py index ca4d68a..8a9d13a 100644 --- a/pyzipper/zipfile_aes.py +++ b/pyzipper/zipfile_aes.py @@ -192,6 +192,15 @@ def __init__(self, *args, **kwargs): self.wz_aes_vendor_id = None self.wz_aes_strength = None + def __repr__(self): + r = [super().__repr__()[:-1]] + r.extend([ + ' wz_aes_version=%s' % self.wz_aes_version, + ' wz_aes_vendor_id=%s' % self.wz_aes_vendor_id, + ' wz_aes_strength=%s' % self.wz_aes_strength, + '>']) + return ''.join(r) + def decode_extra_wz_aes(self, ln, extra): if ln == 7: counts = struct.unpack("