diff --git a/README.rst b/README.rst index d9784d8..b208a1f 100644 --- a/README.rst +++ b/README.rst @@ -11,10 +11,14 @@ pyzipper ======== A replacement for Python's ``zipfile`` that can read and write AES encrypted -zip files. Forked from Python 3.7's ``zipfile`` module, it features the same +zip files. Secure deletion of individual files from an existing ZIP archive +is also supported. + +Forked from Python 3.7's ``zipfile`` module, it features the same ``zipfile`` API from that time (most notably, lacking support for ``pathlib``-compatible wrappers that were introduced in Python 3.8). + Installation ------------ @@ -68,11 +72,67 @@ encryption kwargs: zf.setpassword(secret_password) my_secrets = zf.read('test.txt') + +Partial Encryption +------------------ + +It is possible to create archives which contain a mixture of encrypted +and unencrypted files. This can be useful (for example) to include +clear-text documentation or recovery instructions, within an otherwise +secure archive. + +To add a clear-text file to an otherwise encrypted archive, pass +``encrypt=False`` to ``open()``, ``write()`` or ``writestr()``. + +.. code-block:: python + + import pyzipper + + secret_password = b'lost art of keeping a secret' + + with pyzipper.AESZipFile('new_test.zip', + 'w', + compression=pyzipper.ZIP_LZMA) as zf: + zf.setpassword(secret_password) + zf.setencryption(pyzipper.WZ_AES, nbits=128) + zf.writestr('test.txt', "What ever you do, don't tell anyone!") + zf.writestr('README.txt', "Secrets enclosed!", encrypt=False) + + +Deletion +-------- + +Deletion of individual files from within an existing archive is supported +via the ``ZipFile.delete(filename)`` method. + +To replace an existing file, delete it first and then add a new one with +the same name. + +Note that archives must be opened in with ``mode="a"`` (append), to allow +modifications. Deleting from archives opened for reading is not supported, +and opening with ``mode="w"`` or ``mode="x"`` will replace the entire ZIP +archive with an empty one (so there will be nothing to delete). + +The algorithm used for deletion defaults to secure behavior, where data is +immediately overwritten with junk, then removed from the central directory +index, and finally (upon close) the archive is rewritten to reclaim space. + +Secure deletion can be disabled by passing ``insecure_delete=True`` to the +``ZipFile`` or ``AESZipFie`` constructors. + +It is possible to adjust how frequently the archive is rewritten, by passing +``compacting_threshold=X`` to the constructor, where X is a float between 0 +and 1, representing how much of the archive can be "wasted space" before +triggering a compaction. Manual compaction is also available using the +``compact()`` method. + + Documentation ------------- Official Python ZipFile documentation is available here: https://docs.python.org/3/library/zipfile.html + Credits ------- diff --git a/pyzipper/zipfile.py b/pyzipper/zipfile.py index ae46a65..bd77ec7 100644 --- a/pyzipper/zipfile.py +++ b/pyzipper/zipfile.py @@ -36,7 +36,7 @@ __all__ = ["BadZipFile", "BadZipfile", "error", "ZIP_STORED", "ZIP_DEFLATED", "ZIP_BZIP2", "ZIP_LZMA", - "is_zipfile", "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile"] + "is_zipfile", "ZipInfo", "ZipFile", "ZipFileRW", "PyZipFile", "LargeZipFile"] class BadZipFile(Exception): @@ -1685,6 +1685,7 @@ class ZipFile: fp = None # Set here since __del__ checks it _windows_illegal_name_trans_table = None zipinfo_cls = ZipInfo + zipinfo_cls_cleartext = ZipInfo zipextfile_cls = ZipExtFile zipwritefile_cls = _ZipWriteFile @@ -1710,6 +1711,7 @@ def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, self.encryption_kwargs = None self._comment = b'' self._strict_timestamps = strict_timestamps + self.careful = False # Check if we were passed a file-like object # os.PathLike and os.fspath were added in python 3.6 @@ -1719,7 +1721,7 @@ def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, else: if isinstance(file, pathlib.PurePath): file = str(file) - if isinstance(file, str): + if isinstance(file, (str, bytes)): # No, it's a filename self._filePassed = 0 self.filename = file @@ -1764,6 +1766,7 @@ def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, except (AttributeError, OSError): self._seekable = False elif mode == 'a': + self.careful = True try: # See if file is a zip file self._RealGetContents() @@ -1963,7 +1966,7 @@ def read(self, name, pwd=None): with self.open(name, "r", pwd) as fp: return fp.read() - def open(self, name, mode="r", pwd=None, *, force_zip64=False): + def open(self, name, mode="r", pwd=None, *, force_zip64=False, encrypt=True): """Return file-like object for 'name'. name is a string for the file name within the ZIP file, or a ZipInfo @@ -1991,11 +1994,14 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False): raise TypeError("pwd: expected bytes, got %s" % type(pwd).__name__) # Make sure we have an info object - if isinstance(name, self.zipinfo_cls): + if isinstance(name, (self.zipinfo_cls, self.zipinfo_cls_cleartext)): # 'name' is already an info object zinfo = name elif mode == 'w': - zinfo = self.zipinfo_cls(name) + if encrypt: + zinfo = self.zipinfo_cls(name) + else: + zinfo = self.zipinfo_cls_cleartext(name) zinfo.compress_type = self.compression zinfo._compresslevel = self.compresslevel else: @@ -2003,7 +2009,10 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False): zinfo = self.getinfo(name) if mode == 'w': - return self._open_to_write(zinfo, force_zip64=force_zip64, pwd=pwd) + return self._open_to_write(zinfo, + force_zip64=force_zip64, + pwd=pwd, + encrypt=encrypt) if self._writing: raise ValueError("Can't read from the ZIP file while there " @@ -2023,7 +2032,7 @@ def _open_to_read(self, mode, zinfo, pwd): zef_file.close() raise e - def _open_to_write(self, zinfo, force_zip64=False, pwd=None): + def _open_to_write(self, zinfo, force_zip64=False, pwd=None, encrypt=True): if force_zip64 and not self._allowZip64: raise ValueError( "force_zip64 is True, but allowZip64 was False when opening " @@ -2043,7 +2052,7 @@ def _open_to_write(self, zinfo, force_zip64=False, pwd=None): zinfo.flag_bits = 0x00 encrypter = None - if pwd is not None or self.encryption is not None: + if (pwd is not None or self.encryption is not None) and encrypt: zinfo.flag_bits |= _MASK_ENCRYPTED encrypter = self.get_encrypter() encrypter.update_zipinfo(zinfo) @@ -2186,9 +2195,10 @@ def _writecheck(self, zinfo): " would require ZIP64 extensions") def write(self, filename, arcname=None, - compress_type=None, compresslevel=None): - """Put the bytes from filename into the archive under the name - arcname.""" + compress_type=None, compresslevel=None, encrypt=True): + """Put the bytes from filename into the archive under the name + arcname.""" + with self._lock: if not self.fp: raise ValueError( "Attempt to write to ZIP archive that was already closed") @@ -2214,7 +2224,8 @@ def write(self, filename, arcname=None, else: zinfo._compresslevel = self.compresslevel - if zinfo.is_dir(): + try: + if zinfo.is_dir(): with self._lock: if self._seekable: self.fp.seek(self.start_dir) @@ -2230,17 +2241,23 @@ def write(self, filename, arcname=None, self.NameToInfo[zinfo.filename] = zinfo self.fp.write(zinfo.FileHeader(False)) self.start_dir = self.fp.tell() - else: - with open(filename, "rb") as src, self.open(zinfo, 'w') as dest: + else: + with open(filename, "rb") as src, self.open(zinfo, 'w', encrypt=encrypt) as dest: shutil.copyfileobj(src, dest, 1024*8) + finally: + if self._seekable and self.careful: + self.fp.seek(self.start_dir) + self._write_end_record() + self.fp.seek(self.start_dir) def writestr(self, zinfo_or_arcname, data, - compress_type=None, compresslevel=None): - """Write a file into the archive. The contents is 'data', which - may be either a 'str' or a 'bytes' instance; if it is a 'str', - it is encoded as UTF-8 first. - 'zinfo_or_arcname' is either a ZipInfo instance or - the name of the file in the archive.""" + compress_type=None, compresslevel=None, encrypt=True): + """Write a file into the archive. The contents is 'data', which + may be either a 'str' or a 'bytes' instance; if it is a 'str', + it is encoded as UTF-8 first. + 'zinfo_or_arcname' is either a ZipInfo instance or + the name of the file in the archive.""" + with self._lock: if isinstance(data, str): data = data.encode("utf-8") if not isinstance(zinfo_or_arcname, self.zipinfo_cls): @@ -2272,9 +2289,14 @@ def writestr(self, zinfo_or_arcname, data, zinfo._compresslevel = compresslevel zinfo.file_size = len(data) # Uncompressed size - with self._lock: - with self.open(zinfo, mode='w') as dest: + try: + with self.open(zinfo, mode='w', encrypt=encrypt) as dest: dest.write(data) + finally: + if self._seekable and self.careful: + self.fp.seek(self.start_dir) + self._write_end_record() + self.fp.seek(self.start_dir) def __del__(self): """Call the "close()" method in case the user forgot.""" @@ -2294,9 +2316,12 @@ def close(self): try: if self.mode in ('w', 'x', 'a') and self._didModify: # write ending records with self._lock: + try: if self._seekable: self.fp.seek(self.start_dir) self._write_end_record() + except ValueError: + pass finally: fp = self.fp self.fp = None @@ -2538,6 +2563,244 @@ def _compile(file, optimize=-1): return (fname, archivename) +class ZipFileRW(ZipFile): + """ ZipFile subclass which can delete in-place from ZIP archives. + + z = ZipFileRW(file, mode="a", + insecure_delete=False, + compacting_threshold=0.0, + ... ) + + In addition to inherited ZipFile arguments, ZipFileRW settings are: + + insecure_delete: Set to True, to leave "deleted" data in the archive + body. In this mode, deleting only removes entries + from the central directory. This is faster and makes + it theoretically possible to restore deleted files. + When True, archive compaction is disabled. + + compacting_threshold: A ratio of allowable wasted space (0.5 = 50%). + Setting to a value above the default of 0 will + reduce disk I/O somewhat, at the expense of + wasting space. It will also leave "DELETED" + entries the directory until compacted. + """ + DELETED_JUNK = b'DELETED!' * 1024 + DELETED_FN_FMT = 'DELETED/%s' + + # Subclasses can override these to adjust defaults, which will make + # sense for apps that know what their use profile is. + INSECURE_DELETE = False + COMPACTING_THRESHOLD = 0.0 + + def __init__(self, *args, **kwargs): + """ + A subclass of ZipFile + """ + self.compacting_threshold = kwargs.pop( + 'compacting_threshold', self.COMPACTING_THRESHOLD) + self.insecure_delete = kwargs.pop( + 'insecure_delete', self.INSECURE_DELETE) + self.deleted_fn_fmt = kwargs.pop( + 'deleted_fn_fmt', self.DELETED_FN_FMT) + super().__init__(*args, **kwargs) + + def _raw_data_length(self, zinfo): + """ + This calculates how many bytes of the archive are dedicated to + this file/directory. This is guaranteed to never overlap with + any other files, even if headers are corrupt or our code buggy. + """ + offsets = [zi.header_offset for zi in self.filelist] + offsets.append(self.start_dir) + offsets.sort() + for i, o in enumerate(offsets): + if o == zinfo.header_offset: + return (offsets[i+1] - o) + return 0 + + def _overwrite_deleted_data(self, zinfo): + """ + Overwrite the data in the archive with junk (unless insecure deletion + is requested). If we are lucky and this is the last entry in the + archive, truncate it. + + Returns the (ofs, length) of the overwritten segment, False if we + truncated, or None if deletion is insecure or the entry not found. + """ + junk_bytes = self._raw_data_length(zinfo) + if not junk_bytes: + return None + + self._writing = True + + if self.insecure_delete: + gap = None + else: + gap = (zinfo.header_offset, junk_bytes) + self._didModify = True + self.fp.seek(zinfo.header_offset) + while True: + if junk_bytes > len(self.DELETED_JUNK): + self.fp.write(self.DELETED_JUNK) + junk_bytes -= len(self.DELETED_JUNK) + else: + self.fp.write(self.DELETED_JUNK[:junk_bytes]) + break + self.fp.seek(self.start_dir) + + if (zinfo.header_offset + junk_bytes) == self.start_dir: + self._didModify = True + self.fp.seek(zinfo.header_offset) + self.fp.truncate(zinfo.header_offset) + self.start_dir = zinfo.header_offset + gap = False + + self._writing = False + return gap + + def _move(self, zinfo, new_pos): + """ + Move a file to a new location within the archive. Overlap between + the source and destination is only allowed if moving backwards. + """ + data_bytes = self._raw_data_length(zinfo) + src = zinfo.header_offset + dst = new_pos + + if src != dst: + if src < dst < src + data_bytes: + raise io.UnsupportedOperation( + 'Overlapping moves are only safe when moving backwards') + + # Overlapping moves risk corrupting the archive if we are + # interrupted. We at least try to handle that! + overlap = (dst < src < dst + data_bytes) + interrupted = None + + self._didModify = True + chunk_size = 256 * 1024 + try: + for ofs in range(0, data_bytes, chunk_size): + while True: + try: + self.fp.seek(src + ofs) + chunk = self.fp.read(min(chunk_size, data_bytes - ofs)) + self.fp.seek(dst + ofs) + self.fp.write(chunk) + break + except KeyboardInterrupt as e: + interrupted = e + if not overlap: + # Aborting part way through is fine, old data is + # intact. Respect user wishes and abort. + raise + zinfo.header_offset = new_pos + if interrupted is not None: + raise KeyboardInterrupt(interrupted) + finally: + self.fp.seek(self.start_dir) + return new_pos + data_bytes + + def delete(self, zinfo_or_arcname): + """ + Delete an entry from the archive. + + By default (when insecure_delete=False) the archived data will be + overwritten with garbage. Space is reclaimed upon compaction. + """ + if not self._seekable: + raise io.UnsupportedOperation('Can only delete from seekable files') + + if hasattr(zinfo_or_arcname, 'filename'): + arcname = zinfo_or_arcname.filename + else: + arcname = zinfo_or_arcname + + with self._lock: + zinfo = self.NameToInfo[arcname] # May throw KeyError + del self.NameToInfo[arcname] + gap = self._overwrite_deleted_data(zinfo) + if gap: + fn = self.deleted_fn_fmt % gap[0] + gap_zinfo = self.zipinfo_cls(fn) + gap_zinfo.header_offset = gap[0] + gap_zinfo.compress_size = 0 + gap_zinfo.file_size = 0 + gap_zinfo.CRC = 0 + self.NameToInfo[fn] = gap_zinfo + self.filelist[self.filelist.index(zinfo)] = gap_zinfo + else: + self.filelist.remove(zinfo) + self._didModify = True + + def compact(self, threshold=0.0): + """ + Reclaim unused space after one or more deletions, by rewriting + the archive in-place. DELETED/... entries are removed from the + central directory. + + WARNING: This may leave the archive in a corrupt state, if + interrupted part-way through. + """ + if not self._seekable: + return + + def is_del(zi): + return ( + (zi.compress_size == zi.CRC == zi.file_size == 0) and + (zi.filename.startswith(self.DELETED_FN_FMT % ''))) + + with self._lock: + deleted = [zi.header_offset for zi in self.filelist if is_del(zi)] + if not deleted: + return + + # Generate a list of (offset, zipinfo) tuples which is in the + # same order as the data in the archive itself. This guarantees + # our _move() ops below will always succeed. + offsets = [(zi.header_offset, zi) + for i, zi in enumerate(self.filelist)] + offsets.sort() + + # If a threshold is set, calculate how much space is being + # wasted before we proceed. If below the threshold, abort! + if threshold: + total = 0.0 + wasted = 0.0 + for i, (ofs, zi) in enumerate(offsets[:-1]): + size = offsets[i+1][0] - ofs + total += size + if ofs in deleted: + wasted += size + if (wasted / total) < threshold: + return + + try: + writing_to = 0 + self._writing = True + self._didModify = True + for ofs, zinfo in offsets: + if ofs in deleted: + # Remove! Probably about to be overwritten. + self.filelist.remove(zinfo) + else: + # Keep: Move forward in the archive, if necessary. + writing_to = self._move(zinfo, writing_to) + self.fp.truncate(writing_to) + self.start_dir = writing_to + finally: + # This cleanup happens even if we were interrupted. + self.fp.seek(self.start_dir) + self.NameToInfo = dict((i.filename, i) for i in self.filelist) + self._writing = False + + def _write_end_record(self): + if not self.insecure_delete and self.compacting_threshold is not False: + self.compact(threshold=self.compacting_threshold) + return super()._write_end_record() + + def main(args=None): import argparse diff --git a/pyzipper/zipfile_aes.py b/pyzipper/zipfile_aes.py index ddfd84a..8a9d13a 100644 --- a/pyzipper/zipfile_aes.py +++ b/pyzipper/zipfile_aes.py @@ -12,7 +12,7 @@ ZIP_LZMA, BadZipFile, BaseZipDecrypter, - ZipFile, + ZipFileRW, ZipInfo, ZipExtFile, ) @@ -192,6 +192,15 @@ def __init__(self, *args, **kwargs): self.wz_aes_vendor_id = None self.wz_aes_strength = None + def __repr__(self): + r = [super().__repr__()[:-1]] + r.extend([ + ' wz_aes_version=%s' % self.wz_aes_version, + ' wz_aes_vendor_id=%s' % self.wz_aes_vendor_id, + ' wz_aes_strength=%s' % self.wz_aes_strength, + '>']) + return ''.join(r) + def decode_extra_wz_aes(self, ln, extra): if ln == 7: counts = struct.unpack("= 3.5') class PyZipFileTests(unittest.TestCase): def assertCompiledIn(self, name, namelist): diff --git a/test/test_zipfile2.py b/test/test_zipfile2.py index 37c626a..50bf518 100644 --- a/test/test_zipfile2.py +++ b/test/test_zipfile2.py @@ -847,6 +847,92 @@ class LzmaWriterTests(AbstractWriterTests, unittest.TestCase): compression = zipfile.ZIP_LZMA +class ZipFileRWTests(AbstractWriterTests, unittest.TestCase): + compression = zipfile.ZIP_STORED + ZIPFILE_CLS = zipfile_aes.AESZipFile + + def test_delete_secure(self): + junk = self.ZIPFILE_CLS.DELETED_JUNK[:10] + data = b'content' + kwargs = { + 'compression': self.compression, + 'insecure_delete': False, # Secure deletion please! + 'compacting_threshold': False} # Disable auto-compaction + + with self.ZIPFILE_CLS(TESTFN2, "w", **kwargs) as zipf: + with zipf.open('test', 'w') as w: + w.write(data) + with zipf.open('test2', 'w') as w: + w.write(data * 2) + with zipf.open('test3', 'w') as w: + w.write(data * 3) + + with self.ZIPFILE_CLS(TESTFN2, "a", **kwargs) as zipf: + self.assertEqual(zipf.read('test'), data) + zipf.delete('test') + try: + zipf.read('test') + self.assertFalse('not reached') + except KeyError: + pass # Great, deletion worked! + with zipf.open('test4', 'w') as w: + w.write(data * 4) + + with open(TESTFN2, 'rb') as raw: + self.assertTrue(junk in raw.read()) + + with self.ZIPFILE_CLS(TESTFN2, "a", **kwargs) as zipf: + try: + zipf.read('test') + self.assertFalse('not reached') + except KeyError: + pass # Great, deletion worked! + self.assertEqual(zipf.read('test2'), data * 2) + self.assertEqual(zipf.read('test3'), data * 3) + self.assertEqual(zipf.read('test4'), data * 4) + zipf.compact() + zipf.delete('test4') + + with open(TESTFN2, 'rb') as raw: + # We compacted, and then deleted the LAST file in the archive + # (which auto-truncates), so the archive should have no junk. + self.assertTrue(junk not in raw.read()) + + with self.ZIPFILE_CLS(TESTFN2, "a", **kwargs) as zipf: + self.assertEqual(zipf.read('test2'), data * 2) + self.assertEqual(zipf.read('test3'), data * 3) + + def test_delete_insecure(self): + junk = self.ZIPFILE_CLS.DELETED_JUNK[:10] + data1 = b'insecure' + data2 = b'content' + kwargs = { + 'compression': self.compression, + 'insecure_delete': True} + + with self.ZIPFILE_CLS(TESTFN2, "w", **kwargs) as zipf: + with zipf.open('test1', 'w') as w: + w.write(data1) + with zipf.open('test2', 'w') as w: + w.write(data2) + + with self.ZIPFILE_CLS(TESTFN2, "a", **kwargs) as zipf: + self.assertEqual(zipf.read('test1'), data1) + self.assertEqual(zipf.read('test2'), data2) + zipf.delete('test1') + try: + zipf.read('test1') + self.assertFalse('not reached') + except KeyError: + pass # Great, deletion worked! + zipf.compact() # Shouldn't do much of anything + + with open(TESTFN2, 'rb') as raw: + raw_data = raw.read() + self.assertTrue(data1 in raw_data) # The raw data is still there + self.assertTrue(junk not in raw_data) + + @unittest.skipIf(sys.version_info[0:2] < (3, 5), 'Requires Python >= 3.5') class PyZipFileTests(unittest.TestCase): def assertCompiledIn(self, name, namelist): diff --git a/test/test_zipfile_aes.py b/test/test_zipfile_aes.py index 2240ca3..8df7664 100644 --- a/test/test_zipfile_aes.py +++ b/test/test_zipfile_aes.py @@ -555,7 +555,7 @@ def set_pwd_if_needed(self, zipfp): if self.encryption and self.pwd: zipfp.setpassword(self.pwd) - def make_test_archive(self, f, compression): + def make_test_archive(self, f, compression, partial_enc=False): # Create the ZIP archive with zipfile_aes.AESZipFile(f, "w", compression) as zipfp: if self.encryption: @@ -570,15 +570,25 @@ def make_test_archive(self, f, compression): **encryption_kwargs ) zipfp.write(TESTFN, "another.name") - zipfp.write(TESTFN, TESTFN) + zipfp.write(TESTFN, TESTFN, encrypt=(not partial_enc)) - def zip_test(self, f, compression): - self.make_test_archive(f, compression) + def zip_test(self, f, compression, partial_enc=False): + self.make_test_archive(f, compression, partial_enc=partial_enc) # Read the ZIP archive with zipfile_aes.AESZipFile(f, "r", compression) as zipfp: - self.set_pwd_if_needed(zipfp) - testdata = zipfp.read(TESTFN) + if partial_enc: + testdata = zipfp.read(TESTFN) # Unencrypted: succeeds + try: + zipfp.read('another.name') # Encrypted: should fail + self.assertFalse('not reached') + except RuntimeError: + pass # Good! Reading encrypted data failed. + self.set_pwd_if_needed(zipfp) + else: + self.set_pwd_if_needed(zipfp) + testdata = zipfp.read(TESTFN) + self.assertEqual(len(testdata), len(self.data)) self.assertEqual(testdata, self.data) self.assertEqual(zipfp.read("another.name"), self.data) @@ -587,6 +597,10 @@ def test_read(self): for f in get_files(self): self.zip_test(f, self.compression) + def test_partial_encryption(self): + for f in get_files(self): + self.zip_test(f, self.compression, partial_enc=True) + def zip_open_test(self, f, compression): self.make_test_archive(f, compression) @@ -737,7 +751,7 @@ def zip_test(self, f, compression): directory = fp.getvalue() lines = directory.splitlines() - self.assertEqual(len(lines), 4) # Number of files + header + self.assertEqual(len(lines), 4) # Number of files + header self.assertIn('File Name', lines[0]) self.assertIn('Modified', lines[0])