diff --git a/src/borghash/HashTable.pxd b/src/borghash/HashTable.pxd
index d00222c..0451520 100644
--- a/src/borghash/HashTable.pxd
+++ b/src/borghash/HashTable.pxd
@@ -1,19 +1,23 @@
 from libc.stdint cimport uint8_t, uint32_t
 
 cdef class HashTable:
-    cdef int ksize, vsize
+    cdef public int ksize, vsize
     cdef readonly size_t capacity, used
     cdef size_t initial_capacity, tombstones
     cdef float max_load_factor, min_load_factor, shrink_factor, grow_factor
     cdef uint32_t* table
-    cdef uint32_t kv_capacity, kv_used
+    cdef public uint32_t kv_capacity, kv_used
     cdef float kv_grow_factor
-    cdef uint8_t* keys
-    cdef uint8_t* values
+    cdef uint8_t* kv
+    cdef uint8_t* header
+    cdef int fd
+    cdef size_t mmap_size
+    cdef uint32_t kv_offset
     cdef int stats_get, stats_set, stats_del, stats_iter, stats_lookup, stats_linear
     cdef int stats_resize_table, stats_resize_kv
 
     cdef size_t _get_index(self, uint8_t* key)
     cdef int _lookup_index(self, uint8_t* key_ptr, size_t* index_ptr)
+    cpdef void update_table_only(self, bytes key, uint32_t kv_index)
     cdef void _resize_table(self, size_t new_capacity)
     cdef void _resize_kv(self, size_t new_capacity)
diff --git a/src/borghash/HashTable.pyx b/src/borghash/HashTable.pyx
index 67284e9..fd1b8ba 100644
--- a/src/borghash/HashTable.pyx
+++ b/src/borghash/HashTable.pyx
@@ -1,8 +1,8 @@
 """
 HashTable: low-level hash table mapping fully random bytes keys to bytes values.
            Key and value lengths can be chosen, but are fixed thereafter.
-           The keys and values are stored in arrays separate from the hashtable.
-           The hashtable only stores the 32-bit indices into the key/value arrays.
+           The keys and values are stored together in an array separate from the hashtable.
+           The hashtable only stores the 32-bit indices into the key/value array.
 """
 from __future__ import annotations
 from typing import BinaryIO, Iterator, Any
@@ -10,6 +10,10 @@ from typing import BinaryIO, Iterator, Any
 from libc.stdlib cimport malloc, free, realloc
 from libc.string cimport memcpy, memset, memcmp
 from libc.stdint cimport uint8_t, uint32_t
+from libc.errno cimport errno
+from posix.unistd cimport close, ftruncate, lseek, SEEK_END
+from posix.fcntl cimport open as c_open, O_RDWR, O_CREAT
+from posix.mman cimport mmap, munmap, MAP_SHARED, PROT_READ, PROT_WRITE
 
 from collections.abc import Mapping
 
@@ -47,11 +51,12 @@ cdef class HashTable:
                  key_size: int = 0, value_size: int = 0, capacity: int = MIN_CAPACITY,
                  max_load_factor: float = 0.5, min_load_factor: float = 0.10,
                  shrink_factor: float = 0.4, grow_factor: float = 2.0,
-                 kv_grow_factor: float = 1.3) -> None:
+                 kv_grow_factor: float = 1.3,
+                 path: str = None, kv_offset: int = 0) -> None:
         # the load of the ht (.table) shall be between 0.25 and 0.5, so it is fast and has few collisions.
         # it is cheap to have a low hash table load, because .table only stores uint32_t indices into the
-        # .keys and .values array.
-        # the keys/values arrays have bigger elements and are not hash tables, thus collisions and load
+        # .kv array.
+        # the .kv array has bigger elements and is not a hash table, thus collisions and load
         # factor are no concern there. the kv_grow_factor can be relatively small.
         if key_size < 4:
             raise ValueError("key_size must be specified and must be >= 4.")
@@ -59,6 +64,15 @@ cdef class HashTable:
             raise ValueError("value_size must be specified and must be > 0.")
         self.ksize = key_size
         self.vsize = value_size
+        # vvv mmap vvv
+        self.fd = -1
+        self.mmap_size = 0
+        self.kv_offset = kv_offset
+        if path:
+            self.fd = c_open(path.encode('utf-8'), O_RDWR | O_CREAT, 0o644)
+            if self.fd == -1:
+                raise OSError(errno, f"Could not open {path}")
+        # ^^^ mmap ^^^
         # vvv hash table vvv
         self.max_load_factor = max_load_factor
         self.min_load_factor = min_load_factor
@@ -71,13 +85,31 @@ cdef class HashTable:
         self.table = NULL
         self._resize_table(self.initial_capacity)
         # ^^^ hash table ^^^
-        # vvv kv arrays vvv
+        # vvv kv array vvv
         self.kv_grow_factor = kv_grow_factor
         self.kv_used = 0
-        self.keys = NULL
-        self.values = NULL
-        self._resize_kv(int(self.initial_capacity * self.max_load_factor))
-        # ^^^ kv arrays ^^^
+        self.kv = NULL
+        self.header = NULL
+        if self.fd != -1:
+            # For mmap, we determine current size and capacity from file size.
+            file_size = lseek(self.fd, 0, SEEK_END)
+            if file_size > 0:
+                self.mmap_size = file_size
+                new_kv = mmap(NULL, self.mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, self.fd, 0)
+                if new_kv == <void*> -1:
+                    raise OSError(errno, "mmap failed")
+                self.header = <uint8_t*> new_kv
+                if self.kv_offset > 0:
+                    self.kv = <uint8_t*> new_kv + self.kv_offset
+                    self.kv_capacity = <uint32_t>((self.mmap_size - self.kv_offset) // (self.ksize + self.vsize))
+                else:
+                    self.kv = NULL
+                    self.kv_capacity = 0
+            else:
+                self._resize_kv(int(self.initial_capacity * self.max_load_factor))
+        else:
+            self._resize_kv(int(self.initial_capacity * self.max_load_factor))
+        # ^^^ kv array ^^^
         # vvv stats vvv
         self.stats_get = 0
         self.stats_set = 0
@@ -92,8 +124,17 @@ cdef class HashTable:
 
     def __del__(self) -> None:
         free(self.table)
-        free(self.keys)
-        free(self.values)
+        self.table = NULL
+        if self.fd != -1:
+            if self.header != NULL:
+                munmap(self.header, self.mmap_size)
+                self.header = NULL
+                self.kv = NULL
+            close(self.fd)
+            self.fd = -1
+        else:
+            free(self.kv)
+            self.kv = NULL
 
     def clear(self) -> None:
         """Empty the HashTable and start from scratch."""
@@ -122,7 +163,7 @@ cdef class HashTable:
         self.stats_lookup += 1
         while (kv_index := self.table[index]) != FREE_BUCKET:
             self.stats_linear += 1
-            if kv_index != TOMBSTONE_BUCKET and memcmp(self.keys + kv_index * self.ksize, key_ptr, self.ksize) == 0:
+            if kv_index != TOMBSTONE_BUCKET and memcmp(self.kv + kv_index * (self.ksize + self.vsize), key_ptr, self.ksize) == 0:
                 if index_ptr:
                     index_ptr[0] = index
                 return 1  # found
@@ -142,11 +183,12 @@ cdef class HashTable:
         self.stats_set += 1
         if self._lookup_index(key_ptr, &index):
             kv_index = self.table[index]
-            memcpy(self.values + kv_index * self.vsize, value_ptr, self.vsize)
+            memcpy(self.kv + kv_index * (self.ksize + self.vsize) + self.ksize, value_ptr, self.vsize)
             return
 
         if self.kv_used >= self.kv_capacity:
-            self._resize_kv(int(self.kv_capacity * self.kv_grow_factor))
+            # "+ 1" ensures growth even for very small or 0 capacity.
+            self._resize_kv(int(self.kv_capacity * self.kv_grow_factor + 1))
         if self.kv_used >= self.kv_capacity:
             # Should never happen. See "RESERVED" constant - we allow almost 4Gi kv entries.
             # For a typical 256-bit key and a small 32-bit value that would already consume 176GiB+
@@ -154,8 +196,8 @@ cdef class HashTable:
             raise RuntimeError("KV array is full")
 
         kv_index = self.kv_used
-        memcpy(self.keys + kv_index * self.ksize, key_ptr, self.ksize)
-        memcpy(self.values + kv_index * self.vsize, value_ptr, self.vsize)
+        memcpy(self.kv + kv_index * (self.ksize + self.vsize), key_ptr, self.ksize)
+        memcpy(self.kv + kv_index * (self.ksize + self.vsize) + self.ksize, value_ptr, self.vsize)
         self.kv_used += 1
 
         self.used += 1
@@ -177,7 +219,7 @@ cdef class HashTable:
         self.stats_get += 1
         if self._lookup_index(<uint8_t*> key, &index):
             kv_index = self.table[index]
-            return self.values[kv_index * self.vsize:(kv_index + 1) * self.vsize]
+            return self.kv[kv_index * (self.ksize + self.vsize) + self.ksize : kv_index * (self.ksize + self.vsize) + self.ksize + self.vsize]
         else:
             raise KeyError("Key not found")
 
@@ -191,8 +233,7 @@ cdef class HashTable:
         self.stats_del += 1
         if self._lookup_index(key_ptr, &index):
             kv_index = self.table[index]
-            memset(self.keys + kv_index * self.ksize, 0, self.ksize)
-            memset(self.values + kv_index * self.vsize, 0, self.vsize)
+            memset(self.kv + kv_index * (self.ksize + self.vsize), 0, self.ksize + self.vsize)
             self.table[index] = TOMBSTONE_BUCKET
             self.used -= 1
             self.tombstones += 1
@@ -233,10 +274,21 @@ cdef class HashTable:
         for i in range(self.capacity):
             kv_index = self.table[i]
             if kv_index not in (FREE_BUCKET, TOMBSTONE_BUCKET):
-                key = self.keys[kv_index * self.ksize:(kv_index + 1) * self.ksize]
-                value = self.values[kv_index * self.vsize:(kv_index + 1) * self.vsize]
+                key = self.kv[kv_index * (self.ksize + self.vsize) : kv_index * (self.ksize + self.vsize) + self.ksize]
+                value = self.kv[kv_index * (self.ksize + self.vsize) + self.ksize : kv_index * (self.ksize + self.vsize) + self.ksize + self.vsize]
                 yield key, value
 
+    cpdef void update_table_only(self, bytes key, uint32_t kv_index):
+        cdef size_t index
+        self._lookup_index(<uint8_t*> key, &index)
+        # index is either a bucket containing the key (if it already existed)
+        # or it is the first free/tombstone bucket in the probe sequence.
+        if self.table[index] == FREE_BUCKET or self.table[index] == TOMBSTONE_BUCKET:
+            self.used += 1
+        self.table[index] = kv_index
+        if self.used + self.tombstones > self.capacity * self.max_load_factor:
+            self._resize_table(int(self.capacity * self.grow_factor))
+
     cdef void _resize_table(self, size_t new_capacity):
         cdef size_t i, index
         cdef uint32_t kv_index
@@ -250,7 +302,7 @@ cdef class HashTable:
         for i in range(current_capacity):
             kv_index = self.table[i]
             if kv_index not in (FREE_BUCKET, TOMBSTONE_BUCKET):
-                index = self._get_index(self.keys + kv_index * self.ksize)
+                index = self._get_index(self.kv + kv_index * (self.ksize + self.vsize))
                 while new_table[index] != FREE_BUCKET:
                     index = (index + 1) % new_capacity
                 new_table[index] = kv_index
@@ -262,12 +314,36 @@ cdef class HashTable:
     cdef void _resize_kv(self, size_t new_capacity):
         # We must never use kv indices >= RESERVED; thus, we'll never need more capacity either.
         cdef size_t capacity = min(new_capacity, <size_t> RESERVED - 1)
+        cdef size_t new_mmap_size
+        cdef void* new_ptr
         self.stats_resize_kv += 1
-        # realloc is already highly optimized (in Linux). By using mremap internally only the peak address space usage is "old size" + "new size", while the peak memory usage is only "new size".
-        self.keys = <uint8_t*> realloc(self.keys, capacity * self.ksize * sizeof(uint8_t))
-        self.values = <uint8_t*> realloc(self.values, capacity * self.vsize * sizeof(uint8_t))
+        if self.fd != -1:
+            new_mmap_size = self.kv_offset + capacity * (self.ksize + self.vsize) * sizeof(uint8_t)
+            if self.header != NULL:
+                # Don't shrink automatically during resize if we already have space.
+                # This prevents truncating an existing file's data when it's opened
+                # with a smaller initial_capacity than the file already contains.
+                # HOWEVER, we MUST shrink if capacity < self.kv_capacity (e.g. shrink_to_fit).
+                if new_mmap_size <= self.mmap_size and capacity >= self.kv_capacity:
+                    return
+                munmap(self.header, self.mmap_size)
+            if ftruncate(self.fd, new_mmap_size) == -1:
+                raise OSError(errno, "ftruncate failed")
+            new_ptr = mmap(NULL, new_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, self.fd, 0)
+            if new_ptr == <void*> -1:
+                raise OSError(errno, "mmap failed")
+            self.header = <uint8_t*> new_ptr
+            self.kv = <uint8_t*> new_ptr + self.kv_offset
+            self.mmap_size = new_mmap_size
+        else:
+            # realloc is already highly optimized (in Linux). By using mremap internally, only the peak address space usage is "old size" + "new size", while the peak memory usage is only "new size".
+            self.kv = <uint8_t*> realloc(self.kv, capacity * (self.ksize + self.vsize) * sizeof(uint8_t))
         self.kv_capacity = <uint32_t> capacity
 
+    def shrink_to_fit(self) -> None:
+        """Shrink the KV array and the file to the actually used size."""
+        self._resize_kv(self.kv_used)
+
     def k_to_idx(self, key: bytes) -> int:
         """
         Return the key's index in the keys array (index is stable while in memory).
@@ -283,15 +359,17 @@ cdef class HashTable:
 
     def idx_to_k(self, idx: int) -> bytes:
         """
-        For a given index, return the key stored at that index in the keys array.
+        For a given index, return the key stored at that index in the kv array.
         This is the reverse of k_to_idx (e.g., 32-bit index -> 256-bit key).
         """
         cdef uint32_t kv_index = <uint32_t> idx
-        return self.keys[kv_index * self.ksize:(kv_index + 1) * self.ksize]
+        if kv_index >= self.kv_used:
+             raise KeyError(f"Index {kv_index} out of range (kv_used={self.kv_used})")
+        return self.kv[kv_index * (self.ksize + self.vsize) : kv_index * (self.ksize + self.vsize) + self.ksize]
 
     def kv_to_idx(self, key: bytes, value: bytes) -> int:
         """
-        Return the key's/value's index in the keys/values array (index is stable while in memory).
+        Return the key's/value's index in the kv array (index is stable while in memory).
         This can be used to "abbreviate" a known key/value pair (e.g., 256-bit key + 32-bit value -> 32-bit index).
         """
         if len(key) != self.ksize:
@@ -302,19 +380,19 @@ cdef class HashTable:
         cdef uint32_t kv_index
         if self._lookup_index(<uint8_t*> key, &index):
             kv_index = self.table[index]
-            value_found = self.values[kv_index * self.vsize:(kv_index + 1) * self.vsize]
+            value_found = self.kv[kv_index * (self.ksize + self.vsize) + self.ksize : kv_index * (self.ksize + self.vsize) + self.ksize + self.vsize]
             if value == value_found:
                 return kv_index
         raise KeyError("Key/Value not found")
 
     def idx_to_kv(self, idx: int) -> tuple[bytes, bytes]:
         """
-        For a given index, return the key/value stored at that index in the keys/values array.
+        For a given index, return the key/value stored at that index in the kv array.
         This is the reverse of kv_to_idx (e.g., 32-bit index -> 256-bit key + 32-bit value).
         """
         cdef uint32_t kv_index = <uint32_t> idx
-        key = self.keys[kv_index * self.ksize:(kv_index + 1) * self.ksize]
-        value = self.values[kv_index * self.vsize:(kv_index + 1) * self.vsize]
+        key = self.kv[kv_index * (self.ksize + self.vsize) : kv_index * (self.ksize + self.vsize) + self.ksize]
+        value = self.kv[kv_index * (self.ksize + self.vsize) + self.ksize : kv_index * (self.ksize + self.vsize) + self.ksize + self.vsize]
         return key, value
 
     @property
diff --git a/src/borghash/HashTableNT.pxd b/src/borghash/HashTableNT.pxd
index d3b15f7..fdc2a2f 100644
--- a/src/borghash/HashTableNT.pxd
+++ b/src/borghash/HashTableNT.pxd
@@ -1,8 +1,10 @@
+from .HashTable cimport HashTable
+
 cdef class HashTableNT:
-    cdef int key_size
-    cdef object byte_order
-    cdef object value_type
-    cdef object value_format
-    cdef object value_struct
-    cdef int value_size
-    cdef object inner
+    cdef public int key_size
+    cdef public object byte_order
+    cdef public object value_type
+    cdef public object value_format
+    cdef public object value_struct
+    cdef public int value_size
+    cdef public HashTable inner
diff --git a/src/borghash/HashTableNT.pyx b/src/borghash/HashTableNT.pyx
index 02b0e92..2fc50d9 100644
--- a/src/borghash/HashTableNT.pyx
+++ b/src/borghash/HashTableNT.pyx
@@ -11,11 +11,16 @@ import json
 import struct
 
 from .HashTable import HashTable, MIN_CAPACITY, _fill
+from libc.stdint cimport uint8_t
+from posix.types cimport off_t
+from posix.unistd cimport lseek, SEEK_SET, SEEK_CUR, write as c_write
+from libc.string cimport memcpy, memset
 
 MAGIC = b"BORGHASH"
 assert len(MAGIC) == 8
 VERSION = 1  # version of the on-disk (serialized) format produced by .write().
-HEADER_FMT = "<8sII"  # magic, version, meta length
+HEADER_FMT = "<8sIIIII"  # magic, version, meta length, capacity, used, kv_used
+ALIGNMENT = 64  # usual length of cache line
 
 BYTE_ORDER = dict(big=">", little="<", network="!", native="=")  # struct format chars
 
@@ -23,8 +28,10 @@ _NoDefault = object()
 
 cdef class HashTableNT:
     def __init__(self, items=None, *,
-                 key_size: int, value_type: Any, value_format: Any,
-                 capacity: int = MIN_CAPACITY, byte_order="little") -> None:
+                 int key_size=0, value_type=None, value_format=None,
+                 int capacity = MIN_CAPACITY, str byte_order="little",
+                 str path = None, int kv_offset = 4096) -> None:
+        cdef int valid_count
         if not isinstance(key_size, int) or not key_size >= 4:
             raise ValueError("key_size must be an integer and >= 4.")
         if type(value_type) is not type:
@@ -43,7 +50,36 @@ cdef class HashTableNT:
         self.byte_order = byte_order
         self.value_struct = struct.Struct(BYTE_ORDER[byte_order] + "".join(value_format))
         self.value_size = self.value_struct.size
-        self.inner = HashTable(key_size=self.key_size, value_size=self.value_size, capacity=capacity)
+        self.inner = HashTable(key_size=self.key_size, value_size=self.value_size, capacity=capacity,
+                               path=path, kv_offset=kv_offset)
+        if path:
+            # Check if file already exists and has a header
+            header_size = struct.calcsize(HEADER_FMT)
+            if self.inner.mmap_size >= header_size:
+                header_bytes = (<char*>self.inner.header)[:header_size]
+                magic, version, meta_size, capacity, used, kv_used = struct.unpack(HEADER_FMT, header_bytes)
+                if magic == MAGIC:
+                    # If the file already has a header, it means it's an existing table.
+                    # We MUST use the capacity from the header to correctly restore the hash table.
+                    # Also, reset used to 0 as update_table_only will increment it.
+                    self.inner.kv_used = kv_used
+                    (<HashTable>self.inner).used = 0
+                    if capacity != self.inner.capacity:
+                         self.inner._resize_table(capacity)
+
+                    # This is an existing file with a header, let's restore the state
+                    # Populate hash table
+                    valid_count = 0
+                    for i in range(kv_used):
+                        key = self.inner.idx_to_k(i)
+                        # A deleted slot MUST have a zero key. idx_to_k returns the raw bytes.
+                        if any(key):
+                            self.inner.update_table_only(key, i)
+                            valid_count += 1
+
+                    # After populating, self.inner.used should match 'used' from header
+                    assert self.inner.used == used
+                    assert valid_count == used
         _fill(self, items)
 
     def clear(self) -> None:
@@ -123,7 +159,7 @@ cdef class HashTableNT:
         else:
             return self._to_namedtuple_value(binary_value)
 
-    def update(self, other=(), /, **kwds):
+    def update(self, other=(), **kwds):
         """Like dict.update(), but 'other' can also be a HashTableNT instance."""
         if isinstance(other, HashTableNT):
             for key, value in other.items():
@@ -175,19 +211,29 @@ cdef class HashTableNT:
             'value_format_name': self.value_format.__class__.__name__,
             'value_format_fields': self.value_format._fields,
             'value_format': self.value_format,
-            'capacity': self.inner.capacity,
-            'used': self.inner.used,  # count of keys / values
         }
         meta_bytes = json.dumps(meta).encode("utf-8")
-        meta_size = len(meta_bytes)
-        header_bytes = struct.pack(HEADER_FMT, MAGIC, VERSION, meta_size)
+        header_size = struct.calcsize(HEADER_FMT)
+        # Calculate kv_offset based on current meta_bytes length, then meta_size is everything in between.
+        kv_offset = (header_size + len(meta_bytes) + ALIGNMENT - 1) // ALIGNMENT * ALIGNMENT
+        meta_size = kv_offset - header_size
+
+        header_bytes = struct.pack(HEADER_FMT, MAGIC, VERSION, meta_size,
+                                   self.inner.capacity, self.inner.used, self.inner.used)
         fd.write(header_bytes)
         fd.write(meta_bytes)
+        # Pad with zeros until kv_offset
+        fd.write(b'\x00' * (meta_size - len(meta_bytes)))
+
         count = 0
-        for key, value in self.inner.items():
-            fd.write(key)
-            fd.write(value)
-            count += 1
+        for i in range(self.inner.kv_used):
+             try:
+                 key, value = self.inner.idx_to_kv(i)
+             except KeyError:
+                 continue
+             fd.write(key)
+             fd.write(value)
+             count += 1
         assert count == self.inner.used
 
     @classmethod
@@ -203,35 +249,124 @@ cdef class HashTableNT:
         header_size = struct.calcsize(HEADER_FMT)
         header_bytes = fd.read(header_size)
         if len(header_bytes) < header_size:
-            raise ValueError(f"Invalid file, file is too short.")
-        magic, version, meta_size = struct.unpack(HEADER_FMT, header_bytes)
+            raise ValueError("Invalid file, file is too short.")
+        magic, version, meta_size, capacity, used, kv_used = struct.unpack(HEADER_FMT, header_bytes)
         if magic != MAGIC:
-            raise ValueError(f"Invalid file, magic {MAGIC.decode()} not found.")
+            # Try old header format? No, we broke compatibility on purpose.
+            raise ValueError("Invalid file, magic %s not found." % MAGIC.decode())
         if version != VERSION:
-            raise ValueError(f"Unsupported file version {version}.")
+            raise ValueError("Unsupported file version %d." % version)
         meta_bytes = fd.read(meta_size)
         if len(meta_bytes) < meta_size:
-            raise ValueError(f"Invalid file, file is too short.")
-        meta = json.loads(meta_bytes.decode("utf-8"))
+            raise ValueError("Invalid file, file is too short.")
+        meta = json.loads(meta_bytes.decode("utf-8").rstrip('\x00'))
         value_type = namedtuple(meta['value_type_name'], meta['value_type_fields'])
         value_format_t = namedtuple(meta['value_format_name'], meta['value_format_fields'])
         value_format = value_format_t(*meta['value_format'])
         ht = cls(key_size=meta['key_size'], value_format=value_format, value_type=value_type,
-                 capacity=meta['capacity'], byte_order=meta['byte_order'])
-        count = 0
+                 capacity=capacity, byte_order=meta['byte_order'])
         ksize, vsize = meta['key_size'], meta['value_size']
-        for i in range(meta['used']):
+        for i in range(kv_used):
             key = fd.read(ksize)
             value = fd.read(vsize)
-            ht._set_raw(key, value)
+            # A zero key means it's a deleted slot or uninitialized
+            if any(key):
+                 ht._set_raw(key, value)
         return ht
 
+    @classmethod
+    def open_mmap(cls, path: str, value_type: Any = None, value_format: Any = None):
+        """Open an existing borghash file in mmap mode."""
+        # We can just use the constructor, it already handles existing files with headers
+        # but we need to extract meta to get value_type/format if not provided.
+        # This is a bit redundant but stays compatible with the signature.
+        with open(path, 'rb') as fd:
+            header_size = struct.calcsize(HEADER_FMT)
+            header_bytes = fd.read(header_size)
+            if len(header_bytes) < header_size:
+                raise ValueError("Invalid file, file is too short.")
+            magic, version, meta_size, capacity, used, kv_used = struct.unpack(HEADER_FMT, header_bytes)
+            if magic != MAGIC:
+                raise ValueError("Invalid file, magic %s not found." % MAGIC.decode())
+            if version != VERSION:
+                raise ValueError("Unsupported file version %d." % version)
+            meta_bytes = fd.read(meta_size)
+            if len(meta_bytes) < meta_size:
+                raise ValueError("Invalid file, file is too short.")
+            meta = json.loads(meta_bytes.decode("utf-8").rstrip('\x00'))
+            kv_offset = header_size + meta_size
+
+        if value_type is None:
+            value_type = namedtuple(meta['value_type_name'], meta['value_type_fields'])
+        if value_format is None:
+            value_format_t = namedtuple(meta['value_format_name'], meta['value_format_fields'])
+            value_format = value_format_t(*meta['value_format'])
+
+        return cls(key_size=meta['key_size'], value_format=value_format, value_type=value_type,
+                   capacity=capacity, byte_order=meta['byte_order'],
+                   path=path, kv_offset=kv_offset)
+
     def size(self) -> int:
         """
         Do a rough worst-case estimate of the on-disk size when using .write().
 
         The serialized size of the metadata is a bit hard to predict, but we cover that with one_time_overheads.
         """
-        one_time_overheads = 4096  # very rough
+        one_time_overheads = 4096  # header + meta + alignment padding
         N = self.inner.used
         return int(N * (self.key_size + self.value_size) + one_time_overheads)
+
+    def write_header(self):
+        """Write/update the file header and metadata. Required for mmapped files."""
+        if self.inner.fd == -1:
+             raise RuntimeError("Not a memory-mapped HashTableNT (no path/fd).")
+
+        # Always shrink to fit before writing header
+        # BUT only if we are in mmap mode and it was a resize that led to over-capacity.
+        self.inner.shrink_to_fit()
+
+        header_size = struct.calcsize(HEADER_FMT)
+
+        # If the file is new or was just created, it might not even have space for the meta.
+        # We should ensure the metadata is also written if it's not already there.
+        # For a brand new file, kv_offset might be the initial 4096.
+
+        meta = {
+            'key_size': self.key_size,
+            'value_size': self.value_size,
+            'byte_order': self.byte_order,
+            'value_type_name': self.value_type.__name__,
+            'value_type_fields': self.value_type._fields,
+            'value_format_name': self.value_format.__class__.__name__,
+            'value_format_fields': self.value_format._fields,
+            'value_format': self.value_format,
+        }
+        meta_bytes = json.dumps(meta).encode("utf-8")
+        kv_offset = (header_size + len(meta_bytes) + ALIGNMENT - 1) // ALIGNMENT * ALIGNMENT
+        if kv_offset > self.inner.kv_offset:
+             # This can happen if user didn't provide enough kv_offset initially for the metadata.
+             # In __init__, default kv_offset is 4096 which is usually plenty.
+             raise RuntimeError(f"Metadata too large ({len(meta_bytes)} bytes) for current kv_offset ({self.inner.kv_offset}).")
+
+        # We always use the self.inner.kv_offset that was established.
+        meta_size = self.inner.kv_offset - header_size
+        header_bytes = struct.pack(HEADER_FMT, MAGIC, VERSION, meta_size,
+                                   self.inner.capacity, self.inner.used, self.inner.kv_used)
+
+        # Ensure mmap is large enough to hold at least the header and meta
+        if self.inner.mmap_size < self.inner.kv_offset:
+             self.inner._resize_kv(0) # this will grow it to at least kv_offset
+
+        cdef HashTable inner = <HashTable>self.inner
+        cdef uint8_t* header = inner.header
+        cdef char* header_ptr = <char*>header_bytes
+        cdef char* meta_ptr = <char*>meta_bytes
+        cdef size_t h_size = header_size
+        # Update the header in the mmapped memory
+        memcpy(header, header_ptr, h_size)
+        # Also write/update the meta bytes if they are not there or to be sure.
+        memcpy(header + h_size, meta_ptr, len(meta_bytes))
+        # Zero out padding between meta and KV array
+        padding = meta_size - len(meta_bytes)
+        if padding > 0:
+             memset(header + h_size + len(meta_bytes), 0, padding)
diff --git a/tests/benchmark_test.py b/tests/benchmark_test.py
index 24e68bb..c3a3c6e 100644
--- a/tests/benchmark_test.py
+++ b/tests/benchmark_test.py
@@ -9,6 +9,9 @@
 from borghash import HashTable, HashTableNT
 from .hashtable_test import H2
 
+import tempfile
+import os
+
 VALUE_TYPE = namedtuple("value_type", "value")
 VALUE_FMT_TYPE = namedtuple("value_format", "value")
 VALUE_FMT = VALUE_FMT_TYPE("I")
@@ -41,7 +44,22 @@ def pd():  # Python dict
     return dict()
 
 
-TEST_PARAMS = [(bh, False), (bhnt, True), (pd, False), (pd, True)]
+def bhmmap():  # BorgHash HashTableNT with mmap
+    fd, path = tempfile.mkstemp(suffix=".borghash")
+    os.close(fd)
+    
+    class MappedHashTableNT(HashTableNT):
+        def __del__(self):
+            try:
+                os.remove(path)
+            except OSError:
+                pass
+
+    ht = MappedHashTableNT(key_size=KEY_SIZE, value_type=VALUE_TYPE, value_format=VALUE_FMT, path=path)
+    return ht
+
+
+TEST_PARAMS = [(bh, False), (bhnt, True), (pd, False), (pd, True), (bhmmap, True)]
 
 
 def setup(ht_class, items, fill=False, nt=False):
diff --git a/tests/hashtablent_mmap_test.py b/tests/hashtablent_mmap_test.py
new file mode 100644
index 0000000..5112ba4
--- /dev/null
+++ b/tests/hashtablent_mmap_test.py
@@ -0,0 +1,155 @@
+from collections import namedtuple
+import os
+
+import pytest
+
+from borghash import HashTableNT
+from .hashtable_test import H2
+
+key_size = 32
+value_type = namedtuple("vt", "v1 v2 v3")
+value_format_t = namedtuple("vf", "v1 v2 v3")
+value_format = value_format_t(v1="I", v2="I", v3="I")
+
+key1, value1 = b"a" * 32, value_type(11, 12, 13)
+key2, value2 = b"b" * 32, value_type(21, 22, 23)
+key3, value3 = b"c" * 32, value_type(31, 32, 33)
+
+
+def test_mmap_open_existing(tmp_path):
+    path = str(tmp_path / "test.borghash")
+    # Create and write a file
+    ht = HashTableNT(key_size=key_size, value_type=value_type, value_format=value_format)
+    ht[key1] = value1
+    ht[key2] = value2
+    ht.write(path)
+
+    # Open in mmap mode
+    ht_mmap = HashTableNT.open_mmap(path)
+    assert len(ht_mmap) == 2
+    assert ht_mmap[key1] == value1
+    assert ht_mmap[key2] == value2
+
+
+def test_mmap_persistence(tmp_path):
+    path = str(tmp_path / "test_persistence.borghash")
+    ht = HashTableNT(key_size=key_size, value_type=value_type, value_format=value_format)
+    ht[key1] = value1
+    ht.write(path)
+
+    # Open mmap, modify, and close
+    ht_mmap = HashTableNT.open_mmap(path)
+    ht_mmap[key2] = value2
+    del ht_mmap[key1]
+    # Update header/metadata in the file
+    ht_mmap.write_header()
+
+    # Re-open normally to verify
+    ht_read = HashTableNT.read(path)
+    assert key1 not in ht_read
+    assert ht_read[key2] == value2
+
+
+def test_mmap_resize(tmp_path):
+    path = str(tmp_path / "test_resize.borghash")
+    # Small initial capacity to trigger resize early
+    ht = HashTableNT(key_size=key_size, value_type=value_type, value_format=value_format, capacity=100)
+    ht[key1] = value1
+    ht.write(path)
+
+    ht_mmap = HashTableNT.open_mmap(path)
+    # Add many items to trigger KV and table resize
+    for i in range(200):
+        key = H2(i)
+        ht_mmap[key] = value_type(i, i+1, i+2)
+
+    ht_mmap.write_header()  # update used count in metadata
+
+    assert len(ht_mmap) == 201
+    assert ht_mmap[key1] == value1
+
+    # Close and reopen to ensure resized file is valid
+    ht_reopened = HashTableNT.open_mmap(path)
+    assert len(ht_reopened) == 201
+    assert ht_reopened[key1] == value1
+    for i in range(200):
+        key = H2(i)
+        assert ht_reopened[key] == value_type(i, i+1, i+2)
+
+
+def test_mmap_shrink_to_fit(tmp_path):
+    path = str(tmp_path / "test_shrink.borghash")
+    # Small initial_capacity so it grows
+    ht = HashTableNT(key_size=key_size, value_type=value_type, value_format=value_format, capacity=100)
+    for i in range(100):
+        ht[H2(i)] = value_type(i, 0, 0)
+    ht.write(path)
+
+    ht_mmap = HashTableNT.open_mmap(path)
+    # Add enough items to trigger KV growth
+    for i in range(100, 1000):
+         ht_mmap[H2(i)] = value_type(i, 0, 0)
+
+    # After 1000 items, kv_capacity > 1000 (due to kv_grow_factor)
+    assert ht_mmap.inner.kv_capacity > 1000
+    initial_size = os.path.getsize(path)
+
+    # write_header should now automatically call shrink_to_fit
+    ht_mmap.write_header()
+
+    shrunk_size = os.path.getsize(path)
+    assert shrunk_size < initial_size
+    assert len(ht_mmap) == 1000
+
+
+def test_mmap_new_file(tmp_path):
+    # Testing using HashTableNT directly with a path to create a NEW mmapped file
+    path = str(tmp_path / "new_mmap.borghash")
+    ht = HashTableNT(key_size=key_size, value_type=value_type, value_format=value_format, path=path)
+    ht[key1] = value1
+    ht.write_header()  # Initialize header for new file
+    assert os.path.exists(path)
+
+    # Check if it persists without explicit write()
+    ht2 = HashTableNT.open_mmap(path)
+    assert ht2[key1] == value1
+
+
+def test_mmap_corrupt_magic(tmp_path):
+    path = tmp_path / "corrupt.borghash"
+    path.write_bytes(b"NOTBORG" + b"\x00" * 100)
+    with pytest.raises(ValueError, match="magic BORGHASH not found"):
+        HashTableNT.open_mmap(str(path))
+
+
+def test_mmap_delete_persistence(tmp_path):
+    path = str(tmp_path / "delete_test.borghash")
+    # use a mmapped HashTableNT
+    ht = HashTableNT(key_size=key_size, value_type=value_type, value_format=value_format, path=path)
+
+    # add 100 entries
+    for i in range(100):
+        ht[H2(i)] = value_type(i, i, i)
+    assert len(ht) == 100
+
+    # delete 50 entries
+    for i in range(25, 75):
+        del ht[H2(i)]
+    assert len(ht) == 50
+
+    # flush ht to disk
+    ht.write_header()
+    del ht
+
+    # open that ht again via mmap and check if the ht size is 50
+    ht2 = HashTableNT(key_size=key_size, value_type=value_type, value_format=value_format, path=path)
+    assert len(ht2) == 50
+
+    # verification of entries
+    # 0..24 were NOT deleted, 25..74 WERE deleted, 75..99 were NOT deleted
+    for i in range(25):
+        assert ht2[H2(i)] == value_type(i, i, i)
+    for i in range(25, 75):
+        assert H2(i) not in ht2
+    for i in range(75, 100):
+        assert ht2[H2(i)] == value_type(i, i, i)
diff --git a/tests/hashtablent_test.py b/tests/hashtablent_test.py
index cad2498..3a578ac 100644
--- a/tests/hashtablent_test.py
+++ b/tests/hashtablent_test.py
@@ -184,7 +184,7 @@ def test_size(ntht, n):
         ntht.write(f)
         real_size = f.tell()
     # Is our estimate good enough?
-    assert estimated_size * 0.9 < real_size < estimated_size * 1.0
+    assert estimated_size * 0.9 < real_size <= estimated_size * 1.0
 
 
 def test_demo():