diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6151054..ad05812 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -3,11 +3,36 @@ project(cubes CXX) # default to release build because speed maters. if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release") + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "CMAKE_BUILD_TYPE: Release, Debug or RelWithDebInfo" FORCE) endif() +if(NOT BUILD_CUBES_MAX_N) + set(BUILD_CUBES_MAX_N 20 CACHE STRING "Limit of maximum N Polycubes to be computed") +endif() + +if(NOT CUBES_PACK_CUBE_XYZ_ADDR) + set(CUBES_PACK_CUBE_XYZ_ADDR ON CACHE BOOL "Pack Cube struct XYZ memory address into 56-bit field.") +endif() + +# Try extract current HEAD commit-id in git +find_package(Git) +if(GIT_FOUND) + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-list -n1 HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE RESULT + OUTPUT_VARIABLE CONFIG_GIT_VERSION) + message(STATUS "Set ${CONFIG_GIT_VERSION} to build version info") +endif() + +# generate config.hpp header in build directory. +set(CONFIG_IS_READONLY "Warning: this file is overwritten during build. 
Do not edit.") +configure_file("config.hpp.in" "config.hpp") + include_directories("include") include_directories("libraries") +include_directories("${PROJECT_BINARY_DIR}") macro(ConfigureTarget Target) # Enable C++17 @@ -38,18 +63,22 @@ macro(ConfigureTarget Target) ) endmacro() +add_library(mapped_file STATIC "libraries/mapped_file.cpp") +ConfigureTarget(mapped_file) + # Source files add_library(CubeObjs OBJECT "src/cubes.cpp" - "src/cache.cpp" "src/rotations.cpp" "src/newCache.cpp" + "src/cubeSwapSet.cpp" ) ConfigureTarget(CubeObjs) # Build main program add_executable(${PROJECT_NAME} "program.cpp" $) target_link_libraries(${PROJECT_NAME} pthread) +target_link_libraries(${PROJECT_NAME} mapped_file) ConfigureTarget(${PROJECT_NAME}) # Optionally build tests diff --git a/cpp/Readme.md b/cpp/Readme.md index ce7de8c..7e34fba 100644 --- a/cpp/Readme.md +++ b/cpp/Readme.md @@ -1,27 +1,71 @@ # C++ implementation of opencubes - uses list representation of coordinates with ones - hashfunction for coordinate is simple concatination of bytes -- can split problem into threads, but performance can be improoved +- can split problem into threads, but performance can be improved ## usage: ```bash ./cubes -n N ``` -options: +### options: ``` +-n --cube_size +the size of polycube to generate up to +This parameter is required. + -t --threads the number of threads to use while generating This parameter is optional. The default value is '1'. -c --use_cache -whether to load cache files +whether to load cache files. +The last N-1 run must have used -w parameter and that process +must have completed without errors. The cache file +must be present under the cache folder. (-f parameter) This parameter is optional. The default value is '0'. -w --write_cache -wheather to save cache files +whether to save cache files +This parameter is optional. The default value is '0'. + +-s --split_cache +whether to save separated cache files per output shape. +requires -w parameter to take affect. 
+No combined cache file is saved when -s is present. This parameter is optional. The default value is '0'. + +-u --use_split_cache +whether to load separated cache files per output shape. +The last N-1 run must have used -s parameter and that process +must have completed without errors. The split cache file(s) +must be present under the cache folder. (-f parameter) +This parameter is optional. The default value is '0'. + +-f --cache_file_folder +where to store cache files. +This parameter is optional. The default value is './cache/'. ``` +### split cache usage: +Starting with N=9 and beyond it makes sense to use the disk cache system. +To generate starting cache run: +```bash +./cubes -n 9 -w -s +``` + +Above saves the results into the cache folder (specified with -f parameter) +as split cache files. Next N=10 run can continue processing from where the last N=9 process stopped: +```bash +./cubes -n 10 -w -s -u +``` +The split cache file mode attempts to minimize memory usage. +All following runs can use above command by incrementing the N by one each time. + +If required you can merge the split cache files +back into single file at last run by dropping the `-s` parameter. +Merging the split cache this way however uses vastly more memory. 
+(Tool should be developed to export/merge the split cache files as standard cube format file) + ## building (cmake) To build a release version (with optimisations , default) ```bash diff --git a/cpp/config.hpp.in b/cpp/config.hpp.in new file mode 100644 index 0000000..695addc --- /dev/null +++ b/cpp/config.hpp.in @@ -0,0 +1,18 @@ +#pragma once +#ifndef OPENCUBES_CONFIG_HPP +#define OPENCUBES_CONFIG_HPP + +// @CONFIG_IS_READONLY@ + +// Version info embedded into the build +#define CONFIG_VERSION "@CONFIG_GIT_VERSION@" +#define CONFIG_BUILDTYPE "@CMAKE_BUILD_TYPE@" +#define CONFIG_COMPILERID "@CMAKE_CXX_COMPILER_ID@ @CMAKE_CXX_COMPILER_VERSION@" + +// Enable Cube struct pointer compaction +#cmakedefine01 CUBES_PACK_CUBE_XYZ_ADDR + +// Maximum Polycubes N that may be computed +#define CUBES_MAX_N @BUILD_CUBES_MAX_N@ + +#endif diff --git a/cpp/include/cache.hpp b/cpp/include/cache.hpp deleted file mode 100644 index 6c3480d..0000000 --- a/cpp/include/cache.hpp +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once -#ifndef OPENCUBES_CACHE_HPP -#define OPENCUBES_CACHE_HPP -#include - -#include "hashes.hpp" -#include "utils.hpp" - -struct Cache { - static constexpr uint32_t MAGIC = 0x42554350; - static constexpr uint32_t XYZ_SIZE = 3; - static constexpr uint32_t ALL_SHAPES = -1; - struct Header { - uint32_t magic = MAGIC; // shoud be "PCUB" = 0x42554350 - uint32_t n; // we will never need 32bit but it is nicely aligned - uint32_t numShapes; // defines length of the shapeTable - uint64_t numPolycubes; // total number of polycubes - }; - struct ShapeEntry { - uint8_t dim0; // offset by -1 - uint8_t dim1; // offset by -1 - uint8_t dim2; // offset by -1 - uint8_t reserved; // for alignment - uint64_t offset; // from beginning of file - uint64_t size; // in bytes should be multiple of XYZ_SIZE - }; - - static void save(std::string path, Hashy& hashes, uint8_t n); - static Hashy load(std::string path, uint32_t extractShape = ALL_SHAPES); - - int filedesc; - void* mmap_ptr; -}; - -#endif 
diff --git a/cpp/include/cube.hpp b/cpp/include/cube.hpp index 83feaa7..5abf4c7 100644 --- a/cpp/include/cube.hpp +++ b/cpp/include/cube.hpp @@ -3,11 +3,13 @@ #define OPENCUBES_CUBE_HPP #include +#include #include #include #include #include +#include "config.hpp" #include "utils.hpp" struct XYZ { @@ -45,20 +47,69 @@ using XYZSet = std::unordered_set>; struct Cube { private: - struct { + // cube memory is stored two ways: + // normal, new'd buffer: is_shared == false + // shared, external memory: is_shared == true +#if CUBES_PACK_CUBE_XYZ_ADDR == 1 + struct bits_t { + uint64_t is_shared : 1; + uint64_t size : 7; // MAX 127 + uint64_t addr : 56; // low 56-bits of memory address. + }; + static_assert(sizeof(bits_t) == sizeof(void *)); +#else + struct bits_t { + uint64_t addr; uint8_t is_shared : 1; uint8_t size : 7; // MAX 127 - } bits; - XYZ *array = nullptr; + }; +#endif + // fields + bits_t fields; + // extract the pointer from bits_t + static XYZ *get(bits_t key) { + // pointer bit-hacking: + uint64_t addr = key.addr; +#if CUBES_PACK_CUBE_XYZ_ADDR == 1 +// todo: on x86-64 depending if 5-level-paging is enabled +// either 47-bit or 56-bit should be replicated to the high +// part of the address. Don't know how to do this check yet, +// so the high 8-bits is left zeroed. +// If we get segfaults dereferencing get(fields) +// then CUBES_PACK_CUBE_XYZ_ADDR must be disabled. +#endif + return reinterpret_cast(addr); + } - static_assert(sizeof(bits) == sizeof(uint8_t)); + static bits_t put(bool is_shared, int size, XYZ *addr) { +#if CUBES_PACK_CUBE_XYZ_ADDR == 1 + // pack the memory address into 56-bits + // on x86-64 it is not used by the hardware (yet). + // This hack actually saves 8 bytes because previously + // the uint8_t caused padding to 16 bytes. 
+ uint64_t tmp = reinterpret_cast((void *)addr); + assert((tmp & ~0xffffffffffffff) == 0 && "BUG: CUBES_PACK_CUBE_XYZ_ADDR should be disabled"); + tmp &= 0xffffffffffffff; + bits_t bits; + bits.addr = tmp; + bits.is_shared = is_shared; + bits.size = size; + return bits; +#else + bits_t bits; + bits.addr = reinterpret_cast((void *)addr); + bits.is_shared = is_shared; + bits.size = size; + return bits; +#endif + } public: // Empty cube - Cube() : bits{0, 0} {} + Cube() : fields{put(0, 0, nullptr)} {} // Cube with N capacity - explicit Cube(uint8_t N) : bits{0, N}, array(new XYZ[bits.size]) {} + explicit Cube(uint8_t N) : fields{put(0, N, new XYZ[N])} {} // Construct from pieces Cube(std::initializer_list il) : Cube(il.size()) { std::copy(il.begin(), il.end(), begin()); } @@ -69,20 +120,23 @@ struct Cube { // Construct from external source. // Cube shares this the memory until modified. // Caller guarantees the memory given will live longer than *this - Cube(XYZ *start, uint8_t n) : bits{1, n}, array(start) {} + Cube(const XYZ *start, uint8_t n) : fields{put(1, n, const_cast(start))} {} // Copy ctor. Cube(const Cube ©) : Cube(copy.size()) { std::copy(copy.begin(), copy.end(), begin()); } ~Cube() { + bits_t bits = fields; if (!bits.is_shared) { - delete[] array; + delete[] get(bits); } } friend void swap(Cube &a, Cube &b) { using std::swap; - swap(a.array, b.array); - swap(a.bits, b.bits); + bits_t abits = a.fields; + bits_t bbits = b.fields; + a.fields = bbits; + b.fields = abits; } Cube(Cube &&mv) : Cube() { swap(*this, mv); } @@ -98,19 +152,11 @@ struct Cube { return *this; } - size_t size() const { return bits.size; } + size_t size() const { return fields.size; } - XYZ *data() { - if (bits.is_shared) { - // lift to RAM: this should never happen really. 
- Cube tmp(array, bits.size); - swap(*this, tmp); - std::printf("Bad use of Cube\n"); - } - return array; - } + XYZ *data() { return get(fields); } - const XYZ *data() const { return array; } + const XYZ *data() const { return get(fields); } XYZ *begin() { return data(); } @@ -138,8 +184,19 @@ struct Cube { void print() const { for (auto &p : *this) std::printf(" (%2d %2d %2d)\n\r", p.x(), p.y(), p.z()); } + + /** + * Copy cube data into destination buffer. + */ + void copyout(int num, XYZ *dest) const { + assert(num <= (signed)size()); + std::copy_n(begin(), num, dest); + } }; +#if CUBES_PACK_CUBE_XYZ_ADDR == 1 +static_assert(sizeof(Cube) == 8, "Unexpected sizeof(Cube) for Cube"); +#endif static_assert(std::is_move_assignable_v, "Cube must be moveable"); static_assert(std::is_swappable_v, "Cube must swappable"); diff --git a/cpp/include/cubeSwapSet.hpp b/cpp/include/cubeSwapSet.hpp new file mode 100644 index 0000000..b3395aa --- /dev/null +++ b/cpp/include/cubeSwapSet.hpp @@ -0,0 +1,231 @@ +#pragma once +#ifndef OPENCUBES_CUBE_DISKSWAP_SET_HPP +#define OPENCUBES_CUBE_DISKSWAP_SET_HPP + +#include +#include +#include +#include +#include + +#include "cube.hpp" +#include "mapped_file.hpp" + +/** + * CubeSwapSet: Implement std::unordered_set<> that offloads XYZ data into a file: + * + * Cubes stored in the set have reduced cost of memory: + * Only the std::unordered_set<> itself and the internal nodes are stored in RAM. + * The element *data* (i.e. XYZ data) is stored in the file. + * The performance cost is that each time the set element is accessed + * the data is read back from the file. + * (Iterating the entire CubeSwapSet involves reading the entire file) + * + * Features: + * - XYZ data is recorded sequentially into the file and + * the Cube size is not saved in the storage file. + * - Cube XYZ data length is constant in CubeStorage instance. + * - Clearing the CubeSwapSet does not release the file managed by CubeStorage. 
+ * (CubePtr(s) cannot be erased from CubeStorage) + * - CubeStorage::read(const CubePtr&) caches up to 1024 Cubes for each thread. + * This read-cache is maintained by any thread that calls CubePtr::get(). + * CubeStorage::discard() is used to begin writing the XYZ data at new file instance. + * - CacheWriter utilizes the file instance from CubeStorage: + * the CubeSwapSet is not iterated through at all by CacheWriter + * and instead CubeStorage::getFile() is assigned into a copy job and then + * copied into the cache-file with mapped::file::copyAt(). + * The source storage file is deleted once the copy is completed. + * This provides wait-free saving of the cache-file and uses + * minimal amount of system memory. + */ +class CubeStorage; + +/** + * CubePtr: "File Pointer to Cube" that reads the cube data from file. + * CubePtr needs CubeStorage instance to be able to access + * its contents with CubePtr::get(). + * The associated CubeStorage should always be available + * in context where CubePtr(s) data is accessed. + */ +class CubePtr { + protected: + mapped::seekoff_t m_seek = 0; + + public: + explicit CubePtr(mapped::seekoff_t offset) : m_seek(offset) {} + CubePtr(const CubePtr& c) : m_seek(c.m_seek) {} + + /** + * Get the Cube pointed by this instance. + * @note The Cube is cached in the thread-local read-cache. + * @warn + * The Cube object is local to calling thread and shall + * not be passed into other threads. + */ + const Cube& get(const CubeStorage& storage) const; + + /** + * Raw data copy. By-passes the thread-local cache. 
+ */ + void copyout(const CubeStorage& storage, size_t n, XYZ* out) const; + + template + void copyout(const CubeStorage& storage, size_t n, Itr out) const { + std::vector buff(n); + copyout(storage, n, buff.data()); + std::copy_n(buff.begin(), n, out); + } + + mapped::seekoff_t seek() const { return m_seek; } +}; + +/** + * Stateful comparator for Cubeptr + */ +class CubePtrEqual { + protected: + const CubeStorage* m_storage = nullptr; + + public: + // C++20 feature: + using is_transparent = void; + + CubePtrEqual(const CubeStorage* ctx) : m_storage(ctx) {} + CubePtrEqual(const CubePtrEqual& ctx) : m_storage(ctx.m_storage) {} + + bool operator()(const CubePtr& a, const CubePtr& b) const { + // todo: there is possibility that + // a.get() returned cube is *deleted* from the cache by b.get() + // The read-cache size must be at least 3 to avoid this. + return a.get(*m_storage) == b.get(*m_storage); + } +}; + +class CubePtrHash { + protected: + const CubeStorage* m_storage = nullptr; + + public: + // C++20 feature: + using is_transparent = void; + using transparent_key_equal = CubePtrEqual; + + CubePtrHash(const CubeStorage* ctx) : m_storage(ctx) {} + CubePtrHash(const CubePtrHash& ctx) : m_storage(ctx.m_storage) {} + + size_t operator()(const CubePtr& x) const { + auto& cube = x.get(*m_storage); + std::size_t seed = cube.size(); + for (auto& p : cube) { + auto x = HashXYZ()(p); + seed ^= x + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + return seed; + } +}; + +class CubeStorage { + protected: + mutable std::mutex m_mtx; + std::filesystem::path m_fpath; + std::shared_ptr m_file; + + static std::atomic m_init_num; + int m_storage_version = 0; + const size_t m_cube_size; + + mapped::seekoff_t m_reserved_end; + // End of committed data. + mapped::seekoff_t m_alloc_seek; + + // m_file_head: 2 MiB memory mapped area at end of the file. 
+ std::unique_ptr m_file_head; + + public: + /** + * Initialize Cube file storage + * @param path directory where to write the storage file. + * @param n The storage is written in n sized chunks of XYZ structs. + * This should be equal to Cube::size() that are passed into local() + * Different sized Cubes in same CubeStorage instance will not work. + * @note the file creation is delayed until commit() is called first time. + */ + CubeStorage(std::filesystem::path path, size_t n); + ~CubeStorage(); + + // not copyable + CubeStorage(const CubeStorage&) = delete; + CubeStorage& operator=(const CubeStorage&) = delete; + // move constructible: but only if no allocations exists in mv + CubeStorage(CubeStorage&& mv); + CubeStorage& operator=(CubeStorage&& mv) = delete; + + size_t cubeSize() const { return m_cube_size; } + + /** + * Make thread local CubePtr instance. + * @note + * Other thread(s) cannot access the returned CubePtr until commit() is called. + * This requires that external lock is held for the data structure + * if CubePtr is made visible to other thread(s) until this thread calls commit() + */ + CubePtr local(const Cube& cube) const; + + /** + * Publish the last local() returned CubePtr. + * commit() writes this the data into the file storage + * making it visible to all threads. + */ + void commit(); + + /** + * Discard the last local() returned CubePtr. + */ + void drop() const; + + /** + * Retrieve the cube data from the backing file + * and cache the result for the caller thread. + */ + const Cube& read(const CubePtr& x) const; + + /** + * Copy the cube data from the storage into destination buffer. + */ + void copydata(const CubePtr& x, size_t n, XYZ* destination) const; + + /** + * Explicitly clear the calling thread's read-cache. + * @note this will *initialize* callers read-cache instance + * if the thread has not used the read-cache yet. + * Only call this from thread that has used to read() previously. 
+ */ + void resetReadCache() const; + + /** + * Get the file name CubeStorage is using. + */ + std::filesystem::path fileName() const { return m_fpath; } + + /** + * Get the mapped::file instance. + * @note this can be null if nothing has been written to the storage yet. + */ + std::shared_ptr getFile() const { return m_file; } + + /** + * Drop all stored data. + */ + void discard(); +}; + +/** + * CubeStorage enabled std::unordered_set<> + * + * The CubeSwapSet must be constructed with already initialized + * stateful instances of CubePtrEqual and CubePtrHash functors + * that resolve the CubePtr(s) using the CubeStorage instance. + */ +using CubeSwapSet = std::unordered_set; + +#endif \ No newline at end of file diff --git a/cpp/include/hashes.hpp b/cpp/include/hashes.hpp index 7999d5c..cc838ab 100644 --- a/cpp/include/hashes.hpp +++ b/cpp/include/hashes.hpp @@ -3,12 +3,15 @@ #define OPENCUBES_HASHES_HPP #include #include +#include +#include #include #include #include #include #include "cube.hpp" +#include "cubeSwapSet.hpp" #include "utils.hpp" struct HashCube { @@ -23,54 +26,108 @@ struct HashCube { } }; -using CubeSet = std::unordered_set>; +// using CubeSet = std::unordered_set>; -struct Hashy { - struct Subsubhashy { - CubeSet set; - std::shared_mutex set_mutex; +class Subsubhashy { + protected: + CubeStorage set_storage; + CubeSwapSet set; + mutable std::shared_mutex set_mutex; - template - void insert(CubeT &&c) { - std::lock_guard lock(set_mutex); - set.emplace(std::forward(c)); - } + public: + explicit Subsubhashy(std::filesystem::path path, size_t n) : set_storage(path, n), set(1, CubePtrHash(&set_storage), CubePtrEqual(&set_storage)) {} - bool contains(const Cube &c) { - std::shared_lock lock(set_mutex); - return set.count(c); + template + void insert(CubeT &&c) { + std::unique_lock lock(set_mutex); + auto cptr = set_storage.local(std::forward(c)); + auto [itr, isnew] = set.emplace(cptr); + if (isnew) { + set_storage.commit(); + } else { + 
lock.unlock(); + set_storage.drop(); } + } - auto size() { - std::shared_lock lock(set_mutex); - return set.size(); - } - }; - template - struct Subhashy { - std::array byhash; - - template - void insert(CubeT &&c) { - HashCube hash; - auto idx = hash(c) % NUM; - auto &set = byhash[idx]; - if (!set.contains(c)) set.insert(std::forward(c)); - // printf("new size %ld\n\r", byshape[shape].size()); + bool contains(const Cube &c) const { + std::shared_lock lock(set_mutex); + auto cptr = set_storage.local(c); + auto itr = set.find(cptr); + set_storage.drop(); + return itr != set.end(); + } + + auto size() const { + std::shared_lock lock(set_mutex); + return set.size(); + } + + void clear() { + std::lock_guard lock(set_mutex); + set.clear(); + set.reserve(1); + set_storage.discard(); + } + + // Get CubeStorage instance. + const CubeStorage &storage() const { return set_storage; } + + auto begin() const { return set.begin(); } + auto end() const { return set.end(); } + auto begin() { return set.begin(); } + auto end() { return set.end(); } +}; + +class Subhashy { + protected: + std::deque byhash; + + public: + Subhashy(int NUM, size_t N, std::filesystem::path path) { + for (int i = 0; i < NUM; ++i) { + byhash.emplace_back(path, N); } + } + + template + void insert(CubeT &&c) { + HashCube hash; + auto idx = hash(c) % byhash.size(); + auto &set = byhash[idx]; - auto size() { - size_t sum = 0; - for (auto &set : byhash) { - auto part = set.size(); - sum += part; - } - return sum; + if (set.contains(c)) return; + set.insert(std::forward(c)); + // printf("new size %ld\n\r", byshape[shape].size()); + } + + void clear() { + for (auto &set : byhash) set.clear(); + } + + auto size() const { + size_t sum = 0; + for (auto &set : byhash) { + auto part = set.size(); + sum += part; } - }; + return sum; + } - std::map> byshape; + auto begin() const { return byhash.begin(); } + auto end() const { return byhash.end(); } + auto begin() { return byhash.begin(); } + auto end() { return 
byhash.end(); } +}; + +class Hashy { + protected: + std::map byshape; + std::filesystem::path base_path; + int N; + mutable std::shared_mutex set_mutex; + public: static std::vector generateShapes(int n) { std::vector out; for (int x = 0; x < n; ++x) @@ -83,27 +140,68 @@ struct Hashy { return out; } + explicit Hashy(std::string path = ".") : base_path(path) {} + void init(int n) { // create all subhashy which will be needed for N - for (auto s : generateShapes(n)) byshape[s].size(); + N = n; + for (auto s : generateShapes(n)) { + initSubHashy(n, s); + } std::printf("%ld sets by shape for N=%d\n\r", byshape.size(), n); } + Subhashy &initSubHashy(int n, XYZ s) { + assert(N == n); + + auto itr = byshape.find(s); + if (itr == byshape.end()) { + auto [itr, isnew] = byshape.emplace(s, Subhashy(32, n, base_path)); + assert(isnew); + itr->second.size(); + return itr->second; + } else { + return itr->second; + } + } + + Subhashy &at(XYZ shape) { + std::shared_lock lock(set_mutex); + auto itr = byshape.find(shape); + if (itr != byshape.end()) { + return itr->second; + } + // should never get here... 
+ std::printf("BUG: missing shape [%2d %2d %2d]:\n\r", shape.x(), shape.y(), shape.z()); + std::abort(); + return *((Subhashy *)0); + } + template void insert(CubeT &&c, XYZ shape) { - auto &set = byshape[shape]; - set.insert(std::forward(c)); + at(shape).insert(std::forward(c)); } - auto size() { + auto size() const { + std::shared_lock lock(set_mutex); size_t sum = 0; - DEBUG_PRINTF("%ld maps by shape\n\r", byshape.size()); + DEBUG1_PRINTF("%ld maps by shape\n\r", byshape.size()); for (auto &set : byshape) { auto part = set.second.size(); - DEBUG_PRINTF("bucket [%2d %2d %2d]: %ld\n", set.first.x(), set.first.y(), set.first.z(), part); + DEBUG1_PRINTF("bucket [%2d %2d %2d]: %ld\n", set.first.x(), set.first.y(), set.first.z(), part); sum += part; } return sum; } + + int numShapes() const { + std::shared_lock lock(set_mutex); + return byshape.size(); + } + + auto begin() const { return byshape.begin(); } + auto end() const { return byshape.end(); } + auto begin() { return byshape.begin(); } + auto end() { return byshape.end(); } }; #endif diff --git a/cpp/include/newCache.hpp b/cpp/include/newCache.hpp index 29e622e..20fd660 100644 --- a/cpp/include/newCache.hpp +++ b/cpp/include/newCache.hpp @@ -1,15 +1,81 @@ #pragma once #ifndef OPENCUBES_NEWCACHE_HPP #define OPENCUBES_NEWCACHE_HPP +#include #include +#include +#include +#include #include +#include +#include #include "cube.hpp" #include "hashes.hpp" +#include "mapped_file.hpp" -class Workset; +namespace cacheformat { +static constexpr uint32_t MAGIC = 0x42554350; +static constexpr uint32_t XYZ_SIZE = 3; +static constexpr uint32_t ALL_SHAPES = -1; -class CubeIterator { +struct Header { + uint32_t magic = MAGIC; // shoud be "PCUB" = 0x42554350 + uint32_t n; // we will never need 32bit but it is nicely aligned + uint32_t numShapes; // defines length of the shapeTable + uint64_t numPolycubes; // total number of polycubes +}; +struct ShapeEntry { + uint8_t dim0; // offset by -1 + uint8_t dim1; // offset by -1 + uint8_t 
dim2; // offset by -1 + uint8_t reserved; // for alignment + uint64_t offset; // from beginning of file + uint64_t size; // in bytes should be multiple of XYZ_SIZE +}; +}; // namespace cacheformat + +/** + * newCache.hpp: provide two versions of the cache: + * + * - FlatCache implements "memory-only" cache and is constructed from Hashy. + * It is needed for boot-strapping the cache files and computing + * cubes without writing any data into disk. + * FlatCache::getCubesByShape() return ShapeRange that points into the Cube data in memory. + * ShapeRange then provides the Cube range as CubeIterator(s). + * + * - CacheReader implements the actual cache file system. + * CacheReader::getCubesByShape() return FileShapeRange that + * defines subset shape range from the cache file. + * FileShapeRange then provides the Cube range as CubeFileIterator(s). + */ +class ICubeIterator { + public: + using iterator_category = std::forward_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = Cube; + using pointer = Cube*; // or also value_type* + using reference = Cube&; // or also value_type& + + virtual ~ICubeIterator(){}; + + virtual std::unique_ptr clone() const = 0; + + virtual const value_type operator*() const = 0; + virtual uint64_t seek() const = 0; + virtual ICubeIterator& operator++() = 0; + virtual ICubeIterator& operator+=(int incr) = 0; + + friend bool operator==(const ICubeIterator& a, const ICubeIterator& b) { return a.seek() == b.seek(); }; + friend bool operator<(const ICubeIterator& a, const ICubeIterator& b) { return a.seek() < b.seek(); }; + friend bool operator>(const ICubeIterator& a, const ICubeIterator& b) { return a.seek() > b.seek(); }; + friend bool operator!=(const ICubeIterator& a, const ICubeIterator& b) { return a.seek() != b.seek(); }; +}; + +/** + * Iterator for Cubes stored in some memory area. 
+ */ +class CubeIterator : public ICubeIterator { public: using iterator_category = std::forward_iterator_tag; using difference_type = std::ptrdiff_t; @@ -18,22 +84,27 @@ class CubeIterator { using reference = Cube&; // or also value_type& // constructor - CubeIterator(uint32_t _n, XYZ* ptr) : n(_n), m_ptr(ptr) {} + CubeIterator(uint32_t _n, const XYZ* ptr) : n(_n), m_ptr(ptr) {} // invalid iterator (can't deference) explicit CubeIterator() : n(0), m_ptr(nullptr) {} + std::unique_ptr clone() const override { return std::make_unique(*this); } + // derefecence - const value_type operator*() const { return Cube(m_ptr, n); } + const value_type operator*() const override { return Cube(m_ptr, n); } + // pointer operator->() { return (pointer)m_ptr; } + uint64_t seek() const override { return (uint64_t)m_ptr; } + // Prefix increment - CubeIterator& operator++() { + ICubeIterator& operator++() override { m_ptr += n; return *this; } - CubeIterator& operator+=(int incr) { + ICubeIterator& operator+=(int incr) override { m_ptr += n * incr; return *this; } @@ -49,26 +120,168 @@ class CubeIterator { friend bool operator<(const CubeIterator& a, const CubeIterator& b) { return a.m_ptr < b.m_ptr; }; friend bool operator>(const CubeIterator& a, const CubeIterator& b) { return a.m_ptr > b.m_ptr; }; friend bool operator!=(const CubeIterator& a, const CubeIterator& b) { return a.m_ptr != b.m_ptr; }; - friend class Workset; private: uint32_t n; - XYZ* m_ptr; + const XYZ* m_ptr; }; -class ShapeRange { +class CubeReadIterator : public ICubeIterator { public: - ShapeRange(XYZ* start, XYZ* stop, uint64_t _cubeLen, XYZ _shape) - : b(_cubeLen, start), e(_cubeLen, stop), size_(((uint64_t)stop - (uint64_t)start) / (_cubeLen * sizeof(XYZ))), shape_(_shape) {} + using iterator_category = std::forward_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = Cube; + using pointer = Cube*; // or also value_type* + using reference = Cube&; // or also value_type& + + // 
constructor + CubeReadIterator(std::shared_ptr file, uint32_t _n, mapped::seekoff_t offset) : n(_n), m_seek(offset), m_file(file) {} - CubeIterator begin() { return b; } - CubeIterator end() { return e; } + // invalid iterator (can't deference) + explicit CubeReadIterator() : n(0), m_seek(-1) {} + + std::unique_ptr clone() const override { return std::make_unique(*this); } + + // derefecence + const value_type operator*() const override { return read(); } - XYZ& shape() { return shape_; } - auto size() const { return size_; } + // pointer operator->() { return (pointer)m_seek; } + + uint64_t seek() const override { return (uint64_t)m_seek; } + + // Prefix increment + ICubeIterator& operator++() override { + m_seek += n * sizeof(XYZ); + return *this; + } + + ICubeIterator& operator+=(int incr) override { + m_seek += n * incr * sizeof(XYZ); + return *this; + } + + // Postfix increment + CubeReadIterator operator++(int) { + CubeReadIterator tmp = *this; + ++(*this); + return tmp; + } + + friend bool operator==(const CubeReadIterator& a, const CubeReadIterator& b) { return a.m_seek == b.m_seek; }; + friend bool operator<(const CubeReadIterator& a, const CubeReadIterator& b) { return a.m_seek < b.m_seek; }; + friend bool operator>(const CubeReadIterator& a, const CubeReadIterator& b) { return a.m_seek > b.m_seek; }; + friend bool operator!=(const CubeReadIterator& a, const CubeReadIterator& b) { return a.m_seek != b.m_seek; }; private: - CubeIterator b, e; + uint32_t n; + mapped::seekoff_t m_seek; + std::shared_ptr m_file; + + // de-reference is implemented by read() + Cube read() const; +}; + +/** + * To avoid complicating the use of the ICubeIterator + * CacheIterator provides type-erased wrapper that can be copied. 
+ */ +class CacheIterator { + public: + using iterator_category = std::forward_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = Cube; + using pointer = Cube*; // or also value_type* + using reference = Cube&; // or also value_type& + + CacheIterator() {} + + template + explicit CacheIterator(Itr&& init) : proxy(std::make_unique>(std::forward(init))) {} + + CacheIterator(const CacheIterator& copy) { + if (copy.proxy) { + proxy = copy.proxy->clone(); + } + } + CacheIterator& operator=(const CacheIterator& x) { + CacheIterator tmp(x); + std::swap(proxy, tmp.proxy); + return *this; + } + CacheIterator(CacheIterator&& copy) =default; + CacheIterator& operator=(CacheIterator&& x) =default; + + const value_type operator*() const { return **proxy; } + + uint64_t seek() const { return proxy->seek(); } + + CacheIterator& operator++() { + ++(*proxy); + return *this; + } + CacheIterator& operator+=(int incr) { + (*proxy) += incr; + return *this; + } + + CacheIterator operator++(int) { + CacheIterator tmp = *this; + ++(*this); + return tmp; + } + + friend bool operator==(const CacheIterator& a, const CacheIterator& b) { return a.seek() == b.seek(); }; + friend bool operator<(const CacheIterator& a, const CacheIterator& b) { return a.seek() < b.seek(); }; + friend bool operator>(const CacheIterator& a, const CacheIterator& b) { return a.seek() > b.seek(); }; + friend bool operator!=(const CacheIterator& a, const CacheIterator& b) { return a.seek() != b.seek(); }; + + private: + std::unique_ptr proxy; +}; + +class IShapeRange { + public: + IShapeRange(){}; + virtual ~IShapeRange() {} + + virtual CacheIterator begin() const = 0; + virtual CacheIterator end() const = 0; + virtual XYZ& shape() = 0; + virtual size_t size() const = 0; +}; + +class ShapeRange : public IShapeRange { + public: + ShapeRange(const XYZ* start, const XYZ* stop, uint64_t _cubeLen, XYZ _shape) + : b(CubeIterator(_cubeLen, start)), e(CubeIterator(_cubeLen, stop)), 
size_(std::distance(start, stop) / _cubeLen), shape_(_shape) {} + + CacheIterator begin() const override { return b; } + CacheIterator end() const override { return e; } + + XYZ& shape() override { return shape_; } + size_t size() const override { return size_; } + + private: + CacheIterator b, e; + uint64_t size_; + XYZ shape_; +}; + +class FileShapeRange : public IShapeRange { + public: + FileShapeRange(std::shared_ptr file, mapped::seekoff_t start, mapped::seekoff_t stop, uint64_t _cubeLen, XYZ _shape) + : b(CubeReadIterator(file, _cubeLen, start)), + e(CubeReadIterator(file, _cubeLen, stop)), + size_((stop - start) / _cubeLen), shape_(_shape) {} + + CacheIterator begin() const override { return b; } + CacheIterator end() const override { return e; } + + XYZ& shape() override { return shape_; } + size_t size() const override { return size_; } + + private: + CacheIterator b, e; uint64_t size_; XYZ shape_; }; @@ -76,7 +289,7 @@ class ShapeRange { class ICache { public: virtual ~ICache(){}; - virtual ShapeRange getCubesByShape(uint32_t i) = 0; + virtual IShapeRange& getCubesByShape(uint32_t i) = 0; virtual uint32_t numShapes() = 0; virtual size_t size() = 0; }; @@ -98,46 +311,21 @@ class CacheReader : public ICache { uint32_t numShapes() override { return header->numShapes; }; operator bool() { return fileLoaded_; } - static constexpr uint32_t MAGIC = 0x42554350; - static constexpr uint32_t XYZ_SIZE = 3; - static constexpr uint32_t ALL_SHAPES = -1; - - struct Header { - uint32_t magic = MAGIC; // shoud be "PCUB" = 0x42554350 - uint32_t n; // we will never need 32bit but it is nicely aligned - uint32_t numShapes; // defines length of the shapeTable - uint64_t numPolycubes; // total number of polycubes - }; - struct ShapeEntry { - uint8_t dim0; // offset by -1 - uint8_t dim1; // offset by -1 - uint8_t dim2; // offset by -1 - uint8_t reserved; // for alignment - uint64_t offset; // from beginning of file - uint64_t size; // in bytes should be multiple of XYZ_SIZE - }; 
- - CubeIterator begin() { - uint8_t* start = filePointer + shapes[0].offset; - return CubeIterator(header->n, (XYZ*)start); - } + // get shapes at index [0, numShapes()[ + IShapeRange& getCubesByShape(uint32_t i) override; - CubeIterator end() { - uint8_t* stop = filePointer + shapes[0].offset + header->numPolycubes * header->n * XYZ_SIZE; - return CubeIterator(header->n, (XYZ*)stop); - } + private: + std::shared_ptr file_; + std::unique_ptr> header_; + std::unique_ptr> shapes_; - ShapeRange getCubesByShape(uint32_t i) override; + std::vector shapeRanges; - private: - uint8_t* filePointer; std::string path_; - int fileDescriptor_; - uint64_t fileSize_; bool fileLoaded_; - Header dummyHeader; - Header* header; - ShapeEntry* shapes; + const cacheformat::Header dummyHeader; + const cacheformat::Header* header; + const cacheformat::ShapeEntry* shapes; }; class FlatCache : public ICache { @@ -149,26 +337,63 @@ class FlatCache : public ICache { FlatCache() {} FlatCache(Hashy& hashes, uint8_t n) : n(n) { allXYZs.reserve(hashes.size() * n); - shapes.reserve(hashes.byshape.size()); + shapes.reserve(hashes.numShapes()); // std::printf("Flatcache %d %p %p\n", n, (void*)allXYZs.data(), (void*)shapes.data()); - for (auto& [shape, set] : hashes.byshape) { + for (auto& [shape, set] : hashes) { auto begin = allXYZs.data() + allXYZs.size(); - for (auto& subset : set.byhash) { - for (auto& cube : subset.set) - // allXYZs.emplace_back(allXYZs.end(), subset.set.begin(), subset.set.end()); - std::copy(cube.begin(), cube.end(), std::back_inserter(allXYZs)); + for (auto& subset : set) { + for (auto& cubeptr : subset) cubeptr.copyout(subset.storage(), n, std::back_inserter(allXYZs)); } auto end = allXYZs.data() + allXYZs.size(); // std::printf(" SR %p %p\n", (void*)begin, (void*)end); shapes.emplace_back(begin, end, n, shape); } + + // Add dummy shape range at back: + shapes.emplace_back(nullptr, nullptr, n, XYZ(0, 0, 0)); } - ShapeRange getCubesByShape(uint32_t i) override { - if (i >= 
shapes.size()) return ShapeRange{nullptr, nullptr, 0, XYZ(0, 0, 0)}; + IShapeRange& getCubesByShape(uint32_t i) override { + if (i >= shapes.size() - 1) return shapes.back(); return shapes[i]; }; uint32_t numShapes() override { return shapes.size(); }; size_t size() override { return allXYZs.size() / n / sizeof(XYZ); } }; +class CacheWriter { + protected: + std::mutex m_mtx; + std::condition_variable m_run; + std::condition_variable m_wait; + bool m_active = true; + + // Jobs that flush and finalize the written file. + size_t m_num_flushes = 0; + std::deque> m_flushes; + + // Temporary copy jobs into the memory mapped file. + size_t m_num_copys = 0; + std::deque> m_copy; + + // thread pool executing the jobs. + std::deque m_flushers; + + void run(); + + public: + CacheWriter(int num_threads = 8); + ~CacheWriter(); + + /** + * Capture snapshot of the Hashy and write cache file. + * The data may not be entirely flushed before save() returns. + */ + void save(std::string path, Hashy& hashes, uint8_t n); + + /** + * Complete all flushes immediately. + */ + void flush(); +}; + #endif diff --git a/cpp/include/utils.hpp b/cpp/include/utils.hpp index 4cd23e3..f895877 100644 --- a/cpp/include/utils.hpp +++ b/cpp/include/utils.hpp @@ -3,12 +3,39 @@ #define OPENCUBES_UTILS_HPP #include + +// Debug print level: all prints enabled +// below DEBUG_LEVEL. +// DEBUG_LEVEL -> 0 all prints disabled. +// DEBUG_LEVEL -> 1 enable DEBUG_PRINTF() statements +// DEBUG_LEVEL -> 2 enable DEBUG1_PRINTF() statements and earlier +// DEBUG_LEVEL -> 3 all prints enabled. +#define DEBUG_LEVEL 1 + #ifdef DEBUG + +#if DEBUG_LEVEL >= 1 #define DEBUG_PRINTF(...) std::printf(__VA_ARGS__) -#else -#define DEBUG_PRINTF(...) \ - do { \ - } while (0) #endif +#if DEBUG_LEVEL >= 2 +#define DEBUG1_PRINTF(...) std::printf(__VA_ARGS__) +#endif + +#if DEBUG_LEVEL >= 3 +#define DEBUG2_PRINTF(...) std::printf(__VA_ARGS__) +#endif + +#endif + +#ifndef DEBUG_PRINTF +#define DEBUG_PRINTF(...) 
do {} while (0) #endif +#ifndef DEBUG1_PRINTF +#define DEBUG1_PRINTF(...) do {} while (0) +#endif +#ifndef DEBUG2_PRINTF +#define DEBUG2_PRINTF(...) do {} while (0) +#endif + +#endif \ No newline at end of file diff --git a/cpp/libraries/mapped_file.cpp b/cpp/libraries/mapped_file.cpp new file mode 100644 index 0000000..f0e4b0f --- /dev/null +++ b/cpp/libraries/mapped_file.cpp @@ -0,0 +1,489 @@ +/** + * Copyright 2023 Jarmo A Tiitto + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the “Software”), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "mapped_file.hpp" + +#include +#include +#include +#include +#include + +// POSIX/Linux APIs +#include +#include +#include +#include + +#include +#include + +#ifndef MAP_HUGE_2MB +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) +#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) +#endif + +namespace mapped { + +/** + * Mapped file POSIX/Linux compatible implementation + */ +file::file() : fd(-1), fd_size(0) {} + +file::~file() { close(); } + +void file::close() { + if (fd >= 0) { + ::fsync(fd); + ::close(fd); + fd = -1; + fd_size = 0; + } +} + +int file::open(const char* fname) { + close(); + + fd = ::open64(fname, O_RDONLY); + if (fd == -1) { + // std::fprintf(stderr, "Error opening file for reading\n"); + return -1; + } + + struct stat64 finfo; + if (fstat64(fd, &finfo)) { + std::fprintf(stderr, "Error getting file size: %s\n", std::strerror(errno)); + return -1; + } + fd_size = finfo.st_size; + fd_rw = false; + return 0; +} + +int file::openrw(const char* fname, size_t maxsize, int flags) { + // create new files with "normal" permissions: "-rw-r--r--" + const mode_t fperms = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH; + + close(); + + maxsize = roundUp(maxsize); + + if (!flags) { + fd = ::open64(fname, O_RDWR | O_CLOEXEC); + if (fd == -1) { + // std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno)); + return -1; + } + + fd_rw = true; + + struct stat64 finfo; + if (fstat64(fd, &finfo)) { + std::fprintf(stderr, "Error getting file size:%s\n", std::strerror(errno)); + return -1; + } + return truncate(finfo.st_size); + + } else if ((flags & (CREATE | RESIZE)) == (CREATE | RESIZE)) { + fd = ::open64(fname, O_CREAT | O_RDWR | O_TRUNC | O_CLOEXEC, fperms); + if (fd == -1) { + // std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno)); + return -1; + } + fd_rw = true; + + if(flags & FSTUNE) { + int flags = 0; + ioctl(fd, FS_IOC_GETFLAGS, &flags); + flags |= FS_NOATIME_FL | FS_NOCOW_FL; + ioctl(fd, FS_IOC_SETFLAGS, &flags); + } + 
return truncate(maxsize); + + } else if ((flags & RESIZE) != 0) { + fd = ::open64(fname, O_RDWR | O_CLOEXEC, fperms); + if (fd == -1) { + // std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno)); + return -1; + } + fd_rw = true; + return truncate(maxsize); + } else { + std::fprintf(stderr, "Invalid open flags:%s\n", std::strerror(errno)); + return -1; + } +} + +bool file::is_rw() const { return fd_rw; } + +seekoff_t file::size() const { return fd_size; } + +int file::truncate(seekoff_t newsize) { + // resize the backing file + if (newsize != fd_size && ftruncate64(fd, newsize)) { + std::fprintf(stderr, "Error resizing backing file:%s\n", std::strerror(errno)); + return -1; + } + fd_size = newsize; + return 0; +} + +int file::readAt(seekoff_t fpos, len_t size, void* dataout) const +{ + ssize_t rd = pread(fd, dataout, size, fpos); + if (rd != (signed)size) { + std::fprintf(stderr, "Error reading data from file:%s\n", std::strerror(errno)); + return -1; + } + return 0; +} + +int file::writeAt(seekoff_t fpos, len_t size, const void* datain) +{ + std::lock_guard lock(mut); + + ssize_t rd = pwrite(fd, datain, size, fpos); + if (rd != (signed)size) { + std::fprintf(stderr, "Error writing data into file:%s\n", std::strerror(errno)); + return -1; + } + + fd_size = std::max(fd_size, fpos+size); + return 0; +} + +int file::copyAt(std::shared_ptr other, seekoff_t other_fpos, len_t size, seekoff_t dest_fpos) +{ + off64_t srcp = other_fpos; + off64_t dstp = dest_fpos; + ssize_t cpy = ::copy_file_range(other->fd, &srcp, fd, &dstp, size, 0); + if (cpy != (signed)size) { + std::fprintf(stderr, "Error copying file data:%s\n", std::strerror(errno)); + return -1; + } + + std::lock_guard lock(mut); + fd_size = std::max(fd_size, dest_fpos+size); + return 0; +} + + +/** + * Mapped region POSIX/Linux compatible implementation. 
+ */ + +region::region(std::shared_ptr src, seekoff_t fpos, len_t size, len_t window) : mfile(src) { + std::lock_guard lock(mtx); + remap(fpos, size, window); +} + +region::region(std::shared_ptr src) : mfile(src) { + std::lock_guard lock(mtx); + auto sz = mfile->size(); + remap(0, sz, sz); +} + +region::~region() { + // destructor is not thread-safe. + std::lock_guard lock(mtx); + map_fseek = 0; + remap(0, 0, 0); +} + +/** + * This is the core implementation of mapped_file: + * remap(0,0) releases the mapping. + * remap(0, n) mmap roundUp(n) bytes at offset 0 + * remap(0, k) mremap roundUp(n) bytes at offset 0 (grows the existing mapping) + * remap(n, j) munmap old region, mmap new at offset roundDown(n) + * + * In read-write mode the backing file is grown to fit the mapping. + * + * @warn this->mtx must be held when this function is called. + */ +void region::remap(const seekoff_t fpos, const len_t size, const len_t window) { + if (fpos == usr_fseek && size == usr_size) return; // No-op + // check if [fpos, fpos+size] fits into the existing + // mmap() window and only adjust the user region. + if (size && map_ptr && (map_fseek <= fpos && fpos + size <= map_fseek + map_size)) { + usr_fseek = fpos; + usr_ptr = (uint8_t*)map_ptr + (fpos - map_fseek); + usr_size = size; + return; + } + + // if size == 0 or the usr_fseek != fpos, + // we have to unmap the old region first, if any. + if (!!map_ptr && (size == 0 || usr_fseek != fpos)) { + if (::munmap(map_ptr, map_size) == -1) { + std::fprintf(stderr, "Error mapping file memory\n"); + return; + } + map_ptr = nullptr; + map_size = 0; + usr_ptr = nullptr; + usr_size = 0; + if (size == 0) return; + } + // keep what user tried to ask: + usr_fseek = fpos; + usr_size = size; + + if (map_ptr && map_fseek == fpos) { + // this mapping exists already at same map_fseek + // remap it to grow the region. 
+ auto newsize = roundUp(std::max(size, window)); + void* newptr = mremap(map_ptr, map_size, newsize, MREMAP_MAYMOVE); + if (newptr == MAP_FAILED) { + std::fprintf(stderr, "Error resizing memory-map of file:%s\n", std::strerror(errno)); + std::abort(); + return; + } + map_ptr = newptr; + map_size = newsize; + return; + } + + // create new mapping + if (mfile->is_rw()) { + // RW mapping + auto newsize = roundUp(std::max(size, window)); + + // take file lock so size() check --> truncate is atomic. + std::unique_lock trunclock(mfile->mut); + if (mfile->size() < fpos + newsize && mfile->truncate(fpos + newsize)) { + // failed. Disk full? + std::abort(); + return; + } + trunclock.unlock(); + + // mmap requires fpos && size to be multiple of PAGE_SIZE + map_fseek = roundDown(fpos); + if (map_fseek < fpos) { + // adjust size to cover. + newsize += PAGE_SIZE; + } + map_size = newsize; + map_ptr = mmap(0, map_size, PROT_READ | PROT_WRITE, MAP_SHARED, mfile->fd, map_fseek); + if (map_ptr == MAP_FAILED) { + // If this gets triggered we are in deep trouble + std::fprintf(stderr, "Error memory-mapping file:%s %lu %d %lu\n", std::strerror(errno), size, mfile->fd, fpos); + std::fprintf(stderr, "Dumping /proc/self/maps:\n"); + // for debugging information try print /proc/self/mmaps contents + // as this explains why we hit some limit of the system. + std::ifstream fmaps("/proc/self/maps"); + std::string buf; + int count = 0; + while(std::getline(fmaps, buf)) { + std::fprintf(stderr, "%s\n", buf.c_str()); + ++count; + } + std::fprintf(stderr, "counted %d memory-maps in process.\n", count); + return; + } + } else { + // RO mapping + if (mfile->size() <= fpos) { + // can't: the backing file is too small. + std::fprintf(stderr, "Error seeking past end of file.\n"); + std::abort(); + return; + } + map_size = roundUp(std::max(size, window)); + map_fseek = roundDown(fpos); + // Map the region. 
(use huge pages, don't reserve backing store) + map_ptr = mmap(0, map_size, PROT_READ, MAP_SHARED | MAP_NORESERVE, mfile->fd, map_fseek); + + if (!map_ptr || map_ptr == MAP_FAILED) { + std::fprintf(stderr, "Error mapping file\n"); + std::abort(); + return; + } + } + + // hint that this memory is accessed in random order. + if(madvise(map_ptr, map_size, MADV_RANDOM)) { + std::fprintf(stderr, "warn: madvice(MADV_RANDOM) failed: %s\n", std::strerror(errno)); + } + // adjust the usr_ptr to fix + // any page misalignment. + usr_ptr = (uint8_t*)map_ptr + (fpos - map_fseek); +} + +void region::window(len_t window) { + std::lock_guard lock(mtx); + auto usize = usr_size; + // note: remap() does nothing if window == usr_size + remap(usr_fseek, window, window); + usr_size = usize; +} + +void region::jump(seekoff_t fpos) { + std::lock_guard lock(mtx); + remap(fpos, usr_size, map_size); + is_dirty = false; +} + +void region::flushJump(seekoff_t fpos) { + flush(); + std::lock_guard lock(mtx); + remap(fpos, usr_size, map_size); +} + +void region::flush() { + // only flush if dirty and RW mapped. + std::lock_guard lock(mtx); + if (is_dirty && mfile->is_rw()) { + is_dirty = false; + auto flush_begin = (void*)roundDown((uintptr_t)usr_ptr); + auto flush_len = roundUp(usr_size); + if (flush_begin < usr_ptr) flush_len += PAGE_SIZE; + if (msync(flush_begin, flush_len, MS_ASYNC)) { + std::fprintf(stderr, "Error flushing memory-map:%s\n", std::strerror(errno)); + } + } +} + +void region::sync() { + // only flush if dirty and RW mapped. 
+ std::lock_guard lock(mtx); + if (is_dirty && mfile->is_rw()) { + is_dirty = false; + auto flush_begin = (void*)roundDown((uintptr_t)usr_ptr); + auto flush_len = roundUp(usr_size); + if (flush_begin < usr_ptr) flush_len += PAGE_SIZE; + if (msync(flush_begin, flush_len, MS_SYNC)) { + std::fprintf(stderr, "Error flushing memory-map:%s\n", std::strerror(errno)); + } + } +} + +void region::writeAt(seekoff_t fpos, len_t datasize, const void* data) { + auto srcmem = (const char*)data; + + // take file lock so that file size check --> truncate is atomic. + std::unique_lock trunclock(mfile->mut); + if(mfile->size() < fpos+datasize && mfile->truncate(fpos+datasize)) { + return; + } + trunclock.unlock(); + + // does write fall out the mapped area begin? + if (fpos < map_fseek) { + // max size that can be written before map_fseek + ssize_t wr = std::min(map_fseek - fpos, datasize); + if (pwrite(mfile->fd, srcmem, wr, fpos) != wr) { + std::fprintf(stderr, "Error writing file:%s\n", std::strerror(errno)); + } + srcmem += wr; + fpos += wr; + datasize -= wr; + } + + if (fpos >= map_fseek && fpos < map_fseek + map_size && datasize) { + // max size that can be copied into this mapping: + ssize_t wr = std::min(map_size - (fpos - map_fseek), datasize); + std::memcpy((char*)map_ptr + (fpos - map_fseek), srcmem, wr); + srcmem += wr; + fpos += wr; + datasize -= wr; + } + + // does write fall out the mapped area end? + if (datasize) { + // write into backing file after the mapped area: + if (pwrite(mfile->fd, srcmem, datasize, fpos) != ssize_t(datasize)) { + std::fprintf(stderr, "Error writing file:%s\n", std::strerror(errno)); + } + } +} + +void region::readAt(seekoff_t fpos, len_t datasize, void* data) const { + auto dstmem = (char*)data; + + // does read fall out the mapped area begin? 
+ if (fpos < map_fseek) { + // max size that can be written before map_fseek + ssize_t rd = std::min(map_fseek - fpos, datasize); + if (pread(mfile->fd, dstmem, rd, fpos) != rd) { + std::fprintf(stderr, "Error reading file:%s\n", std::strerror(errno)); + } + dstmem += rd; + fpos += rd; + datasize -= rd; + } + + if (fpos >= map_fseek && fpos < map_fseek + map_size && datasize) { + // max size that can be copied from this mapping: + ssize_t rd = std::min(map_size - (fpos - map_fseek), datasize); + std::memcpy(dstmem, (char*)map_ptr + (fpos - map_fseek), rd); + dstmem += rd; + fpos += rd; + datasize -= rd; + } + + // does read fall out the mapped area end? + if (datasize) { + // read from backing file after the mapped area: + if (pread(mfile->fd, dstmem, datasize, fpos) != ssize_t(datasize)) { + std::fprintf(stderr, "Error reading file:%s\n", std::strerror(errno)); + } + } +} + + +void region::resident(bool resident) { + std::lock_guard lock(mtx); + if(madvise(map_ptr, map_size, resident ? MADV_WILLNEED : MADV_DONTNEED)) { + std::fprintf(stderr,"Error setting memory-map residency:%s\n",std::strerror(errno)); + } +} + + +void region::discard(seekoff_t fpos, len_t datasize) { + + auto cur = usr_fseek + fpos; + + if (cur < map_fseek + map_size) { + // max size that can discarded from this mapping: + ssize_t dm = std::min(map_size - (cur - map_fseek), datasize); + + // Have to be careful here: if we delete too much + // caller will not have an good time. + // align size down to page size. 
+ dm = roundDown(dm); + // align file offset up + auto _first = roundUp(cur - map_fseek); + if(_first > cur - map_fseek) + dm -= PAGE_SIZE; + + if(dm >= (signed)PAGE_SIZE) { + if(madvise((char*)map_ptr + _first, dm, MADV_REMOVE)) { + std::fprintf(stderr,"Error discarding memory-map region:%s\n",std::strerror(errno)); + } + } + } +} + +}; // namespace mapped diff --git a/cpp/libraries/mapped_file.hpp b/cpp/libraries/mapped_file.hpp new file mode 100644 index 0000000..ce3c2e3 --- /dev/null +++ b/cpp/libraries/mapped_file.hpp @@ -0,0 +1,575 @@ +/** + * Copyright 2023 Jarmo A Tiitto + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the “Software”), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef MAPPEDFILE_HPP_INCLUDED +#define MAPPEDFILE_HPP_INCLUDED + +#include +#include +#include +#include +#include +#include +#include + +/** + * Memory mapped file I/O utilities + * - mapped::file class for opening an file + * - mapped::region class for RW/RO memory mapping part the file instance. 
+ * - mapped::struct_region template for RW/RO accessing part the file as specified type. + * - mapped::array_region template for RW/RO accessing part of the file as array of T elements. + * + * @note + * When doing read-only mapping the region instance + * should be const qualified as this restricts + * the region class API to read-only operations and prevents + * accidental modification of the file. + * Use std::make_unique() in this case. + * + * @note + * When using the read-write features the backing file is resized + * in multiple PAGE_SIZE blocks even if the actually mapped size is + * something else. + * openrw(...,size,RESIZE) always truncates the file to roundUp(size). + * You should do file->truncate(< sizeInBytes>) to make the file + * size exactly what you want before the file is closed. + * + * Modified regions should flush() or sync() before they are destroyed + * or the modified data may not end up in the file. + * + * TODO: + * - Two region instances should not overlap, + * i.e same portion of the file should not be mapped twice. + * (Not sure if this is actually broken now, but you have been warned) + * - Multi-threading support not tested/written. + * Currently the same mapped region can be used by multiple threads, + * but cannot it be modified. + * - Better error handling. (exceptions?, error codes?) + * Currently critical errors are printed and std::abort() is called. + * How do we handle system errors that happen in constructors? 
+ */ +namespace mapped { + +const size_t PAGE_SIZE = 4096; + +static inline size_t roundToPage(ptrdiff_t x) { return (std::max(0, x - 1) & ~(PAGE_SIZE - 1)) + PAGE_SIZE; } + +constexpr inline size_t roundUp(uintptr_t x) { return (x + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1); } + +constexpr inline size_t roundDown(uintptr_t x) { return (x & ~(PAGE_SIZE - 1)); } + +/** + * seekoff_t: Position of the file cursor + */ +using seekoff_t = uint64_t; +/** + * len_t: length of file data + */ +using len_t = size_t; + +class file; + +/** + * Memory-mapped region + * @brief + * the base class memory-maps an raw range of bytes from the backing file. + */ +class region { + protected: + std::mutex mtx; + // actually mapped region: + void* map_ptr = nullptr; + size_t map_size = 0; + seekoff_t map_fseek = 0; + // what constructor asked: + void* usr_ptr = nullptr; + size_t usr_size = 0; + seekoff_t usr_fseek = 0; + // todo: maybe use std::weak_ptr? + // that would allow file to be released and + // any any existing region(s) would still work. + // (but only if remap() is not called) + std::shared_ptr mfile; + // non-const data access sets is_dirty. + bool is_dirty = false; + + void remap(const seekoff_t fpos, const len_t size, const len_t window); + + region() {} + + public: + /** + * Open memory mapped region into a file. + * @brief + * Seeks at fpos in file and map size bytes + * starting from that position in file. + * @param window + * over-extend mapping up to max(size,window) bytes. + * Setting window bigger than size allows more efficient operation: + * [fpos, fpos + window] area is memory mapped + * but region will only operate on the + * [roundDown(fpos), roundup(fpos+size)] + * sub-portion of the memory. + * @note + * - Seeking past the EOF in file that is read-only will fail. + * The mapped size may extend past EOF but accessing past EOF + * either returns undefined data or program is terminated by OS. 
+ * (EOF is at file->size()) + * - Seeking past the EOF that is read-write + * grows the backing file to fit the mapping. + * The backing file is always extended in multiple of PAGE_SIZE bytes. + * @note + * If size and/or fpos are not aligned to multiple of PAGE_SIZE + * they are forcibly aligned internally. This results in + * regionSize() and regionSeek() that may differ compared to + * size() and getSeek(). + * Side-effect is that backing file may grow more than expected. + */ + region(std::shared_ptr src, seekoff_t fpos, len_t size, len_t window = 0); + + /** + * Open memory mapped region into the file + * @brief + * same as region(myfile, 0, myfile.size()) + * and memory maps the entire file. + */ + explicit region(std::shared_ptr src); + + /** + * Note: even if region was modified, + * destructor will not flush()/sync() before tearing down the mapping. + */ + virtual ~region(); + + // region is not copyable + region(const region&) =delete; + region& operator=(const region&) =delete; + + // region is moveable + friend void swap(region& a, region& b) { + using std::swap; + //std::lock(a.mtx,b.mtx); + //std::lock_guard l0(a.mtx, std::adopt_lock); + //std::lock_guard l1(b.mtx, std::adopt_lock); + swap(a.map_ptr,b.map_ptr); + swap(a.map_size,b.map_size); + swap(a.map_fseek,b.map_fseek); + swap(a.usr_ptr,b.usr_ptr); + swap(a.usr_size,b.usr_size); + swap(a.usr_fseek,b.usr_fseek); + swap(a.mfile,b.mfile); + swap(a.is_dirty,b.is_dirty); + } + region(region&& mv) : region() { + swap(*this, mv); + } + region& operator=(region&& mv) { + swap(*this, mv); + return *this; + } + + /** + * Get data pointer. + */ + const void* data() const { return usr_ptr; } + void* data() { + is_dirty = true; + return usr_ptr; + } + + std::shared_ptr getFile() { return mfile; } + + /** + * Get the seek used to init this region. + */ + seekoff_t getSeek() const { return usr_fseek; } + /** + * Get the size used to init this region. 
+ */ + len_t size() const { return usr_size; } + + /** + * Get page aligned seek <= getSeek() + */ + seekoff_t regionSeek() const { return map_fseek; } + /** + * Get page aligned size >= size() + */ + len_t regionSize() const { return map_size; } + + /** + * Resize the mapped region. + * @note the mapped memory address may move, + * but current contents are preserved. + * @warn all pointers or references into + * the mapping are invalidated. + */ + void resize(len_t newsize); + + /** + * @brief over-extend mapping up to max(size(),window) bytes. + * Setting window bigger than size() allows more efficient operation: + * [regionSeek(), regionSeek() + window] area is memory mapped + * but region will only operate on the + * [roundDown(getSeek()), roundUp(getSeek()+size())] + * sub-portion of the memory. + */ + void window(len_t window = 0); + + /** + * Flush mapped memory region into the file. + * @brief this is an hint to operating system that + * memory region shall be synchronized to disk. + * It may not wait for this to have completed before returning. + * @note only the page aligned region + * [roundDown(data()), roundUp(data()+size())] + * is flushed. + * @note Use sync() instead if you must guarantee the data has + * reached persistent storage. + */ + void flush(); + + /** + * Synchronize modified memory region onto disk. + */ + void sync(); + + /** + * Write data into the backing file. + * @brief + * writeAt() stores range of bytes into the backing file. + * @note + * The region doesn't need to have this area to be memory-mapped: + * The data that falls into the memory-mapped + * [regionSeek(), regionSeek()+regionSize()] area is simply memcpy'ed. + * Any data that falls out this window is written directly + * into the backing file. + * The backing file is grown to fit the data when needed. + */ + void writeAt(seekoff_t fpos, len_t datasize, const void* data); + + /** + * Read data from the backing file. 
+ * @brief + * readAt() reads [fpos, fpos+datasize] range of bytes from the backing file + * @note + * The region doesn't need to have this area memory-mapped + * The read out area that falls into the memory-mapped + * [regionSeek(), regionSeek()+regionSize()] area is simply memcpy'ed. + * Any data that falls out this window is read directly + * from the backing file. + */ + void readAt(seekoff_t fpos, len_t datasize, void* data) const; + + /** + * Set memory region to resident/or released. + * @brief setting memory range to non-resident state + * causes system to drop the data from system memory. + * Reading non-resident memory region again causes system to + * fetch data from the disk again. + * @warn if memory region is not flushed before setting + * resident(false) any writes may be discarded to backing file. + */ + void resident(bool state); + + /** + * Discard memory region. + * @brief discarding memory range causes system + * to reclaim the memory *and* the on-disk area. + * This means the data is lost in the mapped memory region, + * and any data within will not be written onto disk by sync() + * Subsequent reads after discard() return zero filled data. + * @note + * The discarded area shall be within the mapped area. + * @param fpos + * file offset from begin of this mapping. (getSeek() + fpos) + * @param datasize + * length of the data area to discard. + */ + void discard(seekoff_t fpos, len_t datasize); + + /** + * Seek in the file to fpos position and + * remap the memory region there. + * @warn all pointers or references into + * the mapping are invalidated. + */ + void jump(seekoff_t fpos); + + /** + * Flush the current region and + * Seek in the file to fpos position and + * remap the memory region there. + * @warn all pointers or references into + * the mapping are invalidated. 
+ */ + void flushJump(seekoff_t fpos); +}; + +static_assert(std::is_move_constructible_v); +static_assert(std::is_move_assignable_v); +static_assert(std::is_swappable_v); + +/** + * Typed region. + * struct_region allows directly accessing an on-disk structure. + * The region size is implicit from the type. + */ +template +class struct_region : protected region { + public: + using type = typename std::decay::type; + static_assert(std::is_standard_layout_v, "T must be plain-old-data type"); + + /** + * Memory map struct_region at fpos in file. + */ + struct_region(std::shared_ptr f, seekoff_t fpos, len_t window = 0) : region(f, fpos, sizeof(type), window) {} + + type* get() { return static_cast(data()); } + const type* get() const { return static_cast(data()); } + + type* operator->() { return get(); } + const type* operator->() const { return get(); } + + type& operator*() { return *get(); } + const type& operator*() const { return *get(); } + + using region::flush; + using region::getFile; + using region::getSeek; + using region::readAt; + using region::sync; + using region::writeAt; + using region::resident; + using region::window; + using region::discard; + + // note: size means the sizeof(T) + using region::size; + + /** + * Get the file seek position just after *this. + */ + seekoff_t getEndSeek() const { return getSeek() + sizeof(T); } + + /** + * Seek to fpos in file and remap the region. + * @return the pointer into the new position + */ + type* jump(seekoff_t fpos) { + region::jump(fpos); + return get(); + } + + type* flushJump(seekoff_t fpos) { + region::flushJump(fpos); + return get(); + } +}; + +static_assert(std::is_move_constructible_v>); +static_assert(std::is_move_assignable_v>); +static_assert(std::is_swappable_v>); + +/** + * Typed array region. + * @brief + * array_region allows directly accessing an on-disk array of structures + * The element size is implicit from the type and length of the array + * is provided by the constructor. 
+ * @provides resize(), operator[], begin(), end() + */ +template +class array_region : protected region { + protected: + size_t num_elements = 0; + + public: + using type = typename std::decay::type; + static_assert(std::is_standard_layout_v, "T must be plain-old-data type"); + + /** + * Memory map array_region at fpos in file and map array_size elements. + */ + array_region(std::shared_ptr f, seekoff_t fpos, size_t array_size) : region(f, fpos, sizeof(type) * array_size), num_elements(array_size) {} + + /** + * Get pointer to first mapped element. + */ + type* get() { return static_cast(data()); } + const type* get() const { return static_cast(data()); } + + using region::flush; + using region::getFile; + using region::getSeek; + using region::readAt; + using region::sync; + using region::writeAt; + + /** + * Resize the mapped array region. + */ + void resize(size_t elements) { + region::resize(sizeof(T) * elements); + num_elements = elements; + } + + /** + * Get number of mapped *elements* + */ + size_t size() const { return num_elements; } + + /** + * Access the array elements + */ + T& operator[](size_t index) { + assert(index < num_elements); + return get()[index]; + } + const T& operator[](size_t index) const { + assert(index < num_elements); + return get()[index]; + } + /** + * Iterators + */ + T* begin() { return get(); } + T* end() { return get() + num_elements; } + const T* begin() const { return get(); } + const T* end() const { return get() + num_elements; } + + /** + * Get the file seek position just after *this. + */ + seekoff_t getEndSeek() const { return getSeek() + sizeof(T) * num_elements; } + + /** + * Seek to fpos in file and remap the region. + * @return the pointer into the first element in the array + */ + type* jump(seekoff_t fpos) { + region::jump(fpos); + return get(); + } + + type* flushJump(seekoff_t fpos) { + region::flushJump(fpos); + return get(); + } +}; + +/** + * Memory-mapped file I/O class. 
+ * @note + * file should be created with std::make_shared() + * as mapped region(s) take shared ownership of the file. + */ +class file : public std::enable_shared_from_this { + private: + std::mutex mut; + int fd; + seekoff_t fd_size; + bool fd_rw; + // the file and region classes are inherently coupled, + // and we don't want to expose the internals. + friend class region; + + public: + enum : int { + CREATE = 0x1, //!< Create new file, if doesn't exist. + RESIZE = 0x2, //!< Resize file. + FSTUNE = 0x4 //!< When creating new file attempt to set + //!< file system attributes to improve performance. + }; + + file(); + ~file(); + + /** + * Open file in read-only mode. + * @return non-zero if error occurred. + */ + int open(const char* file); + + /** + * Create/Open file in read-write mode. + * @param flags + * - CREATE|RESIZE creates or replaces existing file + * that is truncated to maxsize. + * - RESIZE opens existing file and truncates it to + * maxsize. The file must exist already. + * - flags == 0 ignores the maxsize argument and opens + * existing file. + * @warn default open mode discards any previous file contents! + * @return non-zero if error occurred. + */ + int openrw(const char* file, len_t maxsize, int flags = CREATE | RESIZE); + + /** + * Check if file open R/W or RO + */ + bool is_rw() const; + + /** + * Resize the open file to newsize bytes. + * (file must be open in R/W mode) + * @return non-zero if error occurred. + */ + int truncate(seekoff_t newsize); + + /** + * Read @size bytes starting at file offset @fpos + * @note copies [fpos, fpos+size] into [dataout, dataout+size] + * @return non-zero if error occurred. + */ + int readAt(seekoff_t fpos, len_t size, void* dataout) const; + + /** + * Write @size bytes starting at file offset @fpos + * @note copies [datain, datain+size] into [fpos, fpos+size] + * @note the file size after writeAt() is std::max(size(), fpos+size) + * @return non-zero if error occurred. 
+ */ + int writeAt(seekoff_t fpos, len_t size, const void* datain); + + /** + * Copy @size bytes starting at file offset @other_fpos + * from @other file copying the data at @dest_fpos in this file. + * @note copies from other:[other_fpos, other_fpos+size] into this:[dest_fpos, dest_fpos+size] + * @note if other is same as *this the destination range cannot overlap with the source range. + */ + int copyAt(std::shared_ptr other, seekoff_t other_fpos, len_t size, seekoff_t dest_fpos); + + /** + * Current length of the file + * The file EOF (end-of-file) is at this position. + */ + seekoff_t size() const; + + // Close the file. + void close(); +}; + +}; // namespace mapped +#endif diff --git a/cpp/program.cpp b/cpp/program.cpp index d9f0169..5c37ab0 100644 --- a/cpp/program.cpp +++ b/cpp/program.cpp @@ -1,14 +1,16 @@ #include #include "cmdparser.hpp" +#include "config.hpp" #include "cubes.hpp" void configure_arguments(cli::Parser& parser) { - parser.set_required("n", "cube_size", "the size of polycube to generate up to"); + parser.set_optional("n", "cube_size", 1, "the size of polycube to generate up to"); parser.set_optional("t", "threads", 1, "the number of threads to use while generating"); parser.set_optional("c", "use_cache", false, "whether to load cache files"); parser.set_optional("w", "write_cache", false, "wheather to save cache files"); parser.set_optional("s", "split_cache", false, "wheather to save in sparate cache files per output shape"); + parser.set_optional("v", "version", false, "print build version info"); parser.set_optional("u", "use_split_cache", false, "use separate cachefile by input shape"); parser.set_optional("f", "cache_file_folder", "./cache/", "where to store cache files"); } @@ -17,6 +19,9 @@ int main(int argc, char** argv) { cli::Parser parser(argc, argv); configure_arguments(parser); parser.run_and_exit_if_error(); + if (parser.get("v")) { + std::printf("Built from %s, %s, %s\n", CONFIG_VERSION, CONFIG_BUILDTYPE, CONFIG_COMPILERID); + 
} gen(parser.get("n"), parser.get("t"), parser.get("c"), parser.get("w"), parser.get("s"), parser.get("u"), parser.get("f")); return 0; } diff --git a/cpp/src/cache.cpp b/cpp/src/cache.cpp deleted file mode 100644 index 071ff0f..0000000 --- a/cpp/src/cache.cpp +++ /dev/null @@ -1,163 +0,0 @@ -#include "cache.hpp" - -#include -#include -#include -#include -#include - -#include "utils.hpp" - -/* -==================== -cache file header -==================== - -uint32_t magic = "PCUB" -uint32_t n = cache file for n cubes in a polycube -uint32_t numShapes = number of different shapes in cachefile -------- - -==================== -shapetable: -==================== -shapeEntry { - uint8_t dim0 // offset by -1 - uint8_t dim1 // offset by -1 - uint8_t dim2 // offset by -1 - uint8_t reserved - uint64_t offset in file -} -shapeEntry[numShapes] - - -==================== -XYZ data -==================== - -*/ - -void Cache::save(std::string path, Hashy &hashes, uint8_t n) { - if (hashes.size() == 0) return; - std::ofstream ofs(path, std::ios::binary); - Header header; - header.magic = MAGIC; - header.n = n; - header.numShapes = hashes.byshape.size(); - header.numPolycubes = hashes.size(); - ofs.write((const char *)&header, sizeof(header)); - - std::vector keys; - keys.reserve(header.numShapes); - for (auto &pair : hashes.byshape) keys.push_back(pair.first); - std::sort(keys.begin(), keys.end()); - uint64_t offset = sizeof(Header) + header.numShapes * sizeof(ShapeEntry); - for (auto &key : keys) { - ShapeEntry se; - se.dim0 = key.x(); - se.dim1 = key.y(); - se.dim2 = key.z(); - se.reserved = 0; - se.offset = offset; - se.size = hashes.byshape[key].size() * XYZ_SIZE * n; - offset += se.size; - ofs.write((const char *)&se, sizeof(ShapeEntry)); - } - // put XYZs - for (auto &key : keys) { - for (auto &subset : hashes.byshape[key].byhash) - for (const auto &c : subset.set) { - if constexpr (sizeof(XYZ) == XYZ_SIZE) { - ofs.write((const char *)c.data(), sizeof(XYZ) * c.size()); - } 
else { - for (const auto &p : c) { - ofs.write((const char *)p.data, XYZ_SIZE); - } - } - } - } - - std::printf("saved %s\n\r", path.c_str()); -} - -Hashy Cache::load(std::string path, uint32_t extractShape) { - Hashy cubes; - auto ifs = std::ifstream(path, std::ios::binary); - if (!ifs.is_open()) return cubes; - Header header; - if (!ifs.read((char *)&header, sizeof(header))) { - return cubes; - } - // check magic - if (header.magic != MAGIC) { - return cubes; - } -#ifdef CACHE_LOAD_HEADER_ONLY - std::printf("loading cache file \"%s\" for N = %u", path.c_str(), header.n); - std::printf(", %u shapes, %lu XYZs\n\r", header.numShapes, header.numPolycubes); -#endif - auto cubeSize = XYZ_SIZE * header.n; - DEBUG_PRINTF("cubeSize: %u\n\r", cubeSize); - - for (uint32_t i = 0; i < header.numShapes; ++i) { - ShapeEntry shapeEntry; - if (!ifs.read((char *)&shapeEntry, sizeof(shapeEntry))) { - std::printf("ERROR reading ShapeEntry %u\n\r", i); - exit(-1); - } - if (ALL_SHAPES != extractShape && i != extractShape) continue; -#ifdef CACHE_PRINT_SHAPEENTRIES - std::printf("ShapeEntry %3u: [%2d %2d %2d] offset: 0x%08lx size: 0x%08lx (%ld polycubes)\n\r", i, shapeEntry.dim0, shapeEntry.dim1, shapeEntry.dim2, - shapeEntry.offset, shapeEntry.size, shapeEntry.size / cubeSize); -#endif - if (shapeEntry.size % cubeSize != 0) { - std::printf("ERROR shape block is not divisible by cubeSize!\n\r"); - exit(-1); - } -#ifndef CACHE_LOAD_HEADER_ONLY - // remember pos in file - auto pos = ifs.tellg(); - - // read XYZ contents - ifs.seekg(shapeEntry.offset); - const uint32_t CHUNK_SIZE = 512 * XYZ_SIZE; - uint8_t buf[CHUNK_SIZE] = {0}; - uint64_t buf_offset = 0; - uint32_t numCubes = shapeEntry.size / cubeSize; - XYZ shape(shapeEntry.dim0, shapeEntry.dim1, shapeEntry.dim2); - uint64_t readsize = shapeEntry.size - buf_offset; - if (readsize > CHUNK_SIZE) readsize = CHUNK_SIZE; - if (!ifs.read((char *)&buf, readsize)) { - std::printf("ERROR reading XYZs for Shape %u\n\r", i); - exit(-1); - } - 
for (uint32_t j = 0; j < numCubes; ++j) { - Cube next(header.n); - for (uint32_t k = 0; k < header.n; ++k) { - // check if buf contains next XYZ - uint64_t curr_offset = j * cubeSize + k * XYZ_SIZE; - if (curr_offset >= buf_offset + CHUNK_SIZE) { - // std::printf("reload buffer\n\r"); - buf_offset += CHUNK_SIZE; - readsize = shapeEntry.size - buf_offset; - if (readsize > CHUNK_SIZE) readsize = CHUNK_SIZE; - if (!ifs.read((char *)&buf, readsize)) { - std::printf("ERROR reading XYZs for Shape %u\n\r", i); - exit(-1); - } - } - - next.data()[k].data[0] = buf[curr_offset - buf_offset + 0]; - next.data()[k].data[1] = buf[curr_offset - buf_offset + 1]; - next.data()[k].data[2] = buf[curr_offset - buf_offset + 2]; - } - cubes.insert(next, shape); - } - - // restore pos - ifs.seekg(pos); -#endif - } - std::printf(" loaded %lu cubes\n\r", cubes.size()); - return cubes; -} diff --git a/cpp/src/cubeSwapSet.cpp b/cpp/src/cubeSwapSet.cpp new file mode 100644 index 0000000..979e7ab --- /dev/null +++ b/cpp/src/cubeSwapSet.cpp @@ -0,0 +1,212 @@ +#include "cubeSwapSet.hpp" + +#include +#include +#include + +/** + * thread-local read-cache for Cube(s) + */ +class ThreadCache { + public: + static ThreadCache& get(); + + struct entry { + // read-cache "key" + const CubeStorage* storage; + mapped::seekoff_t seek; + int version; + + friend bool operator==(const entry& a, const entry& b) { return std::tie(a.storage, a.seek, a.version) == std::tie(b.storage, b.seek, b.version); } + }; + + struct state { + // cached data. + Cube cube; + std::list::iterator lru; + }; + + struct entry_hash { + size_t operator()(const entry& x) const { + size_t seed = uintptr_t(x.storage); + seed ^= x.seek + 0x9e3779b9 + (seed << 6) + (seed >> 2); + seed ^= x.version + 0x9e3779b9 + (seed << 6) + (seed >> 2); + return seed; + }; + }; + + // Least-recently-used, LRU eviction policy list. 
+ std::list lru; + // trick: make map with reference_wrapper + // as key so we don't need to duplicate the data from the lru list. + // surprisingly C++17 cache.find(entry) works. + std::unordered_map, state, entry_hash, std::equal_to> cache; + + bool local_enabled = false; + mapped::seekoff_t local_seek = -1; + Cube local; +}; + +ThreadCache& ThreadCache::get() { + static thread_local ThreadCache instance; + return instance; +} + +std::atomic CubeStorage::m_init_num(0); + +CubeStorage::CubeStorage(std::filesystem::path path, size_t n) : m_cube_size(n) { + // Generate file name: + m_fpath = path / ("storage_" + std::to_string(m_init_num.fetch_add(1)) + ".bin"); +} + +CubeStorage::~CubeStorage() { discard(); } + +CubeStorage::CubeStorage(CubeStorage&& mv) + : m_fpath(std::move(mv.m_fpath)), m_file(std::move(mv.m_file)), m_cube_size(mv.m_cube_size), m_alloc_seek(mv.m_alloc_seek) { + // no allocations can exist in the moved from object: + assert(m_alloc_seek == 0); +} + +CubePtr CubeStorage::local(const Cube& cube) const { + auto& ctx = ThreadCache::get(); + ctx.local = cube; + ctx.local_seek = m_alloc_seek; + ctx.local_enabled = true; + return CubePtr(ctx.local_seek); +} + +void CubeStorage::commit() { + std::unique_lock lock(m_mtx); + + if (!m_file) { + using namespace mapped; + // file not open yet. + m_file = std::make_shared(); + if (m_file->openrw(m_fpath.c_str(), 0, file::CREATE | file::RESIZE | file::FSTUNE)) { + std::printf("CubeStorage::allocate() ERROR: Failed to create file: %s\n", m_fpath.c_str()); + std::abort(); + } + + // memory map 2 MiB chunk for writing. + // This also works as "pre-read-cache" for read(): + // Any CubePtr(s) in this window even if they + // are not yet in thread's read-cache have fast readAt(). 
+ m_file_head = std::make_unique(m_file, 0, 2 * 1024 * 1024); + } + auto datasize = m_cube_size * sizeof(XYZ); + auto write_fpos = m_alloc_seek; + + if(m_reserved_end < m_alloc_seek + datasize) { + // advance the backing file m_file_head to next 2 MiB chunk. + m_reserved_end += 2 * 1024 * 1024; + m_file_head->flushJump(m_reserved_end); + } + // advance write offset: + m_alloc_seek = write_fpos + datasize; + // allow parallel m_file_head->writeAt() calls: + lock.unlock(); + + auto& ctx = ThreadCache::get(); + assert(ctx.local_enabled); + assert(ctx.local_seek == write_fpos); + ctx.local_enabled = false; + + m_file_head->writeAt(write_fpos, datasize, ctx.local.data()); +} + +void CubeStorage::drop() const { + auto& ctx = ThreadCache::get(); + assert(ctx.local_enabled); + ctx.local_enabled = false; + ctx.local_seek = -1; +} + +const Cube& CubeStorage::read(const CubePtr& x) const { + // Get thread's cache instance: + auto& ctx = ThreadCache::get(); + + // Check if x is actually the object returned by local(): + if (ctx.local_enabled && x.seek() == ctx.local_seek) { + assert(ctx.local.size() == m_cube_size); + return ctx.local; + } + + ThreadCache::entry key{this, x.seek(), m_storage_version}; + auto itr = ctx.cache.find(key); + if (itr != ctx.cache.end()) { + // cache-hit. + // LRU policy simply moves the element at back of the list: + if (std::next(itr->second.lru) != ctx.lru.end()) { + ctx.lru.splice(itr->second.lru, ctx.lru, ctx.lru.end()); + } + return itr->second.cube; + } else { + // cache-miss. 
+ // Evict entry at front if read-cache is full: + if (ctx.cache.size() >= 1024) { + auto rm = ctx.cache.find(ctx.lru.front()); + ctx.cache.erase(rm); + ctx.lru.pop_front(); + } + + // Read Cube data + Cube tmp(m_cube_size); + m_file_head->readAt(x.seek(), m_cube_size * sizeof(XYZ), tmp.data()); + + // Move it into an new read-cache entry: + auto nitr = ctx.lru.insert(ctx.lru.end(), key); + auto [itr, ok] = ctx.cache.emplace(std::ref(*nitr), ThreadCache::state{std::move(tmp), nitr}); + assert(ok); + return itr->second.cube; + } +} + +void CubeStorage::resetReadCache() const { + auto& ctx = ThreadCache::get(); + ctx.cache.clear(); + ctx.lru.clear(); +} + +void CubeStorage::copydata(const CubePtr& x, size_t n, XYZ* destination) const { + // copydata() doesn't use thread's read-cache + // so local() cannot be active: + assert(!ThreadCache::get().local_enabled); + m_file_head->readAt(x.seek(), n * sizeof(XYZ), destination); +} + +void CubeStorage::discard() { + std::lock_guard lock(m_mtx); + + if (m_file) { + // The backing file is kept intact + // so that CacheWriter can process it. + m_file_head->flush(); + m_file_head.reset(); + m_file.reset(); + m_alloc_seek = 0; + m_reserved_end = 0; + // Thread read-cache problem: + // discard() must cause eviction of all read-cache + // entries for each thread's read cache that point into this. + // This done by incrementing m_storage_version: + // the entries can't simply be found as they are + // made with m_storage_version - 1 value. + // The entries are eventually evicted by + // the read-cache this way. + ++m_storage_version; + } +} + +const Cube& CubePtr::get(const CubeStorage& storage) const { + // CubePtr::get() is really just an convenience function... + // However this cannot be implemented in the header file because + // CubeStorage definition is not known. 
+ return storage.read(*this); +} + +void CubePtr::copyout(const CubeStorage& storage, size_t n, XYZ* out) const { + // CubePtr::copyout() is really just an convenience function... + // However this cannot be implemented in the header file because + // CubeStorage definition is not known. + storage.copydata(*this, n, out); +} \ No newline at end of file diff --git a/cpp/src/cubes.cpp b/cpp/src/cubes.cpp index cdcc5b4..ec34936 100644 --- a/cpp/src/cubes.cpp +++ b/cpp/src/cubes.cpp @@ -2,13 +2,14 @@ #include #include +#include #include +#include #include #include #include #include -#include "cache.hpp" #include "cube.hpp" #include "hashes.hpp" #include "newCache.hpp" @@ -19,28 +20,29 @@ const int PERF_STEP = 500; struct Workset { std::mutex mu; - CubeIterator _begin_total; - CubeIterator _begin; - CubeIterator _end; + + CacheReader cr; + CacheIterator _begin_total; + CacheIterator _begin; + CacheIterator _end; Hashy &hashes; XYZ targetShape, shape, expandDim; bool notSameShape; - Workset(ShapeRange &data, Hashy &hashes, XYZ targetShape, XYZ shape, XYZ expandDim, bool notSameShape) - : _begin_total(data.begin()) - , _begin(data.begin()) - , _end(data.end()) - , hashes(hashes) - , targetShape(targetShape) - , shape(shape) - , expandDim(expandDim) - , notSameShape(notSameShape) {} + Workset(Hashy &hashes, XYZ targetShape, XYZ shape, XYZ expandDim, bool notSameShape) + : hashes(hashes), targetShape(targetShape), shape(shape), expandDim(expandDim), notSameShape(notSameShape) {} + + void setRange(IShapeRange &data) { + _begin_total = data.begin(); + _begin = data.begin(); + _end = data.end(); + } struct Subset { - CubeIterator _begin, _end; + CacheIterator _begin, _end; bool valid; float percent; - auto begin() { return _begin; } - auto end() { return _end; } + CacheIterator begin() { return _begin; } + CacheIterator end() { return _end; } }; Subset getPart() { @@ -48,7 +50,7 @@ struct Workset { auto a = _begin; _begin += 500; if (_begin > _end) _begin = _end; - return {a, 
_begin, a < _end, 100 * (float)((uint64_t)a.m_ptr - (uint64_t)_begin_total.m_ptr) / ((uint64_t)_end.m_ptr - (uint64_t)_begin_total.m_ptr)}; + return {a, _begin, a < _end, 100 * float(a.seek() - _begin_total.seek() + 1) / (_end.seek() - _begin_total.seek() + 1)}; } void expand(const Cube &c) { @@ -87,14 +89,14 @@ struct Workset { std::set_difference(candidates.begin(), end, c.begin(), c.end(), std::back_inserter(tmp)); candidates = std::move(tmp); - DEBUG_PRINTF("candidates: %lu\n\r", candidates.size()); + DEBUG1_PRINTF("candidates: %lu\n\r", candidates.size()); Cube newCube(c.size() + 1); Cube lowestHashCube(newCube.size()); Cube rotatedCube(newCube.size()); for (const auto &p : candidates) { - DEBUG_PRINTF("(%2d %2d %2d)\n\r", p.x(), p.y(), p.z()); + DEBUG2_PRINTF("(%2d %2d %2d)\n\r", p.x(), p.y(), p.z()); int ax = (p.x() < 0) ? 1 : 0; int ay = (p.y() < 0) ? 1 : 0; int az = (p.z() < 0) ? 1 : 0; @@ -131,26 +133,67 @@ struct Workset { }; struct Worker { - Workset &ws; + std::shared_ptr ws; int id; - Worker(Workset &ws_, int id_) : ws(ws_), id(id_) {} + int state = 3; // 1 == completed/waiting for job, 2 == processing, 3 == job assigned. 
+ std::mutex mtx; + std::condition_variable cond; + std::condition_variable cond2; + std::thread thr; + + Worker(int id_) : id(id_), thr(&Worker::run, this) {} + ~Worker() { + std::unique_lock lock(mtx); + state = 0; + cond.notify_one(); + lock.unlock(); + thr.join(); + } + + void launch(std::shared_ptr ws_) { + std::unique_lock lock(mtx); + while (state != 1) { + cond2.wait(lock); + } + ws = ws_; + state = 3; + cond.notify_one(); + } + + void sync() { + std::unique_lock lock(mtx); + while (state != 1) { + cond2.wait(lock); + } + ws.reset(); + } + void run() { - // std::printf("start %d\n", id); - auto subset = ws.getPart(); - while (subset.valid) { - if (id == 0) { - std::printf(" %5.2f%%\r", subset.percent); - std::flush(std::cout); - } - // std::cout << id << " next subset " << &*subset.begin() << " to " << &*subset.end() << "\n"; - for (auto &c : subset) { - // std::printf("%p\n", (void *)&c); - // c.print(); - ws.expand(c); + std::unique_lock lock(mtx); + std::printf("thread nro. %d started.\n", id); + while (state) { + state = 1; + cond2.notify_one(); + while (state == 1) cond.wait(lock); + if (!state) return; + state = 2; + // std::printf("start %d\n", id); + auto subset = ws->getPart(); + while (subset.valid) { + if (id == 0) { + std::printf(" %5.2f%%\r", subset.percent); + std::flush(std::cout); + } + // std::cout << id << " next subset " << &*subset.begin() << " to " << &*subset.end() << "\n"; + for (auto &c : subset) { + // std::printf("%p\n", (void *)&c); + // c.print(); + ws->expand(c); + } + subset = ws->getPart(); } - subset = ws.getPart(); + // std::printf("finished %d\n", id); } - // std::printf("finished %d\n", id); } }; @@ -158,7 +201,7 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c if (!std::filesystem::is_directory(base_path)) { std::filesystem::create_directory(base_path); } - Hashy hashes; + Hashy hashes(base_path); if (n < 1) return {}; else if (n == 1) { @@ -166,7 +209,8 @@ FlatCache gen(int n, int 
threads, bool use_cache, bool write_cache, bool split_c hashes.insert(Cube{{XYZ(0, 0, 0)}}, XYZ(0, 0, 0)); std::printf("%ld elements for %d\n\r", hashes.size(), n); if (write_cache) { - Cache::save(base_path + "cubes_" + std::to_string(n) + ".bin", hashes, n); + CacheWriter cw(1); + cw.save(base_path + "cubes_" + std::to_string(n) + ".bin", hashes, n); } return FlatCache(hashes, n); } @@ -185,15 +229,26 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c } std::printf("N = %d || generating new cubes from %lu base cubes.\n\r", n, base->size()); hashes.init(n); + + // Start worker threads. + std::deque workers; + for (int i = 0; i < threads; ++i) { + workers.emplace_back(i); + } + + CacheWriter cw(threads); + uint64_t totalSum = 0; auto start = std::chrono::steady_clock::now(); - uint32_t totalOutputShapes = hashes.byshape.size(); + uint32_t totalOutputShapes = hashes.numShapes(); uint32_t outShapeCount = 0; auto prevShapes = Hashy::generateShapes(n - 1); - for (auto &tup : hashes.byshape) { + + for (const auto &tup : hashes) { outShapeCount++; XYZ targetShape = tup.first; std::printf("process output shape %3d/%d [%2d %2d %2d]\n\r", outShapeCount, totalOutputShapes, targetShape.x(), targetShape.y(), targetShape.z()); + for (uint32_t sid = 0; sid < prevShapes.size(); ++sid) { auto &shape = prevShapes[sid]; int diffx = targetShape.x() - shape.x(); @@ -210,51 +265,62 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c if (diffy == 1) if (shape.y() == shape.x()) diffx = 1; - std::printf(" shape %d %d %d\n\r", shape.x(), shape.y(), shape.z()); + auto ws = std::make_shared(hashes, targetShape, shape, XYZ(diffx, diffy, diffz), abssum); if (use_split_cache) { // load cache file only for this shape std::string cachefile = base_path + "cubes_" + std::to_string(n - 1) + "_" + std::to_string(prevShapes[sid].x()) + "-" + std::to_string(prevShapes[sid].y()) + "-" + std::to_string(prevShapes[sid].z()) + ".bin"; - 
cr.loadFile(cachefile); + ws->cr.loadFile(cachefile); + base = &ws->cr; // cr.printHeader(); } - auto s = base->getCubesByShape(sid); + auto& s = base->getCubesByShape(sid); if (shape != s.shape()) { std::printf("ERROR caches shape does not match expected shape!\n"); exit(-1); } - // std::printf("starting %d threads\n\r", threads); - std::vector ts; - Workset ws(s, hashes, targetShape, shape, XYZ(diffx, diffy, diffz), abssum); - std::vector workers; - ts.reserve(threads); - workers.reserve(threads); - for (int i = 0; i < threads; ++i) { - workers.emplace_back(ws, i); - ts.emplace_back(&Worker::run, std::ref(workers[i])); + + ws->setRange(s); + + // Wait for jobs to complete. + for (auto &thr : workers) { + thr.sync(); } - for (int i = 0; i < threads; ++i) { - ts[i].join(); + std::printf(" shape %d %d %d\n\r", shape.x(), shape.y(), shape.z()); + // launch the new jobs. + // Because the workset is held by shared_ptr + // main thread can do above preparation work in parallel + // while the jobs are running. + for (auto &thr : workers) { + thr.launch(ws); } } - std::printf(" num: %lu\n\r", hashes.byshape[targetShape].size()); - totalSum += hashes.byshape[targetShape].size(); + // Wait for jobs to complete. 
+ for (auto &thr : workers) { + thr.sync(); + } + std::printf(" num: %lu\n\r", hashes.at(targetShape).size()); + totalSum += hashes.at(targetShape).size(); if (write_cache && split_cache) { - Cache::save(base_path + "cubes_" + std::to_string(n) + "_" + std::to_string(targetShape.x()) + "-" + std::to_string(targetShape.y()) + "-" + - std::to_string(targetShape.z()) + ".bin", - hashes, n); + cw.save(base_path + "cubes_" + std::to_string(n) + "_" + std::to_string(targetShape.x()) + "-" + std::to_string(targetShape.y()) + "-" + + std::to_string(targetShape.z()) + ".bin", + hashes, n); } if (split_cache) { - for (auto &subset : hashes.byshape[targetShape].byhash) { - subset.set.clear(); - subset.set.reserve(1); - } + hashes.at(targetShape).clear(); } } + + // Stop the workers. + workers.clear(); + if (write_cache && !split_cache) { - Cache::save(base_path + "cubes_" + std::to_string(n) + ".bin", hashes, n); + cw.save(base_path + "cubes_" + std::to_string(n) + ".bin", hashes, n); } + + cw.flush(); + auto end = std::chrono::steady_clock::now(); auto dt_ms = std::chrono::duration_cast(end - start).count(); std::printf("took %.2f s\033[0K\n\r", dt_ms / 1000.f); diff --git a/cpp/src/newCache.cpp b/cpp/src/newCache.cpp index ef925dc..9e0c54e 100644 --- a/cpp/src/newCache.cpp +++ b/cpp/src/newCache.cpp @@ -1,13 +1,10 @@ -#include "../include/newCache.hpp" - -#include -#include -#include +#include "newCache.hpp" #include -CacheReader::CacheReader() - : filePointer(nullptr), path_(""), fileDescriptor_(-1), fileSize_(0), fileLoaded_(false), dummyHeader{0, 0, 0, 0}, header(&dummyHeader), shapes(nullptr) {} +#include "cubeSwapSet.hpp" + +CacheReader::CacheReader() : path_(""), fileLoaded_(false), dummyHeader{0, 0, 0, 0}, header(&dummyHeader), shapes(nullptr) {} void CacheReader::printHeader() { if (fileLoaded_) { @@ -33,59 +30,272 @@ int CacheReader::printShapes(void) { int CacheReader::loadFile(const std::string path) { unload(); path_ = path; - fileDescriptor_ = 
open(path.c_str(), O_RDONLY); - if (fileDescriptor_ == -1) { + // open read-only backing file: + file_ = std::make_shared(); + if (file_->open(path.c_str())) { std::printf("error opening file\n"); return 1; } - // get filesize - fileSize_ = lseek(fileDescriptor_, 0, SEEK_END); - lseek(fileDescriptor_, 0, SEEK_SET); + // map the header struct + header_ = std::make_unique>(file_, 0); + header = header_->get(); + + if (header->magic != cacheformat::MAGIC) { + std::printf("error opening file: file not recognized\n"); + return 1; + } + + // map the ShapeEntry array: + shapes_ = std::make_unique>(file_, header_->getEndSeek(), (*header_)->numShapes); + shapes = shapes_->get(); - // memory map file - filePointer = (uint8_t*)mmap(NULL, fileSize_, PROT_READ, MAP_SHARED, fileDescriptor_, 0); - if (filePointer == MAP_FAILED) { - // error handling - std::printf("errorm mapping file memory"); - close(fileDescriptor_); - return 2; + // Initialize ShapeRanges + size_t datasize = 0; + for (unsigned int i = 0; i < header->numShapes; ++i) { + datasize += shapes[i].size; } - header = (Header*)(filePointer); - shapes = (ShapeEntry*)(filePointer + sizeof(Header)); + if (file_->size() != shapes_->getEndSeek() + datasize) { + std::printf("warn: file size does not match expected value\n"); + } + + // Initialize shapeRanges array: + for (unsigned int i = 0; i < header->numShapes; ++i) { + if (shapes[i].size) { + auto start = shapes[i].offset; + auto end = start + shapes[i].size; + shapeRanges.emplace_back(file_, start, end, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2)); + } else { + // table entry has no data. + // shapes[i].offset may have bogus value. 
+ shapeRanges.emplace_back(file_, -1, -1, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2)); + } + } + + // Add dummy entry at back: + shapeRanges.emplace_back(file_, -1, -1, header->n, XYZ(0, 0, 0)); fileLoaded_ = true; return 0; } -ShapeRange CacheReader::getCubesByShape(uint32_t i) { +Cube CubeReadIterator::read() const { + Cube tmp(n); + m_file->readAt(m_seek, n * sizeof(XYZ), tmp.data()); + return tmp; +} + + +IShapeRange &CacheReader::getCubesByShape(uint32_t i) { if (i >= header->numShapes) { - return ShapeRange{nullptr, nullptr, 0, XYZ(0, 0, 0)}; + return shapeRanges.back(); } - XYZ* start = reinterpret_cast(filePointer + shapes[i].offset); - XYZ* end = reinterpret_cast(filePointer + shapes[i].offset + shapes[i].size); - return ShapeRange(start, end, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2)); + + return shapeRanges[i]; } void CacheReader::unload() { - // unmap file from memory + // unload file from memory if (fileLoaded_) { - if (munmap(filePointer, fileSize_) == -1) { - // error handling - std::printf("error unmapping file\n"); - } - - // close file descriptor - close(fileDescriptor_); + shapeRanges.clear(); + shapes_.reset(); + header_.reset(); + file_.reset(); fileLoaded_ = false; } - fileDescriptor_ = -1; - filePointer = nullptr; header = &dummyHeader; shapes = nullptr; } CacheReader::~CacheReader() { unload(); } + +CacheWriter::CacheWriter(int num_threads) { + for (int i = 0; i < num_threads; ++i) { + m_flushers.emplace_back(&CacheWriter::run, this); + } +} + +CacheWriter::CacheWriter::~CacheWriter() { + flush(); + // stop the threads. 
+ std::unique_lock lock(m_mtx); + m_active = false; + m_run.notify_all(); + lock.unlock(); + for (auto &thr : m_flushers) thr.join(); +} + +void CacheWriter::run() { + std::unique_lock lock(m_mtx); + while (m_active) { + // do copy jobs: + if (!m_copy.empty()) { + auto task = std::move(m_copy.front()); + m_copy.pop_front(); + lock.unlock(); + + task(); + + lock.lock(); + --m_num_copys; + continue; + } + // file flushes: + if (!m_flushes.empty()) { + auto task = std::move(m_flushes.front()); + m_flushes.pop_front(); + lock.unlock(); + + task(); + + lock.lock(); + --m_num_flushes; + continue; + } + // notify that we are done here. + m_wait.notify_one(); + // wait for jobs. + m_run.wait(lock); + } + m_wait.notify_one(); +} + +void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) { + if (hashes.size() == 0) return; + + using namespace mapped; + using namespace cacheformat; + + auto file_ = std::make_shared(); + if (file_->openrw(path.c_str(), 0)) { + std::printf("error opening file\n"); + return; + } + + // Write header: + auto header = std::make_shared>(file_, 0); + (*header)->magic = cacheformat::MAGIC; + (*header)->n = n; + (*header)->numShapes = hashes.numShapes(); + (*header)->numPolycubes = hashes.size(); + header->flush(); + + std::vector keys; + keys.reserve((*header)->numShapes); + for (auto &pair : hashes) keys.push_back(pair.first); + std::sort(keys.begin(), keys.end()); + + // Write shape table: + auto shapeEntry = std::make_shared>(file_, header->getEndSeek(), (*header)->numShapes); + header.reset(); + + static_assert(XYZ_SIZE == sizeof(XYZ), "XYZ_SIZE differs from sizeof(XYZ)"); + + uint64_t offset = shapeEntry->getEndSeek(); + size_t num_cubes = 0; + int i = 0; + for (auto &key : keys) { + auto &se = (*shapeEntry)[i++]; + se.dim0 = key.x(); + se.dim1 = key.y(); + se.dim2 = key.z(); + se.reserved = 0; + se.offset = offset; + auto count = hashes.at(key).size(); + num_cubes += count; + se.size = count * XYZ_SIZE * n; + offset += se.size; + } + 
shapeEntry->flush(); + + // put XYZs + // Schedule merging of the cache file. + // CubeSwapSet enables massive optimizations in how + // CacheWriter can merge the SubsubHashy's data into the final cache file: + // - copystorage lambda takes the source file and it's file name from the + // SubsubHashy::storage() returned CubeStorage. + // - mapped::file::copyAt() is used to efficiently copy the source file contents into this cache file + // - Finally the copystorage lambda *deletes* the source storage file + // The main program does not need to wait for this process to complete. + + // copystorage takes shared ownership of the file_ + auto copystorage = [n, file = file_](std::shared_ptr src, std::filesystem::path rmname, size_t num, mapped::seekoff_t dest) -> void { + file->copyAt(src, 0, num * n * sizeof(XYZ), dest); + src.reset(); + + // Try remove the source storage file. + std::error_code ec; + auto stat = std::filesystem::status(rmname, ec); + if (!ec && std::filesystem::is_regular_file(stat)) { + if (!std::filesystem::remove(rmname, ec)) { + std::printf("WARN: failed to remove file: %s", rmname.c_str()); + } + } else { + std::printf("WARN: failed to get file status: %s", rmname.c_str()); + } + }; + + mapped::seekoff_t fileEnd = shapeEntry->getEndSeek(); + auto time_start = std::chrono::steady_clock::now(); + for (size_t i = 0; i < keys.size(); ++i) { + auto put = (*shapeEntry)[i].offset; + for (auto &subset : hashes.at(keys[i])) { + ptrdiff_t num = subset.size(); + if (num) { + // By pass iterating the Subsubhashy entirely + // and copy the data from CubeStorage file *directly* into this file. + // the Cube data does end up in different order than when copying one-by-one. + // But we don't care as the order is random already. + // the copy job also deletes the CubeStorage::fileName() file from the disk + // once the data copy completes. 
+ std::unique_lock lock(m_mtx); + m_copy.emplace_back(std::bind(copystorage, subset.storage().getFile(), subset.storage().fileName(), num, put)); + ++m_num_copys; + m_run.notify_all(); + std::printf("scheduled copy jobs: %*d ... \r", 3, (int)m_num_copys); + std::flush(std::cout); + } + put += num * n * XYZ_SIZE; + } + fileEnd = std::max(fileEnd, put); + } + shapeEntry.reset(); + + // sync up a bit. + // don't allow the copy job queue to grow indefinitely + // if the disk can't keep up. + std::unique_lock lock(m_mtx); + while (m_num_copys > m_flushers.size()) { + std::printf("waiting for %*d copy jobs to complete ... \r", 3, (int)m_num_copys); + std::flush(std::cout); + m_wait.wait(lock); + } + + // move the file into flush job. + m_flushes.emplace_back(std::bind( + [fileEnd](auto &&file) -> void { + file->truncate(fileEnd); + file->close(); + file.reset(); + }, + std::move(file_))); + ++m_num_flushes; + m_run.notify_all(); + + auto time_end = std::chrono::steady_clock::now(); + auto dt_ms = std::chrono::duration_cast(time_end - time_start).count(); + + std::printf("saved %s, took %.2f s\n\r", path.c_str(), dt_ms / 1000.f); +} + +void CacheWriter::flush() { + std::unique_lock lock(m_mtx); + while (m_num_flushes) { + std::printf("%*d copy jobs total remaining on %*d files ... 
\r", 3, (int)m_num_copys, 2, (int)m_num_flushes); + std::flush(std::cout); + m_wait.wait(lock); + } +} diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b30d160..42e0014 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -19,4 +19,5 @@ add_executable(${PROJECT_NAME} $ ${TESTS}) target_link_libraries(GTest::GTest INTERFACE gtest_main) target_link_libraries(${PROJECT_NAME} pthread GTest::GTest) +target_link_libraries(${PROJECT_NAME} mapped_file) ConfigureTarget(${PROJECT_NAME}) diff --git a/cpp/tests/src/test_cache.cpp b/cpp/tests/src/test_cache.cpp deleted file mode 100644 index ae10cbf..0000000 --- a/cpp/tests/src/test_cache.cpp +++ /dev/null @@ -1,10 +0,0 @@ -#include - -#include "cache.hpp" - -TEST(CacheTests, TestCacheLoadDoesNotThrow) { EXPECT_NO_THROW(Cache::load("./test_data.bin")); } - -TEST(CacheTests, TestCacheSaveDoesNotThrow) { - auto data = Cache::load("./test_data.bin"); - EXPECT_NO_THROW(Cache::save("./temp.bin", data, 255)); -} \ No newline at end of file diff --git a/rust/src/cli/cli.rs b/rust/src/cli/cli.rs index b298e5d..d50654b 100644 --- a/rust/src/cli/cli.rs +++ b/rust/src/cli/cli.rs @@ -1,16 +1,16 @@ use std::{ collections::{BTreeMap, HashSet}, path::PathBuf, - time::{Duration, Instant}, + time::Duration, }; use clap::{Args, Parser, Subcommand, ValueEnum}; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use opencubes::{naive_polycube::NaivePolyCube, pcube::PCubeFile}; -use rayon::prelude::{IntoParallelIterator, ParallelIterator}; mod enumerate; use enumerate::enumerate; +use rayon::prelude::{IntoParallelIterator, IntoParallelRefIterator, ParallelIterator}; fn finish_bar(bar: &ProgressBar, duration: Duration, expansions: usize, n: usize) { let time = duration.as_micros(); @@ -37,7 +37,17 @@ fn finish_bar(bar: &ProgressBar, duration: Duration, expansions: usize, n: usize } fn unknown_bar() -> ProgressBar { - let style = ProgressStyle::with_template("[{elapsed_precise}] 
[{spinner:10.cyan/blue}] {msg}") + unknown_bar_with_pos(false) +} + +fn unknown_bar_with_pos(with_pos: bool) -> ProgressBar { + let template = if with_pos { + "[{elapsed_precise}] [{spinner:10.cyan/blue}] {pos} {msg}" + } else { + "[{elapsed_precise}] [{spinner:10.cyan/blue}] {msg}" + }; + + let style = ProgressStyle::with_template(template) .unwrap() .tick_strings(&[ ">---------", @@ -204,33 +214,36 @@ pub fn validate(opts: &ValidateArgs) -> std::io::Result<()> { let in_memory = !opts.no_in_memory; let n = opts.n; - println!("Validating {}", path); + let file = PCubeFile::new_file(path)?; + let canonical = file.canonical(); + let len = file.len(); + + let bar = if let Some(len) = len { + make_bar(len as u64) + } else { + unknown_bar_with_pos(true) + }; + + bar.set_message("cubes validated"); + + bar.println(format!("Validating {}", path)); let mut uniqueness = match (in_memory, uniqueness) { (true, true) => { - eprintln!("Verifying uniqueness."); + bar.println("Verifying uniqueness."); Some(HashSet::new()) } (false, true) => { + bar.abandon(); println!("Cannot verify uniqueness without placing all entries in memory. Re-run with `--no-uniqueness` enabled to run."); std::process::exit(1); } (_, false) => { - eprintln!("Not verifying uniqueness"); + bar.println("Not verifying uniqueness"); None } }; - let file = PCubeFile::new_file(path)?; - let canonical = file.canonical(); - let len = file.len(); - - let bar = if let Some(len) = len { - make_bar(len as u64) - } else { - unknown_bar() - }; - let exit = |msg: &str| { bar.abandon(); println!("{msg}"); @@ -238,21 +251,18 @@ pub fn validate(opts: &ValidateArgs) -> std::io::Result<()> { }; match (canonical, validate_canonical) { - (true, true) => eprintln!("Verifying entry canonicality. File indicates that entries are canonical."), - (false, true) => eprintln!("Not verifying entry canonicality. File header does not indicate that entries are canonical"), - (true, false) => eprintln!("Not verifying entry canonicality. 
File header indicates that they are, but check is disabled."), - (false, false) => eprintln!("Not verifying canonicality. File header does not indicate that entries are canonical, and check is disabled.") + (true, true) => bar.println("Verifying entry canonicality. File indicates that entries are canonical."), + (false, true) => bar.println("Not verifying entry canonicality. File header does not indicate that entries are canonical"), + (true, false) => bar.println("Not verifying entry canonicality. File header indicates that they are, but check is disabled."), + (false, false) => bar.println("Not verifying canonicality. File header does not indicate that entries are canonical, and check is disabled.") } if let Some(n) = n { - eprintln!("Verifying that all entries are N = {n}"); + bar.println(format!("Verifying that all entries are N = {n}")); } let mut total_read = 0; - let mut last_tick = Instant::now(); - bar.tick(); - for cube in file { let cube = match cube { Ok(c) => NaivePolyCube::from(c), @@ -264,14 +274,7 @@ pub fn validate(opts: &ValidateArgs) -> std::io::Result<()> { total_read += 1; - if len.is_some() { - bar.inc(1); - } else if last_tick.elapsed() >= Duration::from_millis(66) { - last_tick = Instant::now(); - bar.set_message(format!("{total_read}")); - bar.inc(1); - bar.tick(); - } + bar.inc(1); let mut form: Option = None; let canonical_form = || cube.pcube_canonical_form(); @@ -297,10 +300,10 @@ pub fn validate(opts: &ValidateArgs) -> std::io::Result<()> { exit("Found non-unique polycubes."); } } - - bar.finish(); } + bar.finish(); + println!("Success: {path}, containing {total_read} cubes, is valid"); Ok(()) @@ -318,7 +321,7 @@ pub fn convert(opts: &ConvertArgs) { // that the longest files are yielded last. 
let files: BTreeMap<_, _> = opts .path - .iter() + .par_iter() .map(|path| { let input_file = match PCubeFile::new_file(&path) { Ok(f) => f, @@ -327,6 +330,7 @@ pub fn convert(opts: &ConvertArgs) { std::process::exit(1); } }; + (input_file.len(), (input_file, path.to_string())) }) .collect(); @@ -334,36 +338,43 @@ pub fn convert(opts: &ConvertArgs) { // Iterate over the files and do some printing, in-order let files: Vec<_> = files .into_iter() - .map(|(_, (input_file, path))| { + .map(|(len, (input_file, path))| { let output_path = opts.output_path.clone().unwrap_or(path.clone()); - println!("Converting file {}", path); - println!("Final output path: {output_path}"); + multi_bar + .println(format!("Converting file {}", path)) + .unwrap(); + multi_bar + .println(format!("Final output path: {output_path}")) + .unwrap(); + if opts.canonicalize { - println!("Canonicalizing output"); + multi_bar.println("Canonicalizing output").unwrap(); } - println!("Input compression: {:?}", input_file.compression()); - println!("Output compression: {:?}", opts.compression); - - let len = input_file.len(); + multi_bar + .println(format!("Input compression: {:?}", input_file.compression())) + .unwrap(); + multi_bar + .println(format!("Output compression: {:?}", opts.compression)) + .unwrap(); let bar = if let Some(len) = len { make_bar(len as u64) } else { - unknown_bar() + unknown_bar_with_pos(true) }; let bar = multi_bar.add(bar); - (input_file, path, output_path, len, bar) + (input_file, path, output_path, bar) }) .collect(); // Convert, in parallel files .into_par_iter() - .for_each(|(input_file, path, output_path, len, bar)| { - bar.set_message(path.to_string()); + .for_each(|(input_file, path, output_path, bar)| { + bar.set_message(format!("cubes converted for {path}")); let canonical = input_file.canonical(); let mut output_path_temp = PathBuf::from(&output_path); @@ -373,12 +384,7 @@ pub fn convert(opts: &ConvertArgs) { output_path_temp.pop(); output_path_temp.push(filename); 
- let mut total_read = 0; - let mut last_tick = Instant::now(); - let input = input_file.filter_map(|v| { - total_read += 1; - let cube = match v { Ok(v) => Some(v), Err(e) => { @@ -388,14 +394,7 @@ pub fn convert(opts: &ConvertArgs) { } }?; - if len.is_some() { - bar.inc(1); - } else if last_tick.elapsed() >= Duration::from_millis(66) { - last_tick = Instant::now(); - bar.set_message(format!("{total_read}")); - bar.inc(1); - bar.tick(); - } + bar.inc(1); if opts.canonicalize { Some(NaivePolyCube::from(cube).canonical_form().into()) @@ -421,7 +420,7 @@ pub fn convert(opts: &ConvertArgs) { if !bar.is_finished() { match std::fs::rename(output_path_temp, output_path) { - Ok(_) => bar.finish_with_message(format!("{path} Done!")), + Ok(_) => bar.finish(), Err(e) => { bar.abandon_with_message(format!("{path} Failed to write final file: {e}")); return; diff --git a/rust/src/pcube/compression.rs b/rust/src/pcube/compression.rs index 61ea73b..8e09bdc 100644 --- a/rust/src/pcube/compression.rs +++ b/rust/src/pcube/compression.rs @@ -1,7 +1,9 @@ -use std::io::{BufReader, Read, Write}; +use std::io::{BufReader, BufWriter, Read, Write}; use flate2::{read::GzDecoder, write::GzEncoder}; +const BUF_SIZE: usize = 1024 * 16384; + /// Compression types supported for `.pcube` files. 
#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Compression { @@ -45,7 +47,7 @@ where { pub fn new(compression: Compression, reader: T) -> Self { match compression { - Compression::None => Self::Uncompressed(BufReader::new(reader)), + Compression::None => Self::Uncompressed(BufReader::with_capacity(BUF_SIZE, reader)), Compression::Gzip => Self::Gzip(GzDecoder::new(reader)), } } @@ -74,7 +76,7 @@ pub enum Writer where T: Write, { - Uncompressed(T), + Uncompressed(BufWriter), Gzip(GzEncoder), } @@ -84,7 +86,7 @@ where { pub fn new(compression: Compression, writer: T) -> Self { match compression { - Compression::None => Self::Uncompressed(writer), + Compression::None => Self::Uncompressed(BufWriter::with_capacity(BUF_SIZE, writer)), Compression::Gzip => Self::Gzip(GzEncoder::new(writer, flate2::Compression::default())), } } diff --git a/rust/src/pcube/mod.rs b/rust/src/pcube/mod.rs index 8dc6175..c9885d2 100644 --- a/rust/src/pcube/mod.rs +++ b/rust/src/pcube/mod.rs @@ -2,7 +2,7 @@ use std::{ fs::File, - io::{ErrorKind, Read, Seek, Write}, + io::{ErrorKind, Read, Write}, iter::Peekable, path::Path, }; @@ -93,25 +93,7 @@ where let [orientation, compression] = header; let canonicalized = orientation != 0; - let mut cube_count: u64 = 0; - let mut shift = 0; - loop { - let mut next_byte = [0u8; 1]; - input.read_exact(&mut next_byte)?; - - let [next_byte] = next_byte; - - cube_count |= ((next_byte & 0x7F) as u64) << shift; - - shift += 7; - if shift > 64 { - panic!("Cannot load possibly more than u64 cubes..."); - } - - if next_byte & 0x80 == 0 { - break; - } - } + let cube_count = PCubeFile::read_leb128(&mut input)?; let len = if cube_count == 0 { None @@ -177,54 +159,102 @@ impl PCubeFile { Self::new(file) } - /// Write implementation - fn write_impl( - write_magic: bool, - mut cubes: I, - is_canonical: bool, - compression: Compression, - mut write: W, - ) -> std::io::Result<()> - where - I: Iterator, - W: Write, - { - if write_magic { - write.write_all(&MAGIC)?; - 
} + fn read_leb128(mut reader: impl Read) -> std::io::Result { + let mut cube_count: u64 = 0; + let mut shift = 0; + loop { + let mut next_byte = [0u8; 1]; + reader.read_exact(&mut next_byte)?; - let compression_val = compression.into(); - let orientation_val = if is_canonical { 1 } else { 0 }; + let [next_byte] = next_byte; - write.write_all(&[orientation_val, compression_val])?; + let is_last_byte = (next_byte & 0x80) == 0x00; + let value = (next_byte & 0x7F) as u64; - let mut cube_count = 0; - let (_, max) = cubes.size_hint(); + if shift > 63 && value != 0 || shift > 56 && value > 1 { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "Cannot load more than u64 cubes", + )); + } + + cube_count |= value.overflowing_shl(shift).0; + shift += 7; - if let Some(max) = max { - cube_count = max; + if is_last_byte { + break; + } } + return Ok(cube_count); + } + + /// Write a leb128 value + /// + /// If `prefill` is `true`, this function will always + /// write 10 bytes of data describing `number`. + fn write_leb128(mut number: u64, mut writer: impl Write, prefill: bool) -> std::io::Result<()> { let mut ran_once = false; - while cube_count > 0 || !ran_once { + let mut bytes_written = 0; + while number > 0 || !ran_once || (prefill && bytes_written < 10) { ran_once = true; - let mut next_byte = (cube_count as u8) & 0x7F; - cube_count >>= 7; + let mut next_byte = (number as u8) & 0x7F; + number >>= 7; - if cube_count > 0 { + if number > 0 || (prefill && bytes_written != 9) { next_byte |= 0x80; } - write.write_all(&[next_byte])?; + writer.write_all(&[next_byte])?; + bytes_written += 1; } + Ok(()) + } + + /// Write the header + /// + /// If `prefill_len` is `true`, the length is _always_ written + /// as 10 bytes. This way, rewriting the header in-place is possible. 
+ fn write_header( + mut write: impl Write, + magic: [u8; 4], + is_canonical: bool, + compression: Compression, + cube_count: Option, + prefill_len: bool, + ) -> std::io::Result<()> { + let compression_val = compression.into(); + let orientation_val = if is_canonical { 1 } else { 0 }; + + let cube_count = cube_count.unwrap_or(0); + + write.write_all(&magic)?; + write.write_all(&[orientation_val, compression_val])?; + Self::write_leb128(cube_count, &mut write, prefill_len)?; + + Ok(()) + } + + /// Write implementation + fn write_impl(cubes: I, compression: Compression, write: W) -> std::io::Result + where + I: Iterator, + W: Write, + { let mut writer = Writer::new(compression, write); - if let Some(e) = cubes.find_map(|v| v.pack(&mut writer).err()) { + let mut cube_count = 0; + if let Some(e) = cubes + .inspect(|_| cube_count += 1) + .find_map(|v| v.pack(&mut writer).err()) + { return Err(e); } - Ok(()) + writer.flush()?; + + Ok(cube_count) } /// Write the [`RawPCube`]s produced by `I` into `W`. 
@@ -235,13 +265,43 @@ impl PCubeFile { is_canonical: bool, compression: Compression, cubes: I, - write: W, + mut write: W, + ) -> std::io::Result + where + I: Iterator, + W: std::io::Write, + { + let len = cubes.size_hint().1.map(|v| v as u64); + + Self::write_header(&mut write, MAGIC, is_canonical, compression, len, false)?; + + Self::write_impl(cubes, compression, write) + } + + pub fn write_seekable( + mut seekable: S, + is_canonical: bool, + compression: Compression, + cubes: I, ) -> std::io::Result<()> where + S: std::io::Seek + std::io::Write, I: Iterator, - W: Write, { - Self::write_impl(true, cubes, is_canonical, compression, write) + let len = cubes.size_hint().1.map(|v| v as u64); + let magic = [0, 0, 0, 0]; + Self::write_header(&mut seekable, magic, is_canonical, compression, len, true)?; + + let len = Self::write_impl(cubes, compression, &mut seekable)?; + let len = Some(len as u64); + + // Write magic and cube length at the end + seekable.rewind()?; + Self::write_header(&mut seekable, MAGIC, is_canonical, compression, len, true)?; + + seekable.flush()?; + + Ok(()) } /// Write the [`RawPCube`]s produced by `I` to the file at `path`. 
@@ -264,19 +324,10 @@ impl PCubeFile { where I: Iterator, { - let mut file = std::fs::File::create(path.as_ref())?; - + let file = std::fs::File::create(path.as_ref())?; file.set_len(0)?; - file.seek(std::io::SeekFrom::Start(0))?; - file.write_all(&[0, 0, 0, 0])?; - - Self::write_impl(false, cubes, is_canonical, compression, &mut file)?; - // Write magic last - file.seek(std::io::SeekFrom::Start(0))?; - file.write_all(&MAGIC)?; - - Ok(()) + Self::write_seekable(file, is_canonical, compression, cubes) } } @@ -407,3 +458,34 @@ where } impl AllUniquePolycubeIterator for AllUnique where T: Read {} + +#[test] +pub fn leb128_len() { + let values = [0, 1, 24, 150283, 0x7FFFF_FFFF, u64::MAX - 1, u64::MAX]; + + for value in values { + let mut data = Vec::new(); + PCubeFile::write_leb128(value, &mut data, true).unwrap(); + + assert_eq!(value, PCubeFile::read_leb128(&data[..]).unwrap()); + } + + let mut many_zeros = [0x80; 20]; + many_zeros[19] = 0x00; + + assert!(PCubeFile::read_leb128(&many_zeros[..]).is_ok()); +} + +#[test] +pub fn leb128_unparseable() { + let unparseable_values = [ + &[0x81, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x02][..], + &[ + 0x81, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01, + ][..], + ]; + + for unparseable in unparseable_values { + assert!(PCubeFile::read_leb128(unparseable).is_err()); + } +}