From ce06e53b3629e1d37cdc707c8dce87f807154c29 Mon Sep 17 00:00:00 2001
From: datdenkikniet <jcdra1@gmail.com>
Date: Wed, 2 Aug 2023 17:12:32 +0200
Subject: [PATCH 01/42] also buffer write output, with same size as for read
 buffer

---
 rust/src/pcube/compression.rs | 10 ++++++----
 rust/src/pcube/mod.rs         |  2 ++
 2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/rust/src/pcube/compression.rs b/rust/src/pcube/compression.rs
index 61ea73b..8e09bdc 100644
--- a/rust/src/pcube/compression.rs
+++ b/rust/src/pcube/compression.rs
@@ -1,7 +1,9 @@
-use std::io::{BufReader, Read, Write};
+use std::io::{BufReader, BufWriter, Read, Write};
 
 use flate2::{read::GzDecoder, write::GzEncoder};
 
+const BUF_SIZE: usize = 1024 * 16384;
+
 /// Compression types supported for `.pcube` files.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum Compression {
@@ -45,7 +47,7 @@ where
 {
     pub fn new(compression: Compression, reader: T) -> Self {
         match compression {
-            Compression::None => Self::Uncompressed(BufReader::new(reader)),
+            Compression::None => Self::Uncompressed(BufReader::with_capacity(BUF_SIZE, reader)),
             Compression::Gzip => Self::Gzip(GzDecoder::new(reader)),
         }
     }
@@ -74,7 +76,7 @@ pub enum Writer<T>
 where
     T: Write,
 {
-    Uncompressed(T),
+    Uncompressed(BufWriter<T>),
     Gzip(GzEncoder<T>),
 }
 
@@ -84,7 +86,7 @@ where
 {
     pub fn new(compression: Compression, writer: T) -> Self {
         match compression {
-            Compression::None => Self::Uncompressed(writer),
+            Compression::None => Self::Uncompressed(BufWriter::with_capacity(BUF_SIZE, writer)),
             Compression::Gzip => Self::Gzip(GzEncoder::new(writer, flate2::Compression::default())),
         }
     }
diff --git a/rust/src/pcube/mod.rs b/rust/src/pcube/mod.rs
index 8dc6175..468649c 100644
--- a/rust/src/pcube/mod.rs
+++ b/rust/src/pcube/mod.rs
@@ -224,6 +224,8 @@ impl PCubeFile {
             return Err(e);
         }
 
+        writer.flush()?;
+
         Ok(())
     }
 

From d43d13815ff52e42219e1a1ddb0977569d38ceb3 Mon Sep 17 00:00:00 2001
From: datdenkikniet <jcdra1@gmail.com>
Date: Tue, 1 Aug 2023 20:12:56 +0200
Subject: [PATCH 02/42] Add counting option to converting, so that all cubes
 are counted

---
 rust/src/cli/cli.rs | 97 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 84 insertions(+), 13 deletions(-)

diff --git a/rust/src/cli/cli.rs b/rust/src/cli/cli.rs
index b298e5d..ee7edfa 100644
--- a/rust/src/cli/cli.rs
+++ b/rust/src/cli/cli.rs
@@ -5,12 +5,15 @@ use std::{
 };
 
 use clap::{Args, Parser, Subcommand, ValueEnum};
-use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
-use opencubes::{naive_polycube::NaivePolyCube, pcube::PCubeFile};
-use rayon::prelude::{IntoParallelIterator, ParallelIterator};
+use indicatif::{MultiProgress, ProgressBar, ProgressIterator, ProgressStyle};
+use opencubes::{
+    naive_polycube::NaivePolyCube,
+    pcube::{PCubeFile, RawPCube},
+};
 
 mod enumerate;
 use enumerate::enumerate;
+use rayon::prelude::{IntoParallelIterator, IntoParallelRefIterator, ParallelIterator};
 
 fn finish_bar(bar: &ProgressBar, duration: Duration, expansions: usize, n: usize) {
     let time = duration.as_micros();
@@ -37,7 +40,17 @@ fn finish_bar(bar: &ProgressBar, duration: Duration, expansions: usize, n: usize
 }
 
 fn unknown_bar() -> ProgressBar {
-    let style = ProgressStyle::with_template("[{elapsed_precise}] [{spinner:10.cyan/blue}] {msg}")
+    unknown_bar_with_pos(false)
+}
+
+fn unknown_bar_with_pos(with_pos: bool) -> ProgressBar {
+    let template = if with_pos {
+        "[{elapsed_precise}] [{spinner:10.cyan/blue}] {pos} {msg}"
+    } else {
+        "[{elapsed_precise}] [{spinner:10.cyan/blue}] {msg}"
+    };
+
+    let style = ProgressStyle::with_template(template)
         .unwrap()
         .tick_strings(&[
             ">---------",
@@ -154,6 +167,13 @@ pub struct ConvertArgs {
     /// the conversion is complete.
     #[clap(short, long)]
     pub output_path: Option<String>,
+
+    /// Count the cubes in stream-oriented files before writing the converted file.
+    ///
+    /// Counting requires 2 passes for the conversion to be completed, which
+    /// can be slow.
+    #[clap(long, short = 'n')]
+    pub count: bool,
 }
 
 #[derive(Clone, Args)]
@@ -318,7 +338,7 @@ pub fn convert(opts: &ConvertArgs) {
     // that the longest files are yielded last.
     let files: BTreeMap<_, _> = opts
         .path
-        .iter()
+        .par_iter()
         .map(|path| {
             let input_file = match PCubeFile::new_file(&path) {
                 Ok(f) => f,
@@ -327,6 +347,7 @@ pub fn convert(opts: &ConvertArgs) {
                     std::process::exit(1);
                 }
             };
+
             (input_file.len(), (input_file, path.to_string()))
         })
         .collect();
@@ -334,18 +355,25 @@ pub fn convert(opts: &ConvertArgs) {
     // Iterate over the files and do some printing, in-order
     let files: Vec<_> = files
         .into_iter()
-        .map(|(_, (input_file, path))| {
+        .map(|(len, (input_file, path))| {
             let output_path = opts.output_path.clone().unwrap_or(path.clone());
 
-            println!("Converting file {}", path);
-            println!("Final output path: {output_path}");
+            multi_bar
+                .println(format!("Converting file {}", path))
+                .unwrap();
+            multi_bar
+                .println(format!("Final output path: {output_path}"))
+                .unwrap();
+
             if opts.canonicalize {
-                println!("Canonicalizing output");
+                multi_bar.println("Canonicalizing output").unwrap();
             }
-            println!("Input compression: {:?}", input_file.compression());
-            println!("Output compression: {:?}", opts.compression);
-
-            let len = input_file.len();
+            multi_bar
+                .println(format!("Input compression: {:?}", input_file.compression()))
+                .unwrap();
+            multi_bar
+                .println(format!("Output compression: {:?}", opts.compression))
+                .unwrap();
 
             let bar = if let Some(len) = len {
                 make_bar(len as u64)
@@ -363,6 +391,24 @@ pub fn convert(opts: &ConvertArgs) {
     files
         .into_par_iter()
         .for_each(|(input_file, path, output_path, len, bar)| {
+            let len = if opts.count && len.is_none() {
+                let bar = unknown_bar_with_pos(true);
+                let counting_bar = multi_bar.add(bar);
+                counting_bar.set_message(format!("polycubes counted in {path}"));
+
+                let with_progress = PCubeFile::new_file(&path)
+                    .unwrap()
+                    .progress_with(counting_bar.clone());
+
+                let output = Some(with_progress.count());
+
+                counting_bar.finish_and_clear();
+
+                output
+            } else {
+                input_file.len()
+            };
+
             bar.set_message(path.to_string());
 
             let canonical = input_file.canonical();
@@ -376,6 +422,30 @@ pub fn convert(opts: &ConvertArgs) {
             let mut total_read = 0;
             let mut last_tick = Instant::now();
 
+            struct InputIter<I> {
+                inner: I,
+                len: Option<usize>,
+            }
+
+            impl<I> Iterator for InputIter<I>
+            where
+                I: Iterator<Item = RawPCube>,
+            {
+                type Item = RawPCube;
+
+                fn next(&mut self) -> Option<Self::Item> {
+                    self.inner.next()
+                }
+
+                fn size_hint(&self) -> (usize, Option<usize>) {
+                    if let Some(len) = self.len {
+                        (len, Some(len))
+                    } else {
+                        (0, None)
+                    }
+                }
+            }
+
             let input = input_file.filter_map(|v| {
                 total_read += 1;
 
@@ -404,6 +474,7 @@ pub fn convert(opts: &ConvertArgs) {
                 }
             });
 
+            let input = InputIter { inner: input, len };
             let canonical = canonical || opts.canonicalize;
 
             match PCubeFile::write_file(

From e6fe2381f0cf3e55f370c55333b2bb3533232156 Mon Sep 17 00:00:00 2001
From: datdenkikniet <jcdra1@gmail.com>
Date: Fri, 4 Aug 2023 19:44:59 +0200
Subject: [PATCH 03/42] Fix bar & progress for validate

---
 rust/src/cli/cli.rs | 55 ++++++++++++++++++++-------------------------
 1 file changed, 24 insertions(+), 31 deletions(-)

diff --git a/rust/src/cli/cli.rs b/rust/src/cli/cli.rs
index ee7edfa..3fa6f1a 100644
--- a/rust/src/cli/cli.rs
+++ b/rust/src/cli/cli.rs
@@ -224,33 +224,36 @@ pub fn validate(opts: &ValidateArgs) -> std::io::Result<()> {
     let in_memory = !opts.no_in_memory;
     let n = opts.n;
 
-    println!("Validating {}", path);
+    let file = PCubeFile::new_file(path)?;
+    let canonical = file.canonical();
+    let len = file.len();
+
+    let bar = if let Some(len) = len {
+        make_bar(len as u64)
+    } else {
+        unknown_bar_with_pos(true)
+    };
+
+    bar.set_message("cubes validated");
+
+    bar.println(format!("Validating {}", path));
 
     let mut uniqueness = match (in_memory, uniqueness) {
         (true, true) => {
-            eprintln!("Verifying uniqueness.");
+            bar.println("Verifying uniqueness.");
             Some(HashSet::new())
         }
         (false, true) => {
+            bar.abandon();
             println!("Cannot verify uniqueness without placing all entries in memory. Re-run with `--no-uniqueness` enabled to run.");
             std::process::exit(1);
         }
         (_, false) => {
-            eprintln!("Not verifying uniqueness");
+            bar.println("Not verifying uniqueness");
             None
         }
     };
 
-    let file = PCubeFile::new_file(path)?;
-    let canonical = file.canonical();
-    let len = file.len();
-
-    let bar = if let Some(len) = len {
-        make_bar(len as u64)
-    } else {
-        unknown_bar()
-    };
-
     let exit = |msg: &str| {
         bar.abandon();
         println!("{msg}");
@@ -258,21 +261,18 @@ pub fn validate(opts: &ValidateArgs) -> std::io::Result<()> {
     };
 
     match (canonical, validate_canonical) {
-        (true, true) => eprintln!("Verifying entry canonicality. File indicates that entries are canonical."),
-        (false, true) => eprintln!("Not verifying entry canonicality. File header does not indicate that entries are canonical"),
-        (true, false) => eprintln!("Not verifying entry canonicality. File header indicates that they are, but check is disabled."),
-        (false, false) => eprintln!("Not verifying canonicality. File header does not indicate that entries are canonical, and check is disabled.")
+        (true, true) => bar.println("Verifying entry canonicality. File indicates that entries are canonical."),
+        (false, true) => bar.println("Not verifying entry canonicality. File header does not indicate that entries are canonical"),
+        (true, false) => bar.println("Not verifying entry canonicality. File header indicates that they are, but check is disabled."),
+        (false, false) => bar.println("Not verifying canonicality. File header does not indicate that entries are canonical, and check is disabled.")
     }
 
     if let Some(n) = n {
-        eprintln!("Verifying that all entries are N = {n}");
+        bar.println(format!("Verifying that all entries are N = {n}"));
     }
 
     let mut total_read = 0;
 
-    let mut last_tick = Instant::now();
-    bar.tick();
-
     for cube in file {
         let cube = match cube {
             Ok(c) => NaivePolyCube::from(c),
@@ -284,14 +284,7 @@ pub fn validate(opts: &ValidateArgs) -> std::io::Result<()> {
 
         total_read += 1;
 
-        if len.is_some() {
-            bar.inc(1);
-        } else if last_tick.elapsed() >= Duration::from_millis(66) {
-            last_tick = Instant::now();
-            bar.set_message(format!("{total_read}"));
-            bar.inc(1);
-            bar.tick();
-        }
+        bar.inc(1);
 
         let mut form: Option<NaivePolyCube> = None;
         let canonical_form = || cube.pcube_canonical_form();
@@ -317,10 +310,10 @@ pub fn validate(opts: &ValidateArgs) -> std::io::Result<()> {
                 exit("Found non-unique polycubes.");
             }
         }
-
-        bar.finish();
     }
 
+    bar.finish();
+
     println!("Success: {path}, containing {total_read} cubes, is valid");
 
     Ok(())

From eeb64884c83c74374f3b8c79c954eb68a1531d31 Mon Sep 17 00:00:00 2001
From: datdenkikniet <jcdra1@gmail.com>
Date: Fri, 4 Aug 2023 19:39:58 +0200
Subject: [PATCH 04/42] (ab)use LEB128 for fixed-width header so we can write
 the count without having to re-read the whole file

Put this in a const

Just get rid of this limit, we can handle it without
---
 rust/src/pcube/mod.rs | 206 +++++++++++++++++++++++++++++-------------
 1 file changed, 143 insertions(+), 63 deletions(-)

diff --git a/rust/src/pcube/mod.rs b/rust/src/pcube/mod.rs
index 468649c..c9885d2 100644
--- a/rust/src/pcube/mod.rs
+++ b/rust/src/pcube/mod.rs
@@ -2,7 +2,7 @@
 
 use std::{
     fs::File,
-    io::{ErrorKind, Read, Seek, Write},
+    io::{ErrorKind, Read, Write},
     iter::Peekable,
     path::Path,
 };
@@ -93,25 +93,7 @@ where
         let [orientation, compression] = header;
         let canonicalized = orientation != 0;
 
-        let mut cube_count: u64 = 0;
-        let mut shift = 0;
-        loop {
-            let mut next_byte = [0u8; 1];
-            input.read_exact(&mut next_byte)?;
-
-            let [next_byte] = next_byte;
-
-            cube_count |= ((next_byte & 0x7F) as u64) << shift;
-
-            shift += 7;
-            if shift > 64 {
-                panic!("Cannot load possibly more than u64 cubes...");
-            }
-
-            if next_byte & 0x80 == 0 {
-                break;
-            }
-        }
+        let cube_count = PCubeFile::read_leb128(&mut input)?;
 
         let len = if cube_count == 0 {
             None
@@ -177,56 +159,102 @@ impl PCubeFile {
         Self::new(file)
     }
 
-    /// Write implementation
-    fn write_impl<I, W>(
-        write_magic: bool,
-        mut cubes: I,
-        is_canonical: bool,
-        compression: Compression,
-        mut write: W,
-    ) -> std::io::Result<()>
-    where
-        I: Iterator<Item = RawPCube>,
-        W: Write,
-    {
-        if write_magic {
-            write.write_all(&MAGIC)?;
-        }
+    fn read_leb128(mut reader: impl Read) -> std::io::Result<u64> {
+        let mut cube_count: u64 = 0;
+        let mut shift = 0;
+        loop {
+            let mut next_byte = [0u8; 1];
+            reader.read_exact(&mut next_byte)?;
 
-        let compression_val = compression.into();
-        let orientation_val = if is_canonical { 1 } else { 0 };
+            let [next_byte] = next_byte;
 
-        write.write_all(&[orientation_val, compression_val])?;
+            let is_last_byte = (next_byte & 0x80) == 0x00;
+            let value = (next_byte & 0x7F) as u64;
 
-        let mut cube_count = 0;
-        let (_, max) = cubes.size_hint();
+            if shift > 63 && value != 0 || shift > 56 && value > 1 {
+                return Err(std::io::Error::new(
+                    std::io::ErrorKind::InvalidData,
+                    "Cannot load more than u64 cubes",
+                ));
+            }
+
+            cube_count |= value.overflowing_shl(shift).0;
+            shift += 7;
 
-        if let Some(max) = max {
-            cube_count = max;
+            if is_last_byte {
+                break;
+            }
         }
 
+        return Ok(cube_count);
+    }
+
+    /// Write a leb128 value
+    ///
+    /// If `prefill` is `true`, this function will always
+    /// write 10 bytes of data describing `number`.
+    fn write_leb128(mut number: u64, mut writer: impl Write, prefill: bool) -> std::io::Result<()> {
         let mut ran_once = false;
-        while cube_count > 0 || !ran_once {
+        let mut bytes_written = 0;
+        while number > 0 || !ran_once || (prefill && bytes_written < 10) {
             ran_once = true;
-            let mut next_byte = (cube_count as u8) & 0x7F;
-            cube_count >>= 7;
+            let mut next_byte = (number as u8) & 0x7F;
+            number >>= 7;
 
-            if cube_count > 0 {
+            if number > 0 || (prefill && bytes_written != 9) {
                 next_byte |= 0x80;
             }
 
-            write.write_all(&[next_byte])?;
+            writer.write_all(&[next_byte])?;
+            bytes_written += 1;
         }
 
+        Ok(())
+    }
+
+    /// Write the header
+    ///
+    /// If `prefill_len` is `true`, the length is _always_ written
+    /// as 10 bytes. This way, rewriting the header in-place is possible.
+    fn write_header(
+        mut write: impl Write,
+        magic: [u8; 4],
+        is_canonical: bool,
+        compression: Compression,
+        cube_count: Option<u64>,
+        prefill_len: bool,
+    ) -> std::io::Result<()> {
+        let compression_val = compression.into();
+        let orientation_val = if is_canonical { 1 } else { 0 };
+
+        let cube_count = cube_count.unwrap_or(0);
+
+        write.write_all(&magic)?;
+        write.write_all(&[orientation_val, compression_val])?;
+        Self::write_leb128(cube_count, &mut write, prefill_len)?;
+
+        Ok(())
+    }
+
+    /// Write implementation
+    fn write_impl<I, W>(cubes: I, compression: Compression, write: W) -> std::io::Result<usize>
+    where
+        I: Iterator<Item = RawPCube>,
+        W: Write,
+    {
         let mut writer = Writer::new(compression, write);
 
-        if let Some(e) = cubes.find_map(|v| v.pack(&mut writer).err()) {
+        let mut cube_count = 0;
+        if let Some(e) = cubes
+            .inspect(|_| cube_count += 1)
+            .find_map(|v| v.pack(&mut writer).err())
+        {
             return Err(e);
         }
 
         writer.flush()?;
 
-        Ok(())
+        Ok(cube_count)
     }
 
     /// Write the [`RawPCube`]s produced by `I` into `W`.
@@ -237,13 +265,43 @@ impl PCubeFile {
         is_canonical: bool,
         compression: Compression,
         cubes: I,
-        write: W,
+        mut write: W,
+    ) -> std::io::Result<usize>
+    where
+        I: Iterator<Item = RawPCube>,
+        W: std::io::Write,
+    {
+        let len = cubes.size_hint().1.map(|v| v as u64);
+
+        Self::write_header(&mut write, MAGIC, is_canonical, compression, len, false)?;
+
+        Self::write_impl(cubes, compression, write)
+    }
+
+    pub fn write_seekable<S, I>(
+        mut seekable: S,
+        is_canonical: bool,
+        compression: Compression,
+        cubes: I,
     ) -> std::io::Result<()>
     where
+        S: std::io::Seek + std::io::Write,
         I: Iterator<Item = RawPCube>,
-        W: Write,
     {
-        Self::write_impl(true, cubes, is_canonical, compression, write)
+        let len = cubes.size_hint().1.map(|v| v as u64);
+        let magic = [0, 0, 0, 0];
+        Self::write_header(&mut seekable, magic, is_canonical, compression, len, true)?;
+
+        let len = Self::write_impl(cubes, compression, &mut seekable)?;
+        let len = Some(len as u64);
+
+        // Write magic and cube length at the end
+        seekable.rewind()?;
+        Self::write_header(&mut seekable, MAGIC, is_canonical, compression, len, true)?;
+
+        seekable.flush()?;
+
+        Ok(())
     }
 
     /// Write the [`RawPCube`]s produced by `I` to the file at `path`.
@@ -266,19 +324,10 @@ impl PCubeFile {
     where
         I: Iterator<Item = RawPCube>,
     {
-        let mut file = std::fs::File::create(path.as_ref())?;
-
+        let file = std::fs::File::create(path.as_ref())?;
         file.set_len(0)?;
-        file.seek(std::io::SeekFrom::Start(0))?;
-        file.write_all(&[0, 0, 0, 0])?;
-
-        Self::write_impl(false, cubes, is_canonical, compression, &mut file)?;
-
-        // Write magic last
-        file.seek(std::io::SeekFrom::Start(0))?;
-        file.write_all(&MAGIC)?;
 
-        Ok(())
+        Self::write_seekable(file, is_canonical, compression, cubes)
     }
 }
 
@@ -409,3 +458,34 @@ where
 }
 
 impl<T> AllUniquePolycubeIterator for AllUnique<T> where T: Read {}
+
+#[test]
+pub fn leb128_len() {
+    let values = [0, 1, 24, 150283, 0x7FFFF_FFFF, u64::MAX - 1, u64::MAX];
+
+    for value in values {
+        let mut data = Vec::new();
+        PCubeFile::write_leb128(value, &mut data, true).unwrap();
+
+        assert_eq!(value, PCubeFile::read_leb128(&data[..]).unwrap());
+    }
+
+    let mut many_zeros = [0x80; 20];
+    many_zeros[19] = 0x00;
+
+    assert!(PCubeFile::read_leb128(&many_zeros[..]).is_ok());
+}
+
+#[test]
+pub fn leb128_unparseable() {
+    let unparseable_values = [
+        &[0x81, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x02][..],
+        &[
+            0x81, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x01,
+        ][..],
+    ];
+
+    for unparseable in unparseable_values {
+        assert!(PCubeFile::read_leb128(unparseable).is_err());
+    }
+}

From a37c83c9f96aa7e7f2c2400606f5a5509f41dcd0 Mon Sep 17 00:00:00 2001
From: datdenkikniet <jcdra1@gmail.com>
Date: Sat, 5 Aug 2023 21:55:46 +0200
Subject: [PATCH 05/42] Converting no longer needs a counting option, as it is
 always done

---
 rust/src/cli/cli.rs | 83 +++++----------------------------------------
 1 file changed, 9 insertions(+), 74 deletions(-)

diff --git a/rust/src/cli/cli.rs b/rust/src/cli/cli.rs
index 3fa6f1a..d50654b 100644
--- a/rust/src/cli/cli.rs
+++ b/rust/src/cli/cli.rs
@@ -1,15 +1,12 @@
 use std::{
     collections::{BTreeMap, HashSet},
     path::PathBuf,
-    time::{Duration, Instant},
+    time::Duration,
 };
 
 use clap::{Args, Parser, Subcommand, ValueEnum};
-use indicatif::{MultiProgress, ProgressBar, ProgressIterator, ProgressStyle};
-use opencubes::{
-    naive_polycube::NaivePolyCube,
-    pcube::{PCubeFile, RawPCube},
-};
+use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
+use opencubes::{naive_polycube::NaivePolyCube, pcube::PCubeFile};
 
 mod enumerate;
 use enumerate::enumerate;
@@ -167,13 +164,6 @@ pub struct ConvertArgs {
     /// the conversion is complete.
     #[clap(short, long)]
     pub output_path: Option<String>,
-
-    /// Count the cubes in stream-oriented files before writing the converted file.
-    ///
-    /// Counting requires 2 passes for the conversion to be completed, which
-    /// can be slow.
-    #[clap(long, short = 'n')]
-    pub count: bool,
 }
 
 #[derive(Clone, Args)]
@@ -371,38 +361,20 @@ pub fn convert(opts: &ConvertArgs) {
             let bar = if let Some(len) = len {
                 make_bar(len as u64)
             } else {
-                unknown_bar()
+                unknown_bar_with_pos(true)
             };
 
             let bar = multi_bar.add(bar);
 
-            (input_file, path, output_path, len, bar)
+            (input_file, path, output_path, bar)
         })
         .collect();
 
     // Convert, in parallel
     files
         .into_par_iter()
-        .for_each(|(input_file, path, output_path, len, bar)| {
-            let len = if opts.count && len.is_none() {
-                let bar = unknown_bar_with_pos(true);
-                let counting_bar = multi_bar.add(bar);
-                counting_bar.set_message(format!("polycubes counted in {path}"));
-
-                let with_progress = PCubeFile::new_file(&path)
-                    .unwrap()
-                    .progress_with(counting_bar.clone());
-
-                let output = Some(with_progress.count());
-
-                counting_bar.finish_and_clear();
-
-                output
-            } else {
-                input_file.len()
-            };
-
-            bar.set_message(path.to_string());
+        .for_each(|(input_file, path, output_path, bar)| {
+            bar.set_message(format!("cubes converted for {path}"));
 
             let canonical = input_file.canonical();
             let mut output_path_temp = PathBuf::from(&output_path);
@@ -412,36 +384,7 @@ pub fn convert(opts: &ConvertArgs) {
             output_path_temp.pop();
             output_path_temp.push(filename);
 
-            let mut total_read = 0;
-            let mut last_tick = Instant::now();
-
-            struct InputIter<I> {
-                inner: I,
-                len: Option<usize>,
-            }
-
-            impl<I> Iterator for InputIter<I>
-            where
-                I: Iterator<Item = RawPCube>,
-            {
-                type Item = RawPCube;
-
-                fn next(&mut self) -> Option<Self::Item> {
-                    self.inner.next()
-                }
-
-                fn size_hint(&self) -> (usize, Option<usize>) {
-                    if let Some(len) = self.len {
-                        (len, Some(len))
-                    } else {
-                        (0, None)
-                    }
-                }
-            }
-
             let input = input_file.filter_map(|v| {
-                total_read += 1;
-
                 let cube = match v {
                     Ok(v) => Some(v),
                     Err(e) => {
@@ -451,14 +394,7 @@ pub fn convert(opts: &ConvertArgs) {
                     }
                 }?;
 
-                if len.is_some() {
-                    bar.inc(1);
-                } else if last_tick.elapsed() >= Duration::from_millis(66) {
-                    last_tick = Instant::now();
-                    bar.set_message(format!("{total_read}"));
-                    bar.inc(1);
-                    bar.tick();
-                }
+                bar.inc(1);
 
                 if opts.canonicalize {
                     Some(NaivePolyCube::from(cube).canonical_form().into())
@@ -467,7 +403,6 @@ pub fn convert(opts: &ConvertArgs) {
                 }
             });
 
-            let input = InputIter { inner: input, len };
             let canonical = canonical || opts.canonicalize;
 
             match PCubeFile::write_file(
@@ -485,7 +420,7 @@ pub fn convert(opts: &ConvertArgs) {
 
             if !bar.is_finished() {
                 match std::fs::rename(output_path_temp, output_path) {
-                    Ok(_) => bar.finish_with_message(format!("{path} Done!")),
+                    Ok(_) => bar.finish(),
                     Err(e) => {
                         bar.abandon_with_message(format!("{path} Failed to write final file: {e}"));
                         return;

From 15977da327def8382288d9a1c418638cd0b1ea69 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Fri, 28 Jul 2023 16:44:01 +0300
Subject: [PATCH 06/42] Memory mapped file API (mapped_file library)

MIT license in mapped_file.hpp and mapped_file.cpp

- Supports 64-bit file seeking. (+4GiB files)
- Can memory map portions of the opened file or entire file.
- Can flush modified read-write mappings back into disk.
- Read-write regions will grow the backing file in multiples of 4096-byte blocks.
- mapped::file class for accessing a file on disk.
- mapped::region class for memory mapping a raw area of a file.
- mapped::struct_region<T> template for accessing an on-disk structure
- mapped::array_region<T> template for accessing an on-disk array of T

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/CMakeLists.txt            |   4 +
 cpp/libraries/mapped_file.cpp | 317 +++++++++++++++++++++++
 cpp/libraries/mapped_file.hpp | 467 ++++++++++++++++++++++++++++++++++
 3 files changed, 788 insertions(+)
 create mode 100644 cpp/libraries/mapped_file.cpp
 create mode 100644 cpp/libraries/mapped_file.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 6151054..05e50f0 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -38,6 +38,9 @@ macro(ConfigureTarget Target)
 	)
 endmacro()
 
+add_library(mapped_file STATIC "libraries/mapped_file.cpp")
+ConfigureTarget(mapped_file)
+
 # Source files
 add_library(CubeObjs OBJECT
 	"src/cubes.cpp"
@@ -50,6 +53,7 @@ ConfigureTarget(CubeObjs)
 # Build main program
 add_executable(${PROJECT_NAME} "program.cpp" $<TARGET_OBJECTS:CubeObjs>)
 target_link_libraries(${PROJECT_NAME} pthread)
+target_link_libraries(${PROJECT_NAME} mapped_file)
 ConfigureTarget(${PROJECT_NAME})
 
 # Optionally build tests
diff --git a/cpp/libraries/mapped_file.cpp b/cpp/libraries/mapped_file.cpp
new file mode 100644
index 0000000..a3731cd
--- /dev/null
+++ b/cpp/libraries/mapped_file.cpp
@@ -0,0 +1,317 @@
+/**
+ * Copyright 2023 Jarmo A Tiitto
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the “Software”), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "mapped_file.hpp"
+
+#include <algorithm>
+#include <cstring>
+#include <iostream>
+#include <string>
+
+// POSIX/Linux APIs
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#ifndef MAP_HUGE_2MB
+#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
+#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
+#endif
+
+namespace mapped {
+
+/**
+ * Mapped file POSIX/Linux compatible implementation
+ */
+file::file() : fd(-1), fd_size(0) {}
+
+file::~file() { close(); }
+
+void file::close() {
+    if (fd >= 0) {
+        ::fsync(fd);
+        ::close(fd);
+        fd = -1;
+        fd_size = 0;
+    }
+}
+
+int file::open(const char* fname) {
+    close();
+
+    fd = ::open64(fname, O_RDONLY);
+    if (fd == -1) {
+        std::fprintf(stderr, "Error opening file for reading\n");
+        return -1;
+    }
+
+    struct stat64 finfo;
+    if (fstat64(fd, &finfo)) {
+        std::fprintf(stderr, "Error opening file for reading\n");
+        return -1;
+    }
+    fd_size = finfo.st_size;
+    fd_rw = false;
+    return 0;
+}
+
+int file::openrw(const char* fname, size_t maxsize, int flags) {
+    // create new files with requested mode 0664 ("-rw-rw-r--"): owner/group read-write, others read-only
+    const mode_t fperms = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH;
+
+    close();
+
+    maxsize = roundUp(maxsize);
+
+    if (!flags) {
+        fd = ::open64(fname, O_RDWR | O_CLOEXEC);
+        if (fd == -1) {
+            std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno));
+            return -1;
+        }
+
+        fd_rw = true;
+
+        struct stat64 finfo;
+        if (fstat64(fd, &finfo)) {
+            std::fprintf(stderr, "Error getting file size:%s\n", std::strerror(errno));
+            return -1;
+        }
+        return truncate(finfo.st_size);
+
+    } else if ((flags & (CREATE | RESIZE)) == (CREATE | RESIZE)) {
+        fd = ::open64(fname, O_CREAT | O_RDWR | O_TRUNC | O_CLOEXEC, fperms);
+        if (fd == -1) {
+            std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno));
+            return -1;
+        }
+        fd_rw = true;
+        return truncate(maxsize);
+
+    } else if ((flags & RESIZE) != 0) {
+        fd = ::open64(fname, O_RDWR | O_CLOEXEC, fperms);
+        if (fd == -1) {
+            std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno));
+            return -1;
+        }
+        fd_rw = true;
+        return truncate(maxsize);
+    } else {
+        std::fprintf(stderr, "Invalid open flags:%s\n", std::strerror(errno));
+        return -1;
+    }
+}
+
+bool file::is_rw() const { return fd_rw; }
+
+seekoff_t file::size() const { return fd_size; }
+
+int file::truncate(seekoff_t newsize) {
+    // resize the backing file
+    if (newsize != fd_size && ftruncate64(fd, newsize)) {
+        std::fprintf(stderr, "Error resizing backing file:%s\n", std::strerror(errno));
+        return -1;
+    }
+    fd_size = newsize;
+    return 0;
+}
+
+/**
+ * Mapped region POSIX/Linux compatible implementation.
+ */
+
+region::region(std::shared_ptr<file> src, seekoff_t fpos, len_t size) : mfile(src) {
+    std::lock_guard lock(mfile->mut);
+    remap(fpos, size);
+}
+
+region::region(std::shared_ptr<file> src) : mfile(src) {
+    std::lock_guard lock(mfile->mut);
+    remap(0, mfile->size());
+}
+
+region::~region() {
+    std::lock_guard lock(mfile->mut);
+    map_fseek = 0;
+    remap(0, 0);
+}
+
+/**
+ * This is the core implementation of mapped_file:
+ * remap(0,0) releases the mapping.
+ * remap(0, n) mmap roundUp(n) bytes at offset 0
+ * remap(0, k) mremap to roundUp(k) bytes at offset 0 (grows the existing mapping)
+ * remap(n, j) munmap old region, mmap new at offset roundDown(n)
+ *
+ * In read-write mode the backing file is grown to fit the mapping.
+ */
+void region::remap(const seekoff_t fpos, const len_t size) {
+    if (fpos == usr_fseek && size == usr_size) return;  // No-op
+    // check if [fpos, fpos+size] fits into the existing
+    // mmap() window and only adjust the user region.
+    if (size && map_ptr && (map_fseek <= fpos && fpos + size <= map_fseek + map_size)) {
+        usr_fseek = fpos;
+        usr_ptr = (uint8_t*)map_ptr + (fpos - map_fseek);
+        usr_size = size;
+        return;
+    }
+
+    // if size == 0 or the usr_fseek != fpos,
+    // we have to unmap the old region first, if any.
+    if (!!map_ptr && (size == 0 || usr_fseek != fpos)) {
+        if (::munmap(map_ptr, map_size) == -1) {
+            std::fprintf(stderr, "Error mapping file memory\n");
+            return;
+        }
+        map_ptr = nullptr;
+        map_size = 0;
+        usr_ptr = nullptr;
+        usr_size = 0;
+        if (size == 0) return;
+    }
+    // keep what user tried to ask:
+    usr_fseek = fpos;
+    usr_size = size;
+
+    if (map_ptr && map_fseek == fpos) {
+        // this mapping exists already at same map_fseek
+        // remap it to grow the region.
+        auto newsize = roundUp(size);
+        void* newptr = mremap(map_ptr, map_size, newsize, MREMAP_MAYMOVE);
+        if (newptr == MAP_FAILED) {
+            std::fprintf(stderr, "Error resizing memory-map of file:%s\n", std::strerror(errno));
+            std::abort();
+            return;
+        }
+        map_ptr = newptr;
+        map_size = size;
+        return;
+    }
+
+    // create new mapping
+    if (mfile->is_rw()) {
+        // RW mapping
+        auto newsize = roundUp(size);
+        if (mfile->size() < fpos + newsize && mfile->truncate(fpos + newsize)) {
+            // failed. Disk full?
+            std::abort();
+            return;
+        }
+        // mmap requires fpos && size to be multiple of PAGE_SIZE
+        map_fseek = roundDown(fpos);
+        if (map_fseek < fpos) {
+            // adjust size to cover.
+            newsize += PAGE_SIZE;
+        }
+        map_size = newsize;
+        map_ptr = mmap(0, map_size, PROT_READ | PROT_WRITE, MAP_SHARED, mfile->fd, map_fseek);
+        if (map_ptr == MAP_FAILED) {
+            std::fprintf(stderr, "Error memory-mapping file:%s %lu %d %lu\n", std::strerror(errno), size, mfile->fd, fpos);
+            std::abort();
+            return;
+        }
+    } else {
+        // RO mapping
+        if (mfile->size() < fpos) {
+            // can't: the backing file is too small.
+            std::fprintf(stderr, "Error seeking past end of file.\n");
+            std::abort();
+            return;
+        }
+        map_size = roundUp(size);
+        map_fseek = roundDown(fpos);
+        // Map the region. (use huge pages, don't reserve backing store)
+        map_ptr = mmap(0, map_size, PROT_READ, MAP_SHARED | MAP_NORESERVE | MAP_HUGE_2MB, mfile->fd, map_fseek);
+
+        if (!map_ptr || map_ptr == MAP_FAILED) {
+            std::fprintf(stderr, "Error mapping file\n");
+            std::abort();
+            return;
+        }
+    }
+    // adjust the usr_ptr to fix
+    // any page misalignment.
+    usr_ptr = (uint8_t*)map_ptr + (fpos - map_fseek);
+}
+
+void region::jump(seekoff_t fpos) {
+    std::lock_guard lock(mfile->mut);
+    remap(fpos, map_size);
+    is_dirty = false;
+}
+
+void region::flushJump(seekoff_t fpos) {
+    flush();
+    std::lock_guard lock(mfile->mut);
+    remap(fpos, map_size);
+}
+
+void region::flush() {
+    // only flush if dirty and RW mapped.
+    std::lock_guard lock(mfile->mut);
+    if (is_dirty && mfile->is_rw()) {
+        is_dirty = false;
+        if (msync(map_ptr, map_size, MS_ASYNC)) {
+            std::fprintf(stderr, "Error flushing memory-map:%s\n", std::strerror(errno));
+        }
+    }
+}
+
+void region::sync() {
+    // only flush if dirty and RW mapped.
+    std::lock_guard lock(mfile->mut);
+    if (is_dirty && mfile->is_rw()) {
+        is_dirty = false;
+        if (msync(map_ptr, map_size, MS_SYNC)) {
+            std::fprintf(stderr, "Error flushing memory-map:%s\n", std::strerror(errno));
+        }
+    }
+}
+
+/*
+TODO:
+void region::resident(void * paddr, size_t lenght, bool resident) {
+        // Align paddr to PAGE_SIZE
+        void * start = reinterpret_cast<void*>(uintptr_t(paddr) & ~(PAGE_SIZE-1));
+        lenght = roundToPage(lenght);
+
+        if(madvise(start, lenght, resident ? MADV_WILLNEED : MADV_DONTNEED)) {
+                std::fprintf(stderr,"Error setting memory-map residency:%s\n",std::strerror(errno));
+        }
+}
+
+void region::discard(void * paddr, size_t lenght) {
+        // get range of pages that may be discarded.
+        // this is always an subset of [paddr, paddr+lenght] range.
+        void * start = (void*)roundUp((uintptr_t)paddr, PAGE_SIZE);
+        lenght = roundDown(lenght, PAGE_SIZE);
+
+        if(start < (char*)paddr + lenght && lenght >= PAGE_SIZE) {
+                // note: errors are ignored here.
+                madvise(start, lenght, MADV_REMOVE);
+        }
+}
+*/
+
+};  // namespace mapped
diff --git a/cpp/libraries/mapped_file.hpp b/cpp/libraries/mapped_file.hpp
new file mode 100644
index 0000000..0aaefff
--- /dev/null
+++ b/cpp/libraries/mapped_file.hpp
@@ -0,0 +1,467 @@
+/**
+ * Copyright 2023 Jarmo A Tiitto
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the “Software”), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef MAPPEDFILE_HPP_INCLUDED
+#define MAPPEDFILE_HPP_INCLUDED
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+#include <type_traits>
+
+/**
+ * Memory mapped file I/O utilities
+ * - mapped::file class for opening a file
+ * - mapped::region class for RW/RO memory mapping part of the file instance.
+ * - mapped::struct_region<T> template for RW/RO accessing part of the file as the specified type.
+ * - mapped::array_region<T> template for RW/RO accessing part of the file as array of T elements.
+ *
+ * @note
+ *  When doing read-only mapping the region instance
+ *  should be const qualified as this restricts
+ *  the region class API to read-only operations and prevents
+ *  accidental modification of the file.
+ *  Use std::make_unique<const region>(<args>) in this case.
+ *
+ * @note
+ *  When using the read-write features the backing file is resized
+ *  in multiples of PAGE_SIZE even if the actually mapped size is
+ *  something else.
+ *  openrw(...,size,RESIZE) always truncates the file to roundUp(size).
+ *  You should do file->truncate(<sizeInBytes>) to make the file
+ *  size exactly what you want before the file is closed.
+ *
+ *  Modified regions should flush() or sync() before they are destroyed
+ *  or the modified data may not end up in the file.
+ *
+ * TODO:
+ *  - Two region instances should not overlap,
+ *    i.e same portion of the file should not be mapped twice.
+ *    (Not sure if this is actually broken now, but you have been warned)
+ * -  Multi-threading support not tested/written.
+ *    Currently the same mapped region can be used by multiple threads,
+ *    but it cannot be modified.
+ * -  Better error handling. (exceptions?, error codes?)
+ *    Currently critical errors are printed and std::abort() is called.
+ *    How do we handle system errors that happen in constructors?
+ */
+namespace mapped {
+
+const size_t PAGE_SIZE = 4096;
+
+static inline size_t roundToPage(ptrdiff_t x) { return (std::max<ptrdiff_t>(0, x - 1) & ~(PAGE_SIZE - 1)) + PAGE_SIZE; }
+
+constexpr inline size_t roundUp(uintptr_t x) { return (x + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1); }
+
+constexpr inline size_t roundDown(uintptr_t x) { return (x & ~(PAGE_SIZE - 1)); }
+
+/**
+ * seekoff_t: Position of the file cursor
+ */
+using seekoff_t = uint64_t;
+/**
+ * len_t: length of file data
+ */
+using len_t = size_t;
+
+class file;
+
+/**
+ * Memory-mapped region
+ * @brief
+ * the region base class implementation memory maps
+ * a raw memory range from the file.
+ */
+class region {
+   protected:
+    // actually mapped region:
+    void* map_ptr = nullptr;
+    size_t map_size = 0;
+    seekoff_t map_fseek = 0;
+    // what constructor asked:
+    void* usr_ptr = nullptr;
+    size_t usr_size = 0;
+    seekoff_t usr_fseek = 0;
+    // todo: maybe use std::weak_ptr?
+    // that would allow file to be released and
+    // any existing region(s) would still work.
+    // (but only if remap() is not called)
+    std::shared_ptr<file> mfile;
+    // non-const data access sets is_dirty.
+    bool is_dirty = false;
+
+    void remap(const seekoff_t fpos, const len_t size);
+
+   public:
+    /**
+     * Open memory mapped region into a file.
+     * @brief
+     * Seeks at fpos in file and map size bytes
+     * starting from that position in file.
+     * @note
+     * - Seeking past the EOF in file that is read-only will fail.
+     *   The mapped size may extend past EOF but accessing past EOF
+     *   either returns undefined data or program is terminated by OS.
+     *   (EOF is at file->size())
+     * - Seeking past the EOF that is read-write
+     *   grows the backing file to fit the mapping.
+     *   The backing file is always extended in multiple of PAGE_SIZE bytes.
+     * @note
+     *  If size and/or fpos are not aligned to multiple of PAGE_SIZE
+     *  they are forcibly aligned internally. This results in
+     *  regionSize() and regionSeek() that may differ compared to
+     *  size() and getSeek().
+     *  Side-effect is that backing file may grow more than expected.
+     */
+    region(std::shared_ptr<file> src, seekoff_t fpos, len_t size);
+
+    /**
+     * Open memory mapped region into the file
+     * @brief
+     *  same as region(myfile, 0, myfile.size())
+     *  and memory maps the entire file.
+     */
+    explicit region(std::shared_ptr<file> src);
+
+    /**
+     * Note: even if region was modified,
+     * destructor will not flush()/sync() before tearing down the mapping.
+     */
+    virtual ~region();
+
+    /**
+     * Get data pointer.
+     */
+    const void* data() const { return usr_ptr; }
+    void* data() {
+        is_dirty = true;
+        return usr_ptr;
+    }
+
+    std::shared_ptr<file> getFile() { return mfile; }
+
+    /**
+     * Get the seek used to init this region.
+     */
+    seekoff_t getSeek() const { return usr_fseek; }
+    /**
+     * Get the size used to init this region.
+     */
+    len_t size() const { return usr_size; }
+
+    /**
+     * Get page aligned seek <= getSeek()
+     */
+    seekoff_t regionSeek() const { return map_fseek; }
+    /**
+     * Get page aligned size >= size()
+     */
+    len_t regionSize() const { return map_size; }
+
+    /**
+     * Resize the mapped region.
+     * @note the mapped memory address may move,
+     * but current contents are preserved.
+     * @warn all pointers or references into
+     * the mapping are invalidated.
+     */
+    void resize(len_t newsize);
+
+    // todo: window(len_t virtsize)
+    // since region() is already lying that it can map
+    // non-page-aligned offsets and sizes
+    // window() would grow this over-aligned window
+    // to arbitrary size and keep the initialized
+    // user size.
+    // This allows remap() to just adjust the usr_ptr
+    // if the region window fits in.
+
+    /**
+     * Flush mapped memory region into the file.
+     * @brief this is a hint to the operating system that
+     * memory region shall be synchronized to disk.
+     * It may not wait for this to have completed before returning.
+     * @note Use sync() instead if you must guarantee the data has
+     * reached persistent storage.
+     */
+    void flush();
+
+    /**
+     * Synchronize modified memory region onto disk.
+     */
+    void sync();
+
+    /**
+     * Set memory region to resident/or released.
+     * @brief setting memory range to non-resident state
+     * causes system to drop the data from system memory.
+     * Reading non-resident memory region again causes system to
+     * fetch data from the disk again.
+     * @warn if memory region is not flushed before setting
+     * it non-resident, any pending writes to the backing file may be discarded.
+     */
+    // void resident(bool state);
+
+    /**
+     * Discard memory region.
+     * @brief discarding memory range causes system
+     * to reclaim the memory *and* the on-disk area.
+     * This means the data is lost in the mapped memory region,
+     * and any data within will not be written onto disk by sync()
+     * Subsequent reads after discard() return undefined data.
+     */
+    // void discard();
+
+    /**
+     * Seek in the file to fpos position and
+     * remap the memory region there.
+     * @warn all pointers or references into
+     * the mapping are invalidated.
+     */
+    void jump(seekoff_t fpos);
+
+    /**
+     * Flush the current region and
+     * Seek in the file to fpos position and
+     * remap the memory region there.
+     * @warn all pointers or references into
+     * the mapping are invalidated.
+     */
+    void flushJump(seekoff_t fpos);
+};
+
+static_assert(std::is_move_constructible_v<region>);
+static_assert(std::is_move_assignable_v<region>);
+static_assert(std::is_swappable_v<region>);
+
+/**
+ * Typed region.
+ * struct_region<T> allows directly accessing an on-disk structure.
+ * The region size is implicit from the type.
+ */
+template <typename T>
+class struct_region : protected region {
+   public:
+    using type = typename std::decay<T>::type;
+    static_assert(std::is_standard_layout_v<type>, "T must be plain-old-data type");
+
+    /**
+     * Memory map struct_region<T> at fpos in file.
+     */
+    struct_region(std::shared_ptr<file> f, seekoff_t fpos) : region(f, fpos, sizeof(type)) {}
+
+    type* get() { return static_cast<type*>(data()); }
+    const type* get() const { return static_cast<const type*>(data()); }
+
+    type* operator->() { return get(); }
+    const type* operator->() const { return get(); }
+
+    type& operator*() { return *get(); }
+    const type& operator*() const { return *get(); }
+
+    using region::flush;
+    using region::getFile;
+    using region::getSeek;
+    using region::sync;
+
+    // note: size means the sizeof(T)
+    using region::size;
+
+    /**
+     * Get the file seek position just after *this.
+     */
+    seekoff_t getEndSeek() const { return getSeek() + sizeof(T); }
+
+    /**
+     * Seek to fpos in file and remap the region.
+     * @return the pointer into the new position
+     */
+    type* jump(seekoff_t fpos) {
+        region::jump(fpos);
+        return get();
+    }
+
+    type* flushJump(seekoff_t fpos) {
+        region::flushJump(fpos);
+        return get();
+    }
+};
+
+/**
+ * Typed array region.
+ * @brief
+ * array_region<T> allows directly accessing an on-disk array of structures
+ * The element size is implicit from the type and length of the array
+ * is provided by the constructor.
+ * @provides resize(<elements>), operator[], begin(), end()
+ */
+template <typename T>
+class array_region : protected region {
+   protected:
+    size_t num_elements = 0;
+
+   public:
+    using type = typename std::decay<T>::type;
+    static_assert(std::is_standard_layout_v<type>, "T must be plain-old-data type");
+
+    /**
+     * Memory map array_region<T> at fpos in file and map array_size elements.
+     */
+    array_region(std::shared_ptr<file> f, seekoff_t fpos, size_t array_size) : region(f, fpos, sizeof(type) * array_size), num_elements(array_size) {}
+
+    /**
+     * Get pointer to first mapped element.
+     */
+    type* get() { return static_cast<type*>(data()); }
+    const type* get() const { return static_cast<type*>(data()); }
+
+    using region::flush;
+    using region::getFile;
+    using region::getSeek;
+    using region::sync;
+
+    /**
+     * Resize the mapped array region.
+     */
+    void resize(size_t elements) {
+        region::resize(sizeof(T) * elements);
+        num_elements = elements;
+    }
+
+    /**
+     * Get number of mapped *elements*
+     */
+    size_t size() const { return num_elements; }
+
+    /**
+     * Access the array elements
+     */
+    T& operator[](size_t index) {
+        assert(index < num_elements);
+        return get()[index];
+    }
+    const T& operator[](size_t index) const {
+        assert(index < num_elements);
+        return get()[index];
+    }
+    /**
+     * Iterators
+     */
+    T* begin() { return get(); }
+    T* end() { return get() + num_elements; }
+    const T* begin() const { return get(); }
+    const T* end() const { return get() + num_elements; }
+
+    /**
+     * Get the file seek position just after *this.
+     */
+    seekoff_t getEndSeek() const { return getSeek() + sizeof(T) * num_elements; }
+
+    /**
+     * Seek to fpos in file and remap the region.
+     * @return the pointer into the first element in the array
+     */
+    type* jump(seekoff_t fpos) {
+        region::jump(fpos);
+        return get();
+    }
+
+    type* flushJump(seekoff_t fpos) {
+        region::flushJump(fpos);
+        return get();
+    }
+};
+
+/**
+ * Memory-mapped file I/O class.
+ * @note
+ * file should be created with std::make_shared<file>()
+ * as mapped region(s) take shared ownership of the file.
+ */
+class file : public std::enable_shared_from_this<file> {
+   private:
+    std::mutex mut;
+    int fd;
+    seekoff_t fd_size;
+    bool fd_rw;
+    // the file and region classes are inherently coupled,
+    // and we don't want to expose the internals.
+    friend class region;
+
+   public:
+    enum : int {
+        CREATE = 0x1,  // Create new file, if doesn't exist.
+        RESIZE = 0x2,  // Resize file.
+        RO = 0x4       // <reserved flag>
+    };
+
+    file();
+    ~file();
+
+    /**
+     * Open file in read-only mode.
+     * @return non-zero if error occurred.
+     */
+    int open(const char* file);
+
+    /**
+     * Create/Open file in read-write mode.
+     * @param flags
+     *  - CREATE|RESIZE creates or replaces existing file
+     *    that is truncated to maxsize.
+     *  - RESIZE  opens existing file and truncates it to
+     *    maxsize. The file must exist already.
+     *  - flags == 0 ignores the maxsize argument and opens
+     *    existing file.
+     * @warn default open mode discards any previous file contents!
+     * @return non-zero if error occurred.
+     */
+    int openrw(const char* file, len_t maxsize, int flags = CREATE | RESIZE);
+
+    /**
+     * Check whether the file was opened R/W (true) or RO (false)
+     */
+    bool is_rw() const;
+
+    /**
+     * Resize the open file to newsize bytes.
+     * (file must be open in R/W mode)
+     * @return non-zero if error occurred.
+     */
+    int truncate(seekoff_t newsize);
+
+    /**
+     * Current length of the file
+     * The file EOF (end-of-file) is at this position.
+     */
+    seekoff_t size() const;
+
+    // Close the file.
+    void close();
+};
+
+};  // namespace mapped
+#endif

From cd076b51b3bb9321302c0d9410f49fb2607051ba Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Sun, 30 Jul 2023 22:04:24 +0300
Subject: [PATCH 07/42] - fixup region::remap() mremap case not saving the
 correct size. - silence few std::printf's since opening non-existing file  
 is handled by returning -1

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/libraries/mapped_file.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/cpp/libraries/mapped_file.cpp b/cpp/libraries/mapped_file.cpp
index a3731cd..698a673 100644
--- a/cpp/libraries/mapped_file.cpp
+++ b/cpp/libraries/mapped_file.cpp
@@ -62,13 +62,13 @@ int file::open(const char* fname) {
 
     fd = ::open64(fname, O_RDONLY);
     if (fd == -1) {
-        std::fprintf(stderr, "Error opening file for reading\n");
+        //std::fprintf(stderr, "Error opening file for reading\n");
         return -1;
     }
 
     struct stat64 finfo;
     if (fstat64(fd, &finfo)) {
-        std::fprintf(stderr, "Error opening file for reading\n");
+        std::fprintf(stderr, "Error getting file size: %s\n", std::strerror(errno));
         return -1;
     }
     fd_size = finfo.st_size;
@@ -87,7 +87,7 @@ int file::openrw(const char* fname, size_t maxsize, int flags) {
     if (!flags) {
         fd = ::open64(fname, O_RDWR | O_CLOEXEC);
         if (fd == -1) {
-            std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno));
+            //std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno));
             return -1;
         }
 
@@ -103,7 +103,7 @@ int file::openrw(const char* fname, size_t maxsize, int flags) {
     } else if ((flags & (CREATE | RESIZE)) == (CREATE | RESIZE)) {
         fd = ::open64(fname, O_CREAT | O_RDWR | O_TRUNC | O_CLOEXEC, fperms);
         if (fd == -1) {
-            std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno));
+            //std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno));
             return -1;
         }
         fd_rw = true;
@@ -112,7 +112,7 @@ int file::openrw(const char* fname, size_t maxsize, int flags) {
     } else if ((flags & RESIZE) != 0) {
         fd = ::open64(fname, O_RDWR | O_CLOEXEC, fperms);
         if (fd == -1) {
-            std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno));
+            //std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno));
             return -1;
         }
         fd_rw = true;
@@ -205,7 +205,7 @@ void region::remap(const seekoff_t fpos, const len_t size) {
             return;
         }
         map_ptr = newptr;
-        map_size = size;
+        map_size = newsize;
         return;
     }
 
@@ -233,7 +233,7 @@ void region::remap(const seekoff_t fpos, const len_t size) {
         }
     } else {
         // RO mapping
-        if (mfile->size() < fpos) {
+        if (mfile->size() <= fpos) {
             // can't: the backing file is too small.
             std::fprintf(stderr, "Error seeking past end of file.\n");
             std::abort();

From fc6b4109297b8adbd5afc0951c7082a21d448846 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Sun, 30 Jul 2023 22:24:03 +0300
Subject: [PATCH 08/42] fixup missing const in struct_region and array_region

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/libraries/mapped_file.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/libraries/mapped_file.hpp b/cpp/libraries/mapped_file.hpp
index 0aaefff..59f89ac 100644
--- a/cpp/libraries/mapped_file.hpp
+++ b/cpp/libraries/mapped_file.hpp
@@ -336,7 +336,7 @@ class array_region : protected region {
      * Get pointer to first mapped element.
      */
     type* get() { return static_cast<type*>(data()); }
-    const type* get() const { return static_cast<type*>(data()); }
+    const type* get() const { return static_cast<const type*>(data()); }
 
     using region::flush;
     using region::getFile;

From b0ff53ef9d2f467f88d94bc1b244a832e18465d1 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Mon, 31 Jul 2023 04:12:10 +0300
Subject: [PATCH 09/42] libmappedfile: implement oversized mapped region

The memory map now supports mapping oversized "window" into the file:
- flush(), sync() only flush the user area
- jump(), flushJump() have fast path speed up when new user area
  fits into the oversized window.

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/libraries/mapped_file.cpp | 33 +++++++++++++++++++++------------
 cpp/libraries/mapped_file.hpp | 13 ++++++++++---
 2 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/cpp/libraries/mapped_file.cpp b/cpp/libraries/mapped_file.cpp
index 698a673..e69ddbb 100644
--- a/cpp/libraries/mapped_file.cpp
+++ b/cpp/libraries/mapped_file.cpp
@@ -141,20 +141,21 @@ int file::truncate(seekoff_t newsize) {
  * Mapped region POSIX/Linux compatible implementation.
  */
 
-region::region(std::shared_ptr<file> src, seekoff_t fpos, len_t size) : mfile(src) {
+region::region(std::shared_ptr<file> src, seekoff_t fpos, len_t size, len_t window) : mfile(src) {
     std::lock_guard lock(mfile->mut);
-    remap(fpos, size);
+    remap(fpos, size, window);
 }
 
 region::region(std::shared_ptr<file> src) : mfile(src) {
     std::lock_guard lock(mfile->mut);
-    remap(0, mfile->size());
+    auto sz = mfile->size();
+    remap(0, sz, sz);
 }
 
 region::~region() {
     std::lock_guard lock(mfile->mut);
     map_fseek = 0;
-    remap(0, 0);
+    remap(0, 0, 0);
 }
 
 /**
@@ -166,7 +167,7 @@ region::~region() {
  *
  * In read-write mode the backing file is grown to fit the mapping.
  */
-void region::remap(const seekoff_t fpos, const len_t size) {
+void region::remap(const seekoff_t fpos, const len_t size, const len_t window) {
     if (fpos == usr_fseek && size == usr_size) return;  // No-op
     // check if [fpos, fpos+size] fits into the existing
     // mmap() window and only adjust the user region.
@@ -197,7 +198,7 @@ void region::remap(const seekoff_t fpos, const len_t size) {
     if (map_ptr && map_fseek == fpos) {
         // this mapping exists already at same map_fseek
         // remap it to grow the region.
-        auto newsize = roundUp(size);
+        auto newsize = roundUp(std::max(size, window));
         void* newptr = mremap(map_ptr, map_size, newsize, MREMAP_MAYMOVE);
         if (newptr == MAP_FAILED) {
             std::fprintf(stderr, "Error resizing memory-map of file:%s\n", std::strerror(errno));
@@ -212,7 +213,7 @@ void region::remap(const seekoff_t fpos, const len_t size) {
     // create new mapping
     if (mfile->is_rw()) {
         // RW mapping
-        auto newsize = roundUp(size);
+        auto newsize = roundUp(std::max(size, window));
         if (mfile->size() < fpos + newsize && mfile->truncate(fpos + newsize)) {
             // failed. Disk full?
             std::abort();
@@ -239,7 +240,7 @@ void region::remap(const seekoff_t fpos, const len_t size) {
             std::abort();
             return;
         }
-        map_size = roundUp(size);
+        map_size = roundUp(std::max(size, window));
         map_fseek = roundDown(fpos);
         // Map the region. (use huge pages, don't reserve backing store)
         map_ptr = mmap(0, map_size, PROT_READ, MAP_SHARED | MAP_NORESERVE | MAP_HUGE_2MB, mfile->fd, map_fseek);
@@ -257,14 +258,14 @@ void region::remap(const seekoff_t fpos, const len_t size) {
 
 void region::jump(seekoff_t fpos) {
     std::lock_guard lock(mfile->mut);
-    remap(fpos, map_size);
+    remap(fpos, usr_size, map_size);
     is_dirty = false;
 }
 
 void region::flushJump(seekoff_t fpos) {
     flush();
     std::lock_guard lock(mfile->mut);
-    remap(fpos, map_size);
+    remap(fpos, usr_size, map_size);
 }
 
 void region::flush() {
@@ -272,7 +273,11 @@ void region::flush() {
     std::lock_guard lock(mfile->mut);
     if (is_dirty && mfile->is_rw()) {
         is_dirty = false;
-        if (msync(map_ptr, map_size, MS_ASYNC)) {
+        auto flush_begin = (void*)roundDown((uintptr_t)usr_ptr);
+        auto flush_len = roundUp(usr_size);
+        if(flush_begin < usr_ptr)
+            flush_len += PAGE_SIZE;
+        if (msync(flush_begin, flush_len, MS_ASYNC)) {
             std::fprintf(stderr, "Error flushing memory-map:%s\n", std::strerror(errno));
         }
     }
@@ -283,7 +288,11 @@ void region::sync() {
     std::lock_guard lock(mfile->mut);
     if (is_dirty && mfile->is_rw()) {
         is_dirty = false;
-        if (msync(map_ptr, map_size, MS_SYNC)) {
+        auto flush_begin = (void*)roundDown((uintptr_t)usr_ptr);
+        auto flush_len = roundUp(usr_size);
+        if(flush_begin < usr_ptr)
+            flush_len += PAGE_SIZE;
+        if (msync(flush_begin, flush_len, MS_SYNC)) {
             std::fprintf(stderr, "Error flushing memory-map:%s\n", std::strerror(errno));
         }
     }
diff --git a/cpp/libraries/mapped_file.hpp b/cpp/libraries/mapped_file.hpp
index 59f89ac..57b1521 100644
--- a/cpp/libraries/mapped_file.hpp
+++ b/cpp/libraries/mapped_file.hpp
@@ -113,7 +113,7 @@ class region {
     // non-const data access sets is_dirty.
     bool is_dirty = false;
 
-    void remap(const seekoff_t fpos, const len_t size);
+    void remap(const seekoff_t fpos, const len_t size, const len_t window);
 
    public:
     /**
@@ -121,6 +121,13 @@ class region {
      * @brief
      * Seeks at fpos in file and map size bytes
      * starting from that position in file.
+     * @param window
+     *  over-extend mapping up to max(size,window) bytes.
+     *  Setting window bigger than size allows more efficient operation:
+     *  [fpos, fpos + window] area is memory mapped
+     *  but region will only operate on the
+     *  [roundDown(fpos), roundup(fpos+size)]
+     *  sub-portion of the memory.
      * @note
      * - Seeking past the EOF in file that is read-only will fail.
      *   The mapped size may extend past EOF but accessing past EOF
@@ -136,7 +143,7 @@ class region {
      *  size() and getSeek().
      *  Side-effect is that backing file may grow more than expected.
      */
-    region(std::shared_ptr<file> src, seekoff_t fpos, len_t size);
+    region(std::shared_ptr<file> src, seekoff_t fpos, len_t size, len_t window = 0);
 
     /**
      * Open memory mapped region into the file
@@ -271,7 +278,7 @@ class struct_region : protected region {
     /**
      * Memory map struct_region<T> at fpos in file.
      */
-    struct_region(std::shared_ptr<file> f, seekoff_t fpos) : region(f, fpos, sizeof(type)) {}
+    struct_region(std::shared_ptr<file> f, seekoff_t fpos, len_t window = 0) : region(f, fpos, sizeof(type), window) {}
 
     type* get() { return static_cast<type*>(data()); }
     const type* get() const { return static_cast<const type*>(data()); }

From adba81838f6d874c3fae65c4e4fb12fabc411d45 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Wed, 2 Aug 2023 13:14:26 +0300
Subject: [PATCH 10/42] libmappedfile: Provide writeAt() readAt() API

- Provide region::writeAt() and region::readAt() that
  enable copying data into/from the backing file even if the
  target area of the backing file is not memory-mapped.
- Fixup flushed length in flush() sync()
- Run clang-format

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/libraries/mapped_file.cpp | 85 +++++++++++++++++++++++++++++++----
 cpp/libraries/mapped_file.hpp | 38 ++++++++++++++--
 2 files changed, 112 insertions(+), 11 deletions(-)

diff --git a/cpp/libraries/mapped_file.cpp b/cpp/libraries/mapped_file.cpp
index e69ddbb..ef888f9 100644
--- a/cpp/libraries/mapped_file.cpp
+++ b/cpp/libraries/mapped_file.cpp
@@ -62,7 +62,7 @@ int file::open(const char* fname) {
 
     fd = ::open64(fname, O_RDONLY);
     if (fd == -1) {
-        //std::fprintf(stderr, "Error opening file for reading\n");
+        // std::fprintf(stderr, "Error opening file for reading\n");
         return -1;
     }
 
@@ -87,7 +87,7 @@ int file::openrw(const char* fname, size_t maxsize, int flags) {
     if (!flags) {
         fd = ::open64(fname, O_RDWR | O_CLOEXEC);
         if (fd == -1) {
-            //std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno));
+            // std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno));
             return -1;
         }
 
@@ -103,7 +103,7 @@ int file::openrw(const char* fname, size_t maxsize, int flags) {
     } else if ((flags & (CREATE | RESIZE)) == (CREATE | RESIZE)) {
         fd = ::open64(fname, O_CREAT | O_RDWR | O_TRUNC | O_CLOEXEC, fperms);
         if (fd == -1) {
-            //std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno));
+            // std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno));
             return -1;
         }
         fd_rw = true;
@@ -112,7 +112,7 @@ int file::openrw(const char* fname, size_t maxsize, int flags) {
     } else if ((flags & RESIZE) != 0) {
         fd = ::open64(fname, O_RDWR | O_CLOEXEC, fperms);
         if (fd == -1) {
-            //std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno));
+            // std::fprintf(stderr, "Error opening file:%s\n", std::strerror(errno));
             return -1;
         }
         fd_rw = true;
@@ -275,8 +275,7 @@ void region::flush() {
         is_dirty = false;
         auto flush_begin = (void*)roundDown((uintptr_t)usr_ptr);
         auto flush_len = roundUp(usr_size);
-        if(flush_begin < usr_ptr)
-            flush_len += PAGE_SIZE;
+        if (flush_begin < usr_ptr) flush_len += PAGE_SIZE;
         if (msync(flush_begin, flush_len, MS_ASYNC)) {
             std::fprintf(stderr, "Error flushing memory-map:%s\n", std::strerror(errno));
         }
@@ -290,14 +289,84 @@ void region::sync() {
         is_dirty = false;
         auto flush_begin = (void*)roundDown((uintptr_t)usr_ptr);
         auto flush_len = roundUp(usr_size);
-        if(flush_begin < usr_ptr)
-            flush_len += PAGE_SIZE;
+        if (flush_begin < usr_ptr) flush_len += PAGE_SIZE;
         if (msync(flush_begin, flush_len, MS_SYNC)) {
             std::fprintf(stderr, "Error flushing memory-map:%s\n", std::strerror(errno));
         }
     }
 }
 
+void region::writeAt(seekoff_t fpos, len_t datasize, const void* data) {
+    auto srcmem = (const char*)data;
+
+    std::lock_guard lock(mfile->mut);
+    if(mfile->size() < fpos+datasize && mfile->truncate(fpos+datasize)) {
+        return;
+    }
+
+    // does write fall out the mapped area begin?
+    if (fpos < map_fseek) {
+        // max size that can be written before map_fseek
+        ssize_t wr = std::min(map_fseek - fpos, datasize);
+        if (pwrite(mfile->fd, srcmem, wr, fpos) != wr) {
+            std::fprintf(stderr, "Error writing file:%s\n", std::strerror(errno));
+        }
+        srcmem += wr;
+        fpos += wr;
+        datasize -= wr;
+    }
+
+    if (fpos >= map_fseek && fpos < map_fseek + map_size && datasize) {
+        // max size that can be copied into this mapping:
+        ssize_t wr = std::min(map_size - (fpos - map_fseek), datasize);
+        std::memcpy((char*)map_ptr + (fpos - map_fseek), srcmem, wr);
+        srcmem += wr;
+        fpos += wr;
+        datasize -= wr;
+    }
+
+    // does write fall out the mapped area end?
+    if (datasize) {
+        // write into backing file after the mapped area:
+        if (pwrite(mfile->fd, srcmem, datasize, fpos) != ssize_t(datasize)) {
+            std::fprintf(stderr, "Error writing file:%s\n", std::strerror(errno));
+        }
+    }
+}
+
+void region::readAt(seekoff_t fpos, len_t datasize, void* data) {
+    auto dstmem = (char*)data;
+
+    // does read fall out the mapped area begin?
+    if (fpos < map_fseek) {
+        // max size that can be written before map_fseek
+        ssize_t rd = std::min(map_fseek - fpos, datasize);
+        if (pread(mfile->fd, dstmem, rd, fpos) != rd) {
+            std::fprintf(stderr, "Error reading file:%s\n", std::strerror(errno));
+        }
+        dstmem += rd;
+        fpos += rd;
+        datasize -= rd;
+    }
+
+    if (fpos >= map_fseek && fpos < map_fseek + map_size && datasize) {
+        // max size that can be copied from this mapping:
+        ssize_t rd = std::min(map_size - (fpos - map_fseek), datasize);
+        std::memcpy(dstmem, (char*)map_ptr + (fpos - map_fseek), rd);
+        dstmem += rd;
+        fpos += rd;
+        datasize -= rd;
+    }
+
+    // does read fall out the mapped area end?
+    if (datasize) {
+        // read from backing file after the mapped area:
+        if (pread(mfile->fd, dstmem, datasize, fpos) != ssize_t(datasize)) {
+            std::fprintf(stderr, "Error reading file:%s\n", std::strerror(errno));
+        }
+    }
+}
+
 /*
 TODO:
 void region::resident(void * paddr, size_t lenght, bool resident) {
diff --git a/cpp/libraries/mapped_file.hpp b/cpp/libraries/mapped_file.hpp
index 57b1521..dd2f61f 100644
--- a/cpp/libraries/mapped_file.hpp
+++ b/cpp/libraries/mapped_file.hpp
@@ -200,17 +200,18 @@ class region {
     // todo: window(len_t virtsize)
     // since region() is already lying that it can map
     // non-page-aligned offsets and sizes
-    // window() would grow this over-aligned window
+    // window() would grow this over-extended the memory mapping
     // to arbitrary size and keep the initialized
     // user size.
-    // This allows remap() to just adjust the usr_ptr
-    // if the region window fits in.
 
     /**
      * Flush mapped memory region into the file.
      * @brief this is an hint to operating system that
      * memory region shall be synchronized to disk.
      * It may not wait for this to have completed before returning.
+     * @note only the page aligned region
+     * [roundDown(data()), roundUp(data()+size())]
+     * is flushed.
      * @note Use sync() instead if you must guarantee the data has
      * reached persistent storage.
      */
@@ -221,6 +222,33 @@ class region {
      */
     void sync();
 
+    /**
+     * Write data into the backing file.
+     * @brief
+     *  writeAt() stores range of bytes into the backing file.
+     * @note
+     *  The region doesn't need to have this area to be memory-mapped:
+     *  The data that falls into the memory-mapped
+     *  [regionSeek(), regionSeek()+regionSize()] area is simply memcpy'ed.
+     *  Any data that falls out this window is written directly
+     *  into the backing file.
+     *  The backing file is grown to fit the data when needed.
+     */
+    void writeAt(seekoff_t fpos, len_t datasize, const void* data);
+
+    /**
+     * Read data from the backing file.
+     * @brief
+     *  readAt() reads [fpos, fpos+datasize] range of bytes from the backing file
+     * @note
+     *  The region doesn't need to have this area memory-mapped
+     *  The read out area that falls into the memory-mapped
+     *  [regionSeek(), regionSeek()+regionSize()] area is simply memcpy'ed.
+     *  Any data that falls out this window is read directly
+     *  from the backing file.
+     */
+    void readAt(seekoff_t fpos, len_t datasize, void* data);
+
     /**
      * Set memory region to resident/or released.
      * @brief setting memory range to non-resident state
@@ -292,7 +320,9 @@ class struct_region : protected region {
     using region::flush;
     using region::getFile;
     using region::getSeek;
+    using region::readAt;
     using region::sync;
+    using region::writeAt;
 
     // note: size means the sizeof(T)
     using region::size;
@@ -348,7 +378,9 @@ class array_region : protected region {
     using region::flush;
     using region::getFile;
     using region::getSeek;
+    using region::readAt;
     using region::sync;
+    using region::writeAt;
 
     /**
      * Resize the mapped array region.

From 747cca1aa541bfb5a7eb4948c55bc812921cb0ae Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Wed, 2 Aug 2023 23:26:33 +0300
Subject: [PATCH 11/42] libmappedfile: Misc changes

- Provide FSTUNE flag that attempts to speed up file access
  when a new file is created with CREATE|RESIZE.
  It effectively sets the chattr +A (no atime updates) and
  +C (no copy-on-write) attributes on the file.
- Make readAt() const qualified.

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/libraries/mapped_file.cpp | 12 +++++++++++-
 cpp/libraries/mapped_file.hpp |  9 +++++----
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/cpp/libraries/mapped_file.cpp b/cpp/libraries/mapped_file.cpp
index ef888f9..5318596 100644
--- a/cpp/libraries/mapped_file.cpp
+++ b/cpp/libraries/mapped_file.cpp
@@ -34,6 +34,9 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+
 #ifndef MAP_HUGE_2MB
 #define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
 #define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
@@ -107,6 +110,13 @@ int file::openrw(const char* fname, size_t maxsize, int flags) {
             return -1;
         }
         fd_rw = true;
+
+        if(flags & FSTUNE) {
+            int flags = 0;
+            ioctl(fd, FS_IOC_GETFLAGS, &flags);
+            flags |= FS_NOATIME_FL | FS_NOCOW_FL;
+            ioctl(fd, FS_IOC_SETFLAGS, &flags);
+        }
         return truncate(maxsize);
 
     } else if ((flags & RESIZE) != 0) {
@@ -334,7 +344,7 @@ void region::writeAt(seekoff_t fpos, len_t datasize, const void* data) {
     }
 }
 
-void region::readAt(seekoff_t fpos, len_t datasize, void* data) {
+void region::readAt(seekoff_t fpos, len_t datasize, void* data) const {
     auto dstmem = (char*)data;
 
     // does read fall out the mapped area begin?
diff --git a/cpp/libraries/mapped_file.hpp b/cpp/libraries/mapped_file.hpp
index dd2f61f..7cc3b1e 100644
--- a/cpp/libraries/mapped_file.hpp
+++ b/cpp/libraries/mapped_file.hpp
@@ -247,7 +247,7 @@ class region {
      *  Any data that falls out this window is read directly
      *  from the backing file.
      */
-    void readAt(seekoff_t fpos, len_t datasize, void* data);
+    void readAt(seekoff_t fpos, len_t datasize, void* data) const;
 
     /**
      * Set memory region to resident/or released.
@@ -452,9 +452,10 @@ class file : public std::enable_shared_from_this<file> {
 
    public:
     enum : int {
-        CREATE = 0x1,  // Create new file, if doesn't exist.
-        RESIZE = 0x2,  // Resize file.
-        RO = 0x4       // <reserved flag>
+        CREATE = 0x1,  //!< Create new file, if doesn't exist.
+        RESIZE = 0x2,  //!< Resize file.
+        FSTUNE = 0x4   //!< When creating new file attempt to set
+                       //!< file system attributes to improve performance.
     };
 
     file();

From 9b8f46310181e990ac551bf9c0e5b73959118a8e Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Thu, 10 Aug 2023 00:06:19 +0300
Subject: [PATCH 12/42] libmappedfile: Make region moveable

- Make region a properly move-aware object.
  region objects are now safe to use in STL containers like vector/deque.
- Implement region::resident() (not tested)

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/libraries/mapped_file.cpp | 47 +++++++++++++++++++++++++++--------
 cpp/libraries/mapped_file.hpp | 38 ++++++++++++++++++++++++++--
 2 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/cpp/libraries/mapped_file.cpp b/cpp/libraries/mapped_file.cpp
index 5318596..c101f3a 100644
--- a/cpp/libraries/mapped_file.cpp
+++ b/cpp/libraries/mapped_file.cpp
@@ -26,6 +26,7 @@
 #include <algorithm>
 #include <cstring>
 #include <iostream>
+#include <fstream>
 #include <string>
 
 // POSIX/Linux APIs
@@ -163,7 +164,7 @@ region::region(std::shared_ptr<file> src) : mfile(src) {
 }
 
 region::~region() {
-    std::lock_guard lock(mfile->mut);
+    // destructor is not thread-safe.
     map_fseek = 0;
     remap(0, 0, 0);
 }
@@ -238,7 +239,30 @@ void region::remap(const seekoff_t fpos, const len_t size, const len_t window) {
         map_size = newsize;
         map_ptr = mmap(0, map_size, PROT_READ | PROT_WRITE, MAP_SHARED, mfile->fd, map_fseek);
         if (map_ptr == MAP_FAILED) {
+            // If this gets triggered we are in deep trouble
             std::fprintf(stderr, "Error memory-mapping file:%s %lu %d %lu\n", std::strerror(errno), size, mfile->fd, fpos);
+            std::fprintf(stderr, "Dumping /proc/self/maps:\n");
+            // for debugging information try print /proc/self/mmaps contents
+            // as this explains why we hit some limit of the system.
+            std::ifstream fmaps("/proc/self/maps");
+            std::string buf;
+            int count = 0;
+            while(std::getline(fmaps, buf)) {
+                std::fprintf(stderr, "%s\n", buf.c_str());
+                ++count;
+            }
+            std::fprintf(stderr, "counted %d memory-maps in process.\n", count);
+
+
+
+            // todo: if this really is an hard limit of the hardware
+            // for *number of mmap() areas* this means we forced to:
+            // - register all regions in ordered list by mapped seek offset in the mapped::file
+            // - when mmap fails we have to merge adjacent regions
+            // - reference count the regions
+            // - data() returned memory address becomes even more unstable:
+            //   it is invalidated by adjacent construction/deconstruction of region objects
+            // - destruction gets complicated.
             std::abort();
             return;
         }
@@ -377,18 +401,19 @@ void region::readAt(seekoff_t fpos, len_t datasize, void* data) const {
     }
 }
 
-/*
-TODO:
-void region::resident(void * paddr, size_t lenght, bool resident) {
-        // Align paddr to PAGE_SIZE
-        void * start = reinterpret_cast<void*>(uintptr_t(paddr) & ~(PAGE_SIZE-1));
-        lenght = roundToPage(lenght);
-
-        if(madvise(start, lenght, resident ? MADV_WILLNEED : MADV_DONTNEED)) {
-                std::fprintf(stderr,"Error setting memory-map residency:%s\n",std::strerror(errno));
-        }
+
+void region::resident(bool resident) {
+    std::lock_guard lock(mfile->mut);
+    auto _begin = (void*)roundDown((uintptr_t)usr_ptr);
+    auto _len = roundUp(usr_size);
+    if (_begin < usr_ptr) _len += PAGE_SIZE;
+
+    if(madvise(_begin, _len, resident ? MADV_WILLNEED : MADV_DONTNEED)) {
+            std::fprintf(stderr,"Error setting memory-map residency:%s\n",std::strerror(errno));
+    }
 }
 
+/*
 void region::discard(void * paddr, size_t lenght) {
         // get range of pages that may be discarded.
         // this is always an subset of [paddr, paddr+lenght] range.
diff --git a/cpp/libraries/mapped_file.hpp b/cpp/libraries/mapped_file.hpp
index 7cc3b1e..5cda7c1 100644
--- a/cpp/libraries/mapped_file.hpp
+++ b/cpp/libraries/mapped_file.hpp
@@ -115,6 +115,8 @@ class region {
 
     void remap(const seekoff_t fpos, const len_t size, const len_t window);
 
+    region() {}
+
    public:
     /**
      * Open memory mapped region into a file.
@@ -159,6 +161,31 @@ class region {
      */
     virtual ~region();
 
+    // region is not copyable
+    region(const region&) =delete;
+    region& operator=(const region&) =delete;
+
+    // region is moveable
+    friend void swap(region& a, region& b) {
+        using std::swap;
+        // thread-safety? None.
+        swap(a.map_ptr,b.map_ptr);
+        swap(a.map_size,b.map_size);
+        swap(a.map_fseek,b.map_fseek);
+        swap(a.usr_ptr,b.usr_ptr);
+        swap(a.usr_size,b.usr_size);
+        swap(a.usr_fseek,b.usr_fseek);
+        swap(a.mfile,b.mfile);
+        swap(a.is_dirty,b.is_dirty);
+    }
+    region(region&& mv) : region() {
+        swap(*this, mv);
+    }
+    region& operator=(region&& mv) {
+        swap(*this, mv);
+        return *this;
+    }
+
     /**
      * Get data pointer.
      */
@@ -256,9 +283,11 @@ class region {
      * Reading non-resident memory region again causes system to
      * fetch data from the disk again.
      * @warn if memory region is not flushed before setting
-     * it non-resident any writes may be discarded to backing file.
+     * resident(false) any writes may be discarded to backing file.
+     * @todo: more strict version?
+     *  actually unmap the region() until data() is called.
      */
-    // void resident(bool state);
+    void resident(bool state);
 
     /**
      * Discard memory region.
@@ -323,6 +352,7 @@ class struct_region : protected region {
     using region::readAt;
     using region::sync;
     using region::writeAt;
+    using region::resident;
 
     // note: size means the sizeof(T)
     using region::size;
@@ -347,6 +377,10 @@ class struct_region : protected region {
     }
 };
 
+static_assert(std::is_move_constructible_v<struct_region<int>>);
+static_assert(std::is_move_assignable_v<struct_region<int>>);
+static_assert(std::is_swappable_v<struct_region<int>>);
+
 /**
  * Typed array region.
  * @brief

From 774760ebf357dc98c04b87f7e728e5ef7e9500af Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Thu, 10 Aug 2023 03:55:44 +0300
Subject: [PATCH 13/42] libmappedfile: Implement region::window()

- region::window() allows over-extending the memory mapping.
  The "user mapped" portion stays the same but regionSize() changes.

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/libraries/mapped_file.cpp |  8 ++++++++
 cpp/libraries/mapped_file.hpp | 16 ++++++++++------
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/cpp/libraries/mapped_file.cpp b/cpp/libraries/mapped_file.cpp
index c101f3a..f4a02e1 100644
--- a/cpp/libraries/mapped_file.cpp
+++ b/cpp/libraries/mapped_file.cpp
@@ -290,6 +290,14 @@ void region::remap(const seekoff_t fpos, const len_t size, const len_t window) {
     usr_ptr = (uint8_t*)map_ptr + (fpos - map_fseek);
 }
 
+void region::window(len_t window) {
+    std::lock_guard lock(mfile->mut);
+    auto usize = usr_size;
+    // note: remap() does nothing if window == usr_size
+    remap(usr_fseek, window, window);
+    usr_size = usize;
+}
+
 void region::jump(seekoff_t fpos) {
     std::lock_guard lock(mfile->mut);
     remap(fpos, usr_size, map_size);
diff --git a/cpp/libraries/mapped_file.hpp b/cpp/libraries/mapped_file.hpp
index 5cda7c1..e9893e1 100644
--- a/cpp/libraries/mapped_file.hpp
+++ b/cpp/libraries/mapped_file.hpp
@@ -224,12 +224,15 @@ class region {
      */
     void resize(len_t newsize);
 
-    // todo: window(len_t virtsize)
-    // since region() is already lying that it can map
-    // non-page-aligned offsets and sizes
-    // window() would grow this over-extended the memory mapping
-    // to arbitrary size and keep the initialized
-    // user size.
+    /**
+     * @brief over-extend mapping up to max(size(),window) bytes.
+     *  Setting window bigger than size() allows more efficient operation:
+     *  [regionSeek(), regionSeek() + window] area is memory mapped
+     *  but region will only operate on the
+     *  [roundDown(getSeek()), roundUp(getSeek()+size())]
+     *  sub-portion of the memory.
+     */
+    void window(len_t window = 0);
 
     /**
      * Flush mapped memory region into the file.
@@ -353,6 +356,7 @@ class struct_region : protected region {
     using region::sync;
     using region::writeAt;
     using region::resident;
+    using region::window;
 
     // note: size means the sizeof(T)
     using region::size;

From 1a5c4265998912160c45ca95774ce9b18595ed40 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Thu, 10 Aug 2023 07:11:47 +0300
Subject: [PATCH 14/42] libmappedfile: Tune the memory mapping a bit

- For resident() it is better to mark the entire mapped region
  rather than just the user area.

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/libraries/mapped_file.cpp | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/cpp/libraries/mapped_file.cpp b/cpp/libraries/mapped_file.cpp
index f4a02e1..56e1d11 100644
--- a/cpp/libraries/mapped_file.cpp
+++ b/cpp/libraries/mapped_file.cpp
@@ -252,18 +252,6 @@ void region::remap(const seekoff_t fpos, const len_t size, const len_t window) {
                 ++count;
             }
             std::fprintf(stderr, "counted %d memory-maps in process.\n", count);
-
-
-
-            // todo: if this really is an hard limit of the hardware
-            // for *number of mmap() areas* this means we forced to:
-            // - register all regions in ordered list by mapped seek offset in the mapped::file
-            // - when mmap fails we have to merge adjacent regions
-            // - reference count the regions
-            // - data() returned memory address becomes even more unstable:
-            //   it is invalidated by adjacent construction/deconstruction of region objects
-            // - destruction gets complicated.
-            std::abort();
             return;
         }
     } else {
@@ -285,6 +273,11 @@ void region::remap(const seekoff_t fpos, const len_t size, const len_t window) {
             return;
         }
     }
+
+    // hint that this memory is accessed in random order.
+    if(madvise(map_ptr, map_size, MADV_RANDOM)) {
+        std::fprintf(stderr, "warn: madvice(MADV_RANDOM) failed: %s\n", std::strerror(errno));
+    }
     // adjust the usr_ptr to fix
     // any page misalignment.
     usr_ptr = (uint8_t*)map_ptr + (fpos - map_fseek);
@@ -412,11 +405,7 @@ void region::readAt(seekoff_t fpos, len_t datasize, void* data) const {
 
 void region::resident(bool resident) {
     std::lock_guard lock(mfile->mut);
-    auto _begin = (void*)roundDown((uintptr_t)usr_ptr);
-    auto _len = roundUp(usr_size);
-    if (_begin < usr_ptr) _len += PAGE_SIZE;
-
-    if(madvise(_begin, _len, resident ? MADV_WILLNEED : MADV_DONTNEED)) {
+    if(madvise(map_ptr, map_size, resident ? MADV_WILLNEED : MADV_DONTNEED)) {
             std::fprintf(stderr,"Error setting memory-map residency:%s\n",std::strerror(errno));
     }
 }

From 574876b4617eb6c40d45b23e884cff6cfbdd607a Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Fri, 11 Aug 2023 21:28:43 +0300
Subject: [PATCH 15/42] libmappedfile: comment fixups

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/libraries/mapped_file.hpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/libraries/mapped_file.hpp b/cpp/libraries/mapped_file.hpp
index e9893e1..a2fccbd 100644
--- a/cpp/libraries/mapped_file.hpp
+++ b/cpp/libraries/mapped_file.hpp
@@ -92,8 +92,7 @@ class file;
 /**
  * Memory-mapped region
  * @brief
- * the region base class implementation memory maps
- * an raw memory range from the file.
+ * the region base class memory-maps an raw memory range from the file.
  */
 class region {
    protected:

From 3d197a1175a64838e9639ff40d8b763d5d5e70c3 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Sat, 12 Aug 2023 05:20:40 +0300
Subject: [PATCH 16/42] libmappedfile: Locking and discard work

- Implement more fine-grained locking for region.
- Implement region::discard().
  This effectively zero-fills the memory area within
  the mapping and punches a hole into the backing file.

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/libraries/mapped_file.cpp | 62 ++++++++++++++++++++++++-----------
 cpp/libraries/mapped_file.hpp | 20 +++++++----
 2 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/cpp/libraries/mapped_file.cpp b/cpp/libraries/mapped_file.cpp
index 56e1d11..e7261dc 100644
--- a/cpp/libraries/mapped_file.cpp
+++ b/cpp/libraries/mapped_file.cpp
@@ -153,18 +153,19 @@ int file::truncate(seekoff_t newsize) {
  */
 
 region::region(std::shared_ptr<file> src, seekoff_t fpos, len_t size, len_t window) : mfile(src) {
-    std::lock_guard lock(mfile->mut);
+    std::lock_guard lock(mtx);
     remap(fpos, size, window);
 }
 
 region::region(std::shared_ptr<file> src) : mfile(src) {
-    std::lock_guard lock(mfile->mut);
+    std::lock_guard lock(mtx);
     auto sz = mfile->size();
     remap(0, sz, sz);
 }
 
 region::~region() {
     // destructor is not thread-safe.
+    std::lock_guard lock(mtx);
     map_fseek = 0;
     remap(0, 0, 0);
 }
@@ -177,6 +178,8 @@ region::~region() {
  * remap(n, j) munmap old region, mmap new at offset roundDown(n)
  *
  * In read-write mode the backing file is grown to fit the mapping.
+ *
+ * @warn this->mtx must be held when this function is called.
  */
 void region::remap(const seekoff_t fpos, const len_t size, const len_t window) {
     if (fpos == usr_fseek && size == usr_size) return;  // No-op
@@ -225,11 +228,16 @@ void region::remap(const seekoff_t fpos, const len_t size, const len_t window) {
     if (mfile->is_rw()) {
         // RW mapping
         auto newsize = roundUp(std::max(size, window));
+
+        // take file lock so size() check --> truncate is atomic.
+        std::unique_lock trunclock(mfile->mut);
         if (mfile->size() < fpos + newsize && mfile->truncate(fpos + newsize)) {
             // failed. Disk full?
             std::abort();
             return;
         }
+        trunclock.unlock();
+
         // mmap requires fpos && size to be multiple of PAGE_SIZE
         map_fseek = roundDown(fpos);
         if (map_fseek < fpos) {
@@ -265,7 +273,7 @@ void region::remap(const seekoff_t fpos, const len_t size, const len_t window) {
         map_size = roundUp(std::max(size, window));
         map_fseek = roundDown(fpos);
         // Map the region. (use huge pages, don't reserve backing store)
-        map_ptr = mmap(0, map_size, PROT_READ, MAP_SHARED | MAP_NORESERVE | MAP_HUGE_2MB, mfile->fd, map_fseek);
+        map_ptr = mmap(0, map_size, PROT_READ, MAP_SHARED | MAP_NORESERVE, mfile->fd, map_fseek);
 
         if (!map_ptr || map_ptr == MAP_FAILED) {
             std::fprintf(stderr, "Error mapping file\n");
@@ -284,7 +292,7 @@ void region::remap(const seekoff_t fpos, const len_t size, const len_t window) {
 }
 
 void region::window(len_t window) {
-    std::lock_guard lock(mfile->mut);
+    std::lock_guard lock(mtx);
     auto usize = usr_size;
     // note: remap() does nothing if window == usr_size
     remap(usr_fseek, window, window);
@@ -292,20 +300,20 @@ void region::window(len_t window) {
 }
 
 void region::jump(seekoff_t fpos) {
-    std::lock_guard lock(mfile->mut);
+    std::lock_guard lock(mtx);
     remap(fpos, usr_size, map_size);
     is_dirty = false;
 }
 
 void region::flushJump(seekoff_t fpos) {
     flush();
-    std::lock_guard lock(mfile->mut);
+    std::lock_guard lock(mtx);
     remap(fpos, usr_size, map_size);
 }
 
 void region::flush() {
     // only flush if dirty and RW mapped.
-    std::lock_guard lock(mfile->mut);
+    std::lock_guard lock(mtx);
     if (is_dirty && mfile->is_rw()) {
         is_dirty = false;
         auto flush_begin = (void*)roundDown((uintptr_t)usr_ptr);
@@ -319,7 +327,7 @@ void region::flush() {
 
 void region::sync() {
     // only flush if dirty and RW mapped.
-    std::lock_guard lock(mfile->mut);
+    std::lock_guard lock(mtx);
     if (is_dirty && mfile->is_rw()) {
         is_dirty = false;
         auto flush_begin = (void*)roundDown((uintptr_t)usr_ptr);
@@ -334,10 +342,12 @@ void region::sync() {
 void region::writeAt(seekoff_t fpos, len_t datasize, const void* data) {
     auto srcmem = (const char*)data;
 
-    std::lock_guard lock(mfile->mut);
+    // take file lock so that file size check --> truncate is atomic.
+    std::unique_lock trunclock(mfile->mut);
     if(mfile->size() < fpos+datasize && mfile->truncate(fpos+datasize)) {
         return;
     }
+    trunclock.unlock();
 
     // does write fall out the mapped area begin?
     if (fpos < map_fseek) {
@@ -404,24 +414,36 @@ void region::readAt(seekoff_t fpos, len_t datasize, void* data) const {
 
 
 void region::resident(bool resident) {
-    std::lock_guard lock(mfile->mut);
+    std::lock_guard lock(mtx);
     if(madvise(map_ptr, map_size, resident ? MADV_WILLNEED : MADV_DONTNEED)) {
             std::fprintf(stderr,"Error setting memory-map residency:%s\n",std::strerror(errno));
     }
 }
 
-/*
-void region::discard(void * paddr, size_t lenght) {
-        // get range of pages that may be discarded.
-        // this is always an subset of [paddr, paddr+lenght] range.
-        void * start = (void*)roundUp((uintptr_t)paddr, PAGE_SIZE);
-        lenght = roundDown(lenght, PAGE_SIZE);
 
-        if(start < (char*)paddr + lenght && lenght >= PAGE_SIZE) {
-                // note: errors are ignored here.
-                madvise(start, lenght, MADV_REMOVE);
+void region::discard(seekoff_t fpos, len_t datasize) {
+
+    auto cur = usr_fseek + fpos;
+
+    if (cur < map_fseek + map_size) {
+        // max size that can be discarded from this mapping:
+        ssize_t dm = std::min(map_size - (cur - map_fseek), datasize);
+
+        // Have to be careful here: if we delete too much,
+        // the caller will not have a good time.
+        // align size down to page size.
+        dm = roundDown(dm);
+        // align file offset up
+        auto _first = roundUp(cur - map_fseek);
+        if(_first > cur - map_fseek)
+            dm -= PAGE_SIZE;
+
+        if(dm >= (signed)PAGE_SIZE) {
+            if(madvise((char*)map_ptr + _first, dm, MADV_REMOVE)) {
+                std::fprintf(stderr,"Error discarding memory-map region:%s\n",std::strerror(errno));
+            }
         }
+    }
 }
-*/
 
 };  // namespace mapped
diff --git a/cpp/libraries/mapped_file.hpp b/cpp/libraries/mapped_file.hpp
index a2fccbd..b86657a 100644
--- a/cpp/libraries/mapped_file.hpp
+++ b/cpp/libraries/mapped_file.hpp
@@ -92,10 +92,11 @@ class file;
 /**
  * Memory-mapped region
  * @brief
- * the region base class memory-maps an raw memory range from the file.
+ * the base class memory-maps a raw range of bytes from the backing file.
  */
 class region {
    protected:
+    std::mutex mtx;
     // actually mapped region:
     void* map_ptr = nullptr;
     size_t map_size = 0;
@@ -167,7 +168,9 @@ class region {
     // region is moveable
     friend void swap(region& a, region& b) {
         using std::swap;
-        // thread-safety? None.
+        //std::lock(a.mtx,b.mtx);
+        //std::lock_guard l0(a.mtx, std::adopt_lock);
+        //std::lock_guard l1(b.mtx, std::adopt_lock);
         swap(a.map_ptr,b.map_ptr);
         swap(a.map_size,b.map_size);
         swap(a.map_fseek,b.map_fseek);
@@ -286,8 +289,6 @@ class region {
      * fetch data from the disk again.
      * @warn if memory region is not flushed before setting
      * resident(false) any writes may be discarded to backing file.
-     * @todo: more strict version?
-     *  actually unmap the region() until data() is called.
      */
     void resident(bool state);
 
@@ -297,9 +298,15 @@ class region {
      * to reclaim the memory *and* the on-disk area.
      * This means the data is lost in the mapped memory region,
      * and any data within will not be written onto disk by sync()
-     * Subsequent reads after discard() return undefined data.
+     * Subsequent reads after discard() return zero filled data.
+     * @note
+     *  The discarded area shall be within the mapped area.
+     * @param fpos
+     *  file offset from begin of this mapping. (getSeek() + fpos)
+     * @param datasize
+     *  length of the data area to discard.
      */
-    // void discard();
+    void discard(seekoff_t fpos, len_t datasize);
 
     /**
      * Seek in the file to fpos position and
@@ -356,6 +363,7 @@ class struct_region : protected region {
     using region::writeAt;
     using region::resident;
     using region::window;
+    using region::discard;
 
     // note: size means the sizeof(T)
     using region::size;

From f2b1f8c07765702c40d4ecb9b3577316c97f8531 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Mon, 31 Jul 2023 00:06:43 +0300
Subject: [PATCH 17/42] Do const safety pass

- The filePointer points into read-only memory from mmap(),
  so apply const in a few places to ensure nothing writes into it.
- getCubesByShape() may return pointers to past-end of the mmap() area
  if shape table entry size is zero.
  ShapeEntry::offset can be wrong if the size is also zero.

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/include/cube.hpp     |  2 +-
 cpp/include/newCache.hpp | 24 ++++++++++++------------
 cpp/src/newCache.cpp     |  9 ++++++---
 3 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/cpp/include/cube.hpp b/cpp/include/cube.hpp
index 83feaa7..f612ef4 100644
--- a/cpp/include/cube.hpp
+++ b/cpp/include/cube.hpp
@@ -69,7 +69,7 @@ struct Cube {
     // Construct from external source.
     // Cube shares this the memory until modified.
     // Caller guarantees the memory given will live longer than *this
-    Cube(XYZ *start, uint8_t n) : bits{1, n}, array(start) {}
+    Cube(const XYZ *start, uint8_t n) : bits{1, n}, array(const_cast<XYZ*>(start)) {}
 
     // Copy ctor.
     Cube(const Cube &copy) : Cube(copy.size()) { std::copy(copy.begin(), copy.end(), begin()); }
diff --git a/cpp/include/newCache.hpp b/cpp/include/newCache.hpp
index 29e622e..04bb3fe 100644
--- a/cpp/include/newCache.hpp
+++ b/cpp/include/newCache.hpp
@@ -18,7 +18,7 @@ class CubeIterator {
     using reference = Cube&;  // or also value_type&
 
     // constructor
-    CubeIterator(uint32_t _n, XYZ* ptr) : n(_n), m_ptr(ptr) {}
+    CubeIterator(uint32_t _n, const XYZ* ptr) : n(_n), m_ptr(ptr) {}
 
     // invalid iterator (can't deference)
     explicit CubeIterator() : n(0), m_ptr(nullptr) {}
@@ -53,13 +53,13 @@ class CubeIterator {
 
    private:
     uint32_t n;
-    XYZ* m_ptr;
+    const XYZ* m_ptr;
 };
 
 class ShapeRange {
    public:
-    ShapeRange(XYZ* start, XYZ* stop, uint64_t _cubeLen, XYZ _shape)
-        : b(_cubeLen, start), e(_cubeLen, stop), size_(((uint64_t)stop - (uint64_t)start) / (_cubeLen * sizeof(XYZ))), shape_(_shape) {}
+    ShapeRange(const XYZ* start, const XYZ* stop, uint64_t _cubeLen, XYZ _shape)
+        : b(_cubeLen, start), e(_cubeLen, stop), size_(std::distance(start, stop) / _cubeLen), shape_(_shape) {}
 
     CubeIterator begin() { return b; }
     CubeIterator end() { return e; }
@@ -118,26 +118,26 @@ class CacheReader : public ICache {
     };
 
     CubeIterator begin() {
-        uint8_t* start = filePointer + shapes[0].offset;
-        return CubeIterator(header->n, (XYZ*)start);
+        const uint8_t* start = filePointer + shapes[0].offset;
+        return CubeIterator(header->n, (const XYZ*)start);
     }
 
     CubeIterator end() {
-        uint8_t* stop = filePointer + shapes[0].offset + header->numPolycubes * header->n * XYZ_SIZE;
-        return CubeIterator(header->n, (XYZ*)stop);
+        const uint8_t* stop = filePointer + shapes[0].offset + header->numPolycubes * header->n * XYZ_SIZE;
+        return CubeIterator(header->n, (const XYZ*)stop);
     }
 
     ShapeRange getCubesByShape(uint32_t i) override;
 
    private:
-    uint8_t* filePointer;
+    const uint8_t* filePointer;
     std::string path_;
     int fileDescriptor_;
     uint64_t fileSize_;
     bool fileLoaded_;
-    Header dummyHeader;
-    Header* header;
-    ShapeEntry* shapes;
+    const Header dummyHeader;
+    const Header* header;
+    const ShapeEntry* shapes;
 };
 
 class FlatCache : public ICache {
diff --git a/cpp/src/newCache.cpp b/cpp/src/newCache.cpp
index ef925dc..db57900 100644
--- a/cpp/src/newCache.cpp
+++ b/cpp/src/newCache.cpp
@@ -65,15 +65,18 @@ ShapeRange CacheReader::getCubesByShape(uint32_t i) {
     if (i >= header->numShapes) {
         return ShapeRange{nullptr, nullptr, 0, XYZ(0, 0, 0)};
     }
-    XYZ* start = reinterpret_cast<XYZ*>(filePointer + shapes[i].offset);
-    XYZ* end = reinterpret_cast<XYZ*>(filePointer + shapes[i].offset + shapes[i].size);
+    if(shapes[i].size <= 0) {
+        return ShapeRange(nullptr, nullptr, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2));
+    }
+    auto start = reinterpret_cast<const XYZ*>(filePointer + shapes[i].offset);
+    auto end = reinterpret_cast<const XYZ*>(filePointer + shapes[i].offset + shapes[i].size);
     return ShapeRange(start, end, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2));
 }
 
 void CacheReader::unload() {
     // unmap file from memory
     if (fileLoaded_) {
-        if (munmap(filePointer, fileSize_) == -1) {
+        if (munmap((void*)filePointer, fileSize_) == -1) {
             // error handling
             std::printf("error unmapping file\n");
         }

From 2a39964cf2199a6406a1989dbbf5ec3f77b20835 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Mon, 31 Jul 2023 00:28:21 +0300
Subject: [PATCH 18/42] Close the `friend class Workset` trick. - I can
 actually read how the progress is calculated.

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/include/newCache.hpp | 4 +++-
 cpp/src/cubes.cpp        | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/include/newCache.hpp b/cpp/include/newCache.hpp
index 04bb3fe..9189182 100644
--- a/cpp/include/newCache.hpp
+++ b/cpp/include/newCache.hpp
@@ -27,6 +27,8 @@ class CubeIterator {
     const value_type operator*() const { return Cube(m_ptr, n); }
     // pointer operator->() { return (pointer)m_ptr; }
 
+    const XYZ* data() const { return m_ptr; }
+
     // Prefix increment
     CubeIterator& operator++() {
         m_ptr += n;
@@ -49,7 +51,7 @@ class CubeIterator {
     friend bool operator<(const CubeIterator& a, const CubeIterator& b) { return a.m_ptr < b.m_ptr; };
     friend bool operator>(const CubeIterator& a, const CubeIterator& b) { return a.m_ptr > b.m_ptr; };
     friend bool operator!=(const CubeIterator& a, const CubeIterator& b) { return a.m_ptr != b.m_ptr; };
-    friend class Workset;
+    //friend class Workset;
 
    private:
     uint32_t n;
diff --git a/cpp/src/cubes.cpp b/cpp/src/cubes.cpp
index cdcc5b4..2bcd3c1 100644
--- a/cpp/src/cubes.cpp
+++ b/cpp/src/cubes.cpp
@@ -48,7 +48,7 @@ struct Workset {
         auto a = _begin;
         _begin += 500;
         if (_begin > _end) _begin = _end;
-        return {a, _begin, a < _end, 100 * (float)((uint64_t)a.m_ptr - (uint64_t)_begin_total.m_ptr) / ((uint64_t)_end.m_ptr - (uint64_t)_begin_total.m_ptr)};
+        return {a, _begin, a < _end, 100 * float(std::distance(_begin_total.data(), a.data())) / std::distance(_begin_total.data(), _end.data())};
     }
 
     void expand(const Cube &c) {

From f8a5671f4885090b63c22f5048c262229ec24dcb Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Mon, 31 Jul 2023 01:00:52 +0300
Subject: [PATCH 19/42] Update newCache to use libmappedfile

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/include/newCache.hpp | 17 ++++++---
 cpp/src/newCache.cpp     | 78 +++++++++++++++++++++++-----------------
 2 files changed, 58 insertions(+), 37 deletions(-)

diff --git a/cpp/include/newCache.hpp b/cpp/include/newCache.hpp
index 9189182..ca3d71f 100644
--- a/cpp/include/newCache.hpp
+++ b/cpp/include/newCache.hpp
@@ -6,6 +6,7 @@
 
 #include "cube.hpp"
 #include "hashes.hpp"
+#include "mapped_file.hpp"
 
 class Workset;
 
@@ -119,7 +120,10 @@ class CacheReader : public ICache {
         uint64_t size;     // in bytes should be multiple of XYZ_SIZE
     };
 
-    CubeIterator begin() {
+    // Do begin() and end() make sense for CacheReader
+    // if the cache file provides data for more than a single shape?
+    // The data might not even be mapped contiguously to save memory.
+    /*CubeIterator begin() {
         const uint8_t* start = filePointer + shapes[0].offset;
         return CubeIterator(header->n, (const XYZ*)start);
     }
@@ -127,15 +131,18 @@ class CacheReader : public ICache {
     CubeIterator end() {
         const uint8_t* stop = filePointer + shapes[0].offset + header->numPolycubes * header->n * XYZ_SIZE;
         return CubeIterator(header->n, (const XYZ*)stop);
-    }
+    }*/
 
+    // get shapes at index [0, numShapes()[
     ShapeRange getCubesByShape(uint32_t i) override;
 
    private:
-    const uint8_t* filePointer;
+    std::shared_ptr<mapped::file> file_;
+    std::unique_ptr<const mapped::struct_region<Header>> header_;
+    std::unique_ptr<const mapped::array_region<ShapeEntry>> shapes_;
+    std::unique_ptr<const mapped::array_region<XYZ>> xyz_;
+
     std::string path_;
-    int fileDescriptor_;
-    uint64_t fileSize_;
     bool fileLoaded_;
     const Header dummyHeader;
     const Header* header;
diff --git a/cpp/src/newCache.cpp b/cpp/src/newCache.cpp
index db57900..cd9726c 100644
--- a/cpp/src/newCache.cpp
+++ b/cpp/src/newCache.cpp
@@ -6,8 +6,7 @@
 
 #include <iostream>
 
-CacheReader::CacheReader()
-    : filePointer(nullptr), path_(""), fileDescriptor_(-1), fileSize_(0), fileLoaded_(false), dummyHeader{0, 0, 0, 0}, header(&dummyHeader), shapes(nullptr) {}
+CacheReader::CacheReader() : path_(""), fileLoaded_(false), dummyHeader{0, 0, 0, 0}, header(&dummyHeader), shapes(nullptr) {}
 
 void CacheReader::printHeader() {
     if (fileLoaded_) {
@@ -33,28 +32,37 @@ int CacheReader::printShapes(void) {
 int CacheReader::loadFile(const std::string path) {
     unload();
     path_ = path;
-    fileDescriptor_ = open(path.c_str(), O_RDONLY);
 
-    if (fileDescriptor_ == -1) {
+    // open read-only backing file:
+    file_ = std::make_shared<mapped::file>();
+    if (file_->open(path.c_str())) {
         std::printf("error opening file\n");
         return 1;
     }
 
-    // get filesize
-    fileSize_ = lseek(fileDescriptor_, 0, SEEK_END);
-    lseek(fileDescriptor_, 0, SEEK_SET);
-
-    // memory map file
-    filePointer = (uint8_t*)mmap(NULL, fileSize_, PROT_READ, MAP_SHARED, fileDescriptor_, 0);
-    if (filePointer == MAP_FAILED) {
-        // error handling
-        std::printf("errorm mapping file memory");
-        close(fileDescriptor_);
-        return 2;
+    // map the header struct
+    header_ = std::make_unique<const mapped::struct_region<Header>>(file_, 0);
+    header = header_->get();
+
+    if (header->magic != MAGIC) {
+        std::printf("error opening file: file not recognized\n");
+        return 1;
     }
 
-    header = (Header*)(filePointer);
-    shapes = (ShapeEntry*)(filePointer + sizeof(Header));
+    // map the ShapeEntry array:
+    shapes_ = std::make_unique<const mapped::array_region<ShapeEntry>>(file_, header_->getEndSeek(), (*header_)->numShapes);
+    shapes = shapes_->get();
+
+    size_t datasize = 0;
+    for (unsigned int i = 0; i < header->numShapes; ++i) {
+        datasize += shapes[i].size;
+    }
+
+    // map rest of the file as XYZ data:
+    if (file_->size() != shapes_->getEndSeek() + datasize) {
+        std::printf("warn: file size does not match expected value\n");
+    }
+    xyz_ = std::make_unique<const mapped::array_region<XYZ>>(file_, shapes_->getEndSeek(), datasize);
 
     fileLoaded_ = true;
 
@@ -65,28 +73,34 @@ ShapeRange CacheReader::getCubesByShape(uint32_t i) {
     if (i >= header->numShapes) {
         return ShapeRange{nullptr, nullptr, 0, XYZ(0, 0, 0)};
     }
-    if(shapes[i].size <= 0) {
-        return ShapeRange(nullptr, nullptr, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2));
+    if (shapes[i].size <= 0) {
+        return ShapeRange{nullptr, nullptr, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2)};
+    }
+    // get section start
+    // note: shapes[i].offset may have bogus offset
+    // if any earlier shape table entry was empty before i
+    // so we ignore the offset here.
+    size_t offset = 0;
+    for (unsigned int k = 0; k < i; ++k) {
+        offset += shapes[k].size;
     }
-    auto start = reinterpret_cast<const XYZ*>(filePointer + shapes[i].offset);
-    auto end = reinterpret_cast<const XYZ*>(filePointer + shapes[i].offset + shapes[i].size);
-    return ShapeRange(start, end, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2));
+    auto index = offset / XYZ_SIZE;
+    auto num_xyz = shapes[i].size / XYZ_SIZE;
+    // pointers to Cube data:
+    auto start = xyz_->get() + index;
+    auto end = xyz_->get() + index + num_xyz;
+    return ShapeRange{start, end, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2)};
 }
 
 void CacheReader::unload() {
-    // unmap file from memory
+    // unload file from memory
     if (fileLoaded_) {
-        if (munmap((void*)filePointer, fileSize_) == -1) {
-            // error handling
-            std::printf("error unmapping file\n");
-        }
-
-        // close file descriptor
-        close(fileDescriptor_);
+        xyz_.reset();
+        shapes_.reset();
+        header_.reset();
+        file_.reset();
         fileLoaded_ = false;
     }
-    fileDescriptor_ = -1;
-    filePointer = nullptr;
     header = &dummyHeader;
     shapes = nullptr;
 }

From f62780e0f42bb0829d9be5deb4b65f2ebd315944 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Sun, 13 Aug 2023 21:04:53 +0300
Subject: [PATCH 20/42] fixup tests not compiling.

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/tests/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index b30d160..42e0014 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -19,4 +19,5 @@ add_executable(${PROJECT_NAME} $<TARGET_OBJECTS:CubeObjs> ${TESTS})
 
 target_link_libraries(GTest::GTest INTERFACE gtest_main)
 target_link_libraries(${PROJECT_NAME} pthread GTest::GTest)
+target_link_libraries(${PROJECT_NAME} mapped_file)
 ConfigureTarget(${PROJECT_NAME})

From c7609446280216887dced0a172ac8439a1ad0e7c Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Wed, 2 Aug 2023 13:32:37 +0300
Subject: [PATCH 21/42] Make DEBUG_PRINT less noisy

DEBUG_LEVEL selects the level of debug prints that are compiled in.
0 => Same as not compiling with DEBUG at all.
1 => Only DEBUG_PRINTF()
2 => DEBUG1_PRINTF() and lower levels are enabled
3 => DEBUG2_PRINTF() and lower levels are enabled

Change a few of the noisiest prints to be silent with DEBUG_LEVEL == 1

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/include/hashes.hpp |  2 +-
 cpp/include/utils.hpp  | 35 +++++++++++++++++++++++++++++++----
 cpp/src/cubes.cpp      |  4 ++--
 3 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/cpp/include/hashes.hpp b/cpp/include/hashes.hpp
index 7999d5c..09feeed 100644
--- a/cpp/include/hashes.hpp
+++ b/cpp/include/hashes.hpp
@@ -100,7 +100,7 @@ struct Hashy {
         DEBUG_PRINTF("%ld maps by shape\n\r", byshape.size());
         for (auto &set : byshape) {
             auto part = set.second.size();
-            DEBUG_PRINTF("bucket [%2d %2d %2d]: %ld\n", set.first.x(), set.first.y(), set.first.z(), part);
+            DEBUG1_PRINTF("bucket [%2d %2d %2d]: %ld\n", set.first.x(), set.first.y(), set.first.z(), part);
             sum += part;
         }
         return sum;
diff --git a/cpp/include/utils.hpp b/cpp/include/utils.hpp
index 4cd23e3..f895877 100644
--- a/cpp/include/utils.hpp
+++ b/cpp/include/utils.hpp
@@ -3,12 +3,39 @@
 #define OPENCUBES_UTILS_HPP
 
 #include <cstdio>
+
+// Debug print level: print macros at or below DEBUG_LEVEL
+// are compiled in (only when DEBUG is defined).
+// DEBUG_LEVEL -> 0 all prints disabled.
+// DEBUG_LEVEL -> 1 enable DEBUG_PRINTF() statements
+// DEBUG_LEVEL -> 2 enable DEBUG1_PRINTF() statements and earlier
+// DEBUG_LEVEL -> 3 all prints enabled.
+#define DEBUG_LEVEL 1
+
 #ifdef DEBUG
+
+#if DEBUG_LEVEL >= 1
 #define DEBUG_PRINTF(...) std::printf(__VA_ARGS__)
-#else
-#define DEBUG_PRINTF(...) \
-    do {                  \
-    } while (0)
 #endif
 
+#if DEBUG_LEVEL >= 2
+#define DEBUG1_PRINTF(...) std::printf(__VA_ARGS__)
+#endif
+
+#if DEBUG_LEVEL >= 3
+#define DEBUG2_PRINTF(...) std::printf(__VA_ARGS__)
+#endif
+
+#endif
+
+#ifndef DEBUG_PRINTF
+#define DEBUG_PRINTF(...) do {} while (0)
 #endif
+#ifndef DEBUG1_PRINTF
+#define DEBUG1_PRINTF(...) do {} while (0)
+#endif
+#ifndef DEBUG2_PRINTF
+#define DEBUG2_PRINTF(...) do {} while (0)
+#endif
+
+#endif
\ No newline at end of file
diff --git a/cpp/src/cubes.cpp b/cpp/src/cubes.cpp
index 2bcd3c1..bea7327 100644
--- a/cpp/src/cubes.cpp
+++ b/cpp/src/cubes.cpp
@@ -87,14 +87,14 @@ struct Workset {
         std::set_difference(candidates.begin(), end, c.begin(), c.end(), std::back_inserter(tmp));
         candidates = std::move(tmp);
 
-        DEBUG_PRINTF("candidates: %lu\n\r", candidates.size());
+        DEBUG1_PRINTF("candidates: %lu\n\r", candidates.size());
 
         Cube newCube(c.size() + 1);
         Cube lowestHashCube(newCube.size());
         Cube rotatedCube(newCube.size());
 
         for (const auto &p : candidates) {
-            DEBUG_PRINTF("(%2d %2d %2d)\n\r", p.x(), p.y(), p.z());
+            DEBUG2_PRINTF("(%2d %2d %2d)\n\r", p.x(), p.y(), p.z());
             int ax = (p.x() < 0) ? 1 : 0;
             int ay = (p.y() < 0) ? 1 : 0;
             int az = (p.z() < 0) ? 1 : 0;

From 064b9739db23f3bdc7df9cb619bf95bd73941a48 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Thu, 10 Aug 2023 08:13:52 +0300
Subject: [PATCH 22/42] Hack Cube struct into 8-bytes

This is the v3 revision of this hack:
Previously the uint8_t bit-field actually caused Cube to be 16-bytes
due to padding.
Bitpack/Hack the size, is_shared flag and memory address
into private struct bits_t. This halves the Cube struct size.

Note: If we get any segfaults from de-referencing the pointer
returned by get() helper this hack must be reverted.

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/include/cube.hpp | 74 ++++++++++++++++++++++++++++++--------------
 1 file changed, 51 insertions(+), 23 deletions(-)

diff --git a/cpp/include/cube.hpp b/cpp/include/cube.hpp
index f612ef4..e6719a7 100644
--- a/cpp/include/cube.hpp
+++ b/cpp/include/cube.hpp
@@ -7,6 +7,7 @@
 #include <memory>
 #include <unordered_set>
 #include <vector>
+#include <atomic>
 
 #include "utils.hpp"
 
@@ -45,20 +46,47 @@ using XYZSet = std::unordered_set<XYZ, HashXYZ, std::equal_to<XYZ>>;
 
 struct Cube {
    private:
-    struct {
-        uint8_t is_shared : 1;
-        uint8_t size : 7;  // MAX 127
-    } bits;
-    XYZ *array = nullptr;
-
-    static_assert(sizeof(bits) == sizeof(uint8_t));
+    // cube memory is stored two ways:
+    // normal, new'd buffer
+    // shared, external memory.
+
+    struct bits_t {
+        uint64_t is_shared : 1;
+        uint64_t size : 7;   // MAX 127
+        uint64_t addr : 56;  // low 56-bits of memory address.
+    };
+    // fields
+    bits_t fields;
+
+    static_assert(sizeof(bits_t) == sizeof(void*));
+
+    static XYZ *get(bits_t key) {
+        // pointer bit-hacking:
+        uint64_t addr = key.addr;
+        return reinterpret_cast<XYZ *>(addr);
+    }
 
+    static bits_t put(bool is_shared, int size, XYZ *addr) {
+        // mask off top byte from the memory address to fit it into bits_t::addr
+        // on x86-64 it is not used by the hardware (yet).
+        // This hack actually saves 8 bytes because previously
+        // the uint8_t caused padding to 16 bytes.
+        // @note if we get segfaults dereferencing get(fields)
+        // then this is the problem and this hack must be undone.
+        uint64_t tmp = reinterpret_cast<uint64_t>((void *)addr);
+        tmp &= 0xffffffffffffff;
+        bits_t bits;
+        bits.addr = tmp;
+        bits.is_shared = is_shared;
+        bits.size = size;
+        return bits;
+    }
    public:
     // Empty cube
-    Cube() : bits{0, 0} {}
+    Cube() : fields{put(0, 0, nullptr)} {}
 
     // Cube with N capacity
-    explicit Cube(uint8_t N) : bits{0, N}, array(new XYZ[bits.size]) {}
+    explicit Cube(uint8_t N) : fields{put(0,N, new XYZ[N])} {}
 
     // Construct from pieces
     Cube(std::initializer_list<XYZ> il) : Cube(il.size()) { std::copy(il.begin(), il.end(), begin()); }
@@ -69,20 +97,23 @@ struct Cube {
     // Construct from external source.
     // Cube shares this the memory until modified.
     // Caller guarantees the memory given will live longer than *this
-    Cube(const XYZ *start, uint8_t n) : bits{1, n}, array(const_cast<XYZ*>(start)) {}
+    Cube(const XYZ *start, uint8_t n) : fields{put(1,n,const_cast<XYZ*>(start))} {}
 
     // Copy ctor.
     Cube(const Cube &copy) : Cube(copy.size()) { std::copy(copy.begin(), copy.end(), begin()); }
 
     ~Cube() {
+        bits_t bits = fields;
         if (!bits.is_shared) {
-            delete[] array;
+            delete[] get(bits);
         }
     }
     friend void swap(Cube &a, Cube &b) {
         using std::swap;
-        swap(a.array, b.array);
-        swap(a.bits, b.bits);
+        bits_t abits = a.fields;
+        bits_t bbits = b.fields;
+        a.fields = bbits;
+        b.fields = abits;
     }
 
     Cube(Cube &&mv) : Cube() { swap(*this, mv); }
@@ -98,19 +129,15 @@ struct Cube {
         return *this;
     }
 
-    size_t size() const { return bits.size; }
+    size_t size() const { return fields.size; }
 
     XYZ *data() {
-        if (bits.is_shared) {
-            // lift to RAM: this should never happen really.
-            Cube tmp(array, bits.size);
-            swap(*this, tmp);
-            std::printf("Bad use of Cube\n");
-        }
-        return array;
-    }
+		return get(fields);
+	}
 
-    const XYZ *data() const { return array; }
+	const XYZ *data() const {
+		return get(fields);
+	}
 
     XYZ *begin() { return data(); }
 
@@ -140,6 +167,7 @@ struct Cube {
     }
 };
 
+static_assert(sizeof(Cube) == 8, "Unexpected sizeof(Cube) for Cube");
 static_assert(std::is_move_assignable_v<Cube>, "Cube must be moveable");
 static_assert(std::is_swappable_v<Cube>, "Cube must swappable");
 

From ca14b5501559f0fd953da0241a0e8e1b6087bc42 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Fri, 11 Aug 2023 19:54:07 +0300
Subject: [PATCH 23/42] Hashy const qualifiers.

- Small changes diffed.

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/include/cube.hpp   |  6 +++---
 cpp/include/hashes.hpp | 16 ++++++++++------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/cpp/include/cube.hpp b/cpp/include/cube.hpp
index e6719a7..72b9d97 100644
--- a/cpp/include/cube.hpp
+++ b/cpp/include/cube.hpp
@@ -47,8 +47,8 @@ using XYZSet = std::unordered_set<XYZ, HashXYZ, std::equal_to<XYZ>>;
 struct Cube {
    private:
     // cube memory is stored two ways:
-    // normal, new'd buffer
-    // shared, external memory.
+    // normal, new'd buffer: is_shared == false
+    // shared, external memory: is_shared == true
 
     struct bits_t {
         uint64_t is_shared : 1;
@@ -59,7 +59,7 @@ struct Cube {
     bits_t fields;
 
     static_assert(sizeof(bits_t) == sizeof(void*));
-
+    // extract the pointer from bits_t
     static XYZ *get(bits_t key) {
         // pointer bit-hacking:
         uint64_t addr = key.addr;
diff --git a/cpp/include/hashes.hpp b/cpp/include/hashes.hpp
index 09feeed..b154d2a 100644
--- a/cpp/include/hashes.hpp
+++ b/cpp/include/hashes.hpp
@@ -28,7 +28,7 @@ using CubeSet = std::unordered_set<Cube, HashCube, std::equal_to<Cube>>;
 struct Hashy {
     struct Subsubhashy {
         CubeSet set;
-        std::shared_mutex set_mutex;
+        mutable std::shared_mutex set_mutex;
 
         template <typename CubeT>
         void insert(CubeT &&c) {
@@ -36,12 +36,16 @@ struct Hashy {
             set.emplace(std::forward<CubeT>(c));
         }
 
-        bool contains(const Cube &c) {
+        bool contains(const Cube &c) const {
             std::shared_lock lock(set_mutex);
-            return set.count(c);
+            auto itr = set.find(c);
+            if(itr != set.end()) {
+				return true;
+			}
+            return false;
         }
 
-        auto size() {
+        auto size() const {
             std::shared_lock lock(set_mutex);
             return set.size();
         }
@@ -59,7 +63,7 @@ struct Hashy {
             // printf("new size %ld\n\r", byshape[shape].size());
         }
 
-        auto size() {
+        auto size() const {
             size_t sum = 0;
             for (auto &set : byhash) {
                 auto part = set.size();
@@ -95,7 +99,7 @@ struct Hashy {
         set.insert(std::forward<CubeT>(c));
     }
 
-    auto size() {
+    auto size() const {
         size_t sum = 0;
         DEBUG_PRINTF("%ld maps by shape\n\r", byshape.size());
         for (auto &set : byshape) {

From 74c0dc31db4e191a0a5816b4d9dfa7b290208e66 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Sat, 12 Aug 2023 02:40:59 +0300
Subject: [PATCH 24/42] cubes: Refactor thread scheduling

- Launching new threads is expensive.
  Refactor the cubes.cpp threading code so that
  the started threads are kept running until the main process is complete.
- Allow the main thread to do its preparation work
  in parallel with the running Workset.
  (The next cache file can be loaded while the old one is being processed.)

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/src/cubes.cpp | 138 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 104 insertions(+), 34 deletions(-)

diff --git a/cpp/src/cubes.cpp b/cpp/src/cubes.cpp
index bea7327..ac04e12 100644
--- a/cpp/src/cubes.cpp
+++ b/cpp/src/cubes.cpp
@@ -7,6 +7,8 @@
 #include <iostream>
 #include <mutex>
 #include <thread>
+#include <deque>
+#include <condition_variable>
 
 #include "cache.hpp"
 #include "cube.hpp"
@@ -19,22 +21,27 @@ const int PERF_STEP = 500;
 
 struct Workset {
     std::mutex mu;
+
+    CacheReader cr;
     CubeIterator _begin_total;
     CubeIterator _begin;
     CubeIterator _end;
     Hashy &hashes;
     XYZ targetShape, shape, expandDim;
     bool notSameShape;
-    Workset(ShapeRange &data, Hashy &hashes, XYZ targetShape, XYZ shape, XYZ expandDim, bool notSameShape)
-        : _begin_total(data.begin())
-        , _begin(data.begin())
-        , _end(data.end())
-        , hashes(hashes)
+    Workset(Hashy &hashes, XYZ targetShape, XYZ shape, XYZ expandDim, bool notSameShape)
+        : hashes(hashes)
         , targetShape(targetShape)
         , shape(shape)
         , expandDim(expandDim)
         , notSameShape(notSameShape) {}
 
+    void setRange(ShapeRange &data) {
+        _begin_total = data.begin();
+        _begin = data.begin();
+        _end = data.end();
+    }
+
     struct Subset {
         CubeIterator _begin, _end;
         bool valid;
@@ -131,26 +138,69 @@ struct Workset {
 };
 
 struct Worker {
-    Workset &ws;
+    std::shared_ptr<Workset> ws;
     int id;
-    Worker(Workset &ws_, int id_) : ws(ws_), id(id_) {}
+    int state = 3; // 1 == completed/waiting for job, 2 == processing, 3 == job assigned.
+    std::mutex mtx;
+    std::condition_variable cond;
+    std::condition_variable cond2;
+    std::thread thr;
+
+    Worker(int id_) : id(id_), thr(&Worker::run, this) {}
+    ~Worker() {
+        std::unique_lock lock(mtx);
+        state = 0;
+        cond.notify_one();
+        lock.unlock();
+        thr.join();
+    }
+
+    void launch(std::shared_ptr<Workset> ws_) {
+        std::unique_lock lock(mtx);
+        while(state > 1) {
+            cond2.wait(lock);
+        }
+        ws = ws_;
+        state = 3;
+        cond.notify_one();
+    }
+
+    void sync() {
+        std::unique_lock lock(mtx);
+        while(state > 1) {
+            cond2.wait(lock);
+        }
+        ws.reset();
+    }
+
     void run() {
-        // std::printf("start %d\n", id);
-        auto subset = ws.getPart();
-        while (subset.valid) {
-            if (id == 0) {
-                std::printf("  %5.2f%%\r", subset.percent);
-                std::flush(std::cout);
-            }
-            // std::cout << id << " next subset " << &*subset.begin() << " to " << &*subset.end() << "\n";
-            for (auto &c : subset) {
-                // std::printf("%p\n", (void *)&c);
-                // c.print();
-                ws.expand(c);
+        std::unique_lock lock(mtx);
+        std::printf("thread nro. %d started.\n", id);
+        while(state) {
+            state = 1;
+            cond2.notify_one();
+            while(state == 1)
+                cond.wait(lock);
+            if(!state)
+                return;
+            state = 2;
+            // std::printf("start %d\n", id);
+            auto subset = ws->getPart();
+            while (subset.valid) {
+                if (id == 0) {
+                    std::printf("  %5.2f%%\r", subset.percent);
+                    std::flush(std::cout);
+                }
+                // std::cout << id << " next subset " << &*subset.begin() << " to " << &*subset.end() << "\n";
+                for (auto &c : subset) {
+                    // std::printf("%p\n", (void *)&c);
+                    // c.print();
+                    ws->expand(c);
+                }
+                subset = ws->getPart();
             }
-            subset = ws.getPart();
+            // std::printf("finished %d\n", id);
         }
-        // std::printf("finished %d\n", id);
     }
 };
 
@@ -185,10 +235,19 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
     }
     std::printf("N = %d || generating new cubes from %lu base cubes.\n\r", n, base->size());
     hashes.init(n);
+
+    // Start worker threads.
+    std::deque<Worker> workers;
+    for (int i = 0; i < threads; ++i) {
+        workers.emplace_back(i);
+    }
+
+
     uint64_t totalSum = 0;
     auto start = std::chrono::steady_clock::now();
     uint32_t totalOutputShapes = hashes.byshape.size();
     uint32_t outShapeCount = 0;
+
     auto prevShapes = Hashy::generateShapes(n - 1);
     for (auto &tup : hashes.byshape) {
         outShapeCount++;
@@ -210,13 +269,14 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
             if (diffy == 1)
                 if (shape.y() == shape.x()) diffx = 1;
 
-            std::printf("  shape %d %d %d\n\r", shape.x(), shape.y(), shape.z());
+            auto ws = std::make_shared<Workset>(hashes, targetShape, shape, XYZ(diffx, diffy, diffz), abssum);
 
             if (use_split_cache) {
                 // load cache file only for this shape
                 std::string cachefile = base_path + "cubes_" + std::to_string(n - 1) + "_" + std::to_string(prevShapes[sid].x()) + "-" +
                                         std::to_string(prevShapes[sid].y()) + "-" + std::to_string(prevShapes[sid].z()) + ".bin";
-                cr.loadFile(cachefile);
+                ws->cr.loadFile(cachefile);
+                base = &ws->cr;
                 // cr.printHeader();
             }
             auto s = base->getCubesByShape(sid);
@@ -224,20 +284,26 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
                 std::printf("ERROR caches shape does not match expected shape!\n");
                 exit(-1);
             }
-            // std::printf("starting %d threads\n\r", threads);
-            std::vector<std::thread> ts;
-            Workset ws(s, hashes, targetShape, shape, XYZ(diffx, diffy, diffz), abssum);
-            std::vector<Worker> workers;
-            ts.reserve(threads);
-            workers.reserve(threads);
-            for (int i = 0; i < threads; ++i) {
-                workers.emplace_back(ws, i);
-                ts.emplace_back(&Worker::run, std::ref(workers[i]));
+
+            ws->setRange(s);
+
+            // Wait for jobs to complete.
+            for (auto& thr : workers) {
+                thr.sync();
             }
-            for (int i = 0; i < threads; ++i) {
-                ts[i].join();
+            std::printf("  shape %d %d %d\n\r", shape.x(), shape.y(), shape.z());
+            // launch the new jobs.
+            // Because the workset is held by shared_ptr
+            // main thread can do above preparation work in parallel
+            // while the jobs are running.
+            for (auto& thr : workers) {
+                thr.launch(ws);
             }
         }
+        // Wait for jobs to complete.
+        for (auto& thr : workers) {
+            thr.sync();
+        }
         std::printf("  num: %lu\n\r", hashes.byshape[targetShape].size());
         totalSum += hashes.byshape[targetShape].size();
         if (write_cache && split_cache) {
@@ -252,6 +318,10 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
             }
         }
     }
+
+    // Stop the workers.
+    workers.clear();
+
     if (write_cache && !split_cache) {
         Cache::save(base_path + "cubes_" + std::to_string(n) + ".bin", hashes, n);
     }

From 9ea1c7eaf0ab7b630d1b276f0b85398cd669b0ad Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Sun, 13 Aug 2023 20:13:47 +0300
Subject: [PATCH 25/42] CacheWriter class

Implement a replacement for Cache::save().
CacheWriter should produce files identical to those of the old code,
but is slightly faster as it doesn't wait for the file finalization.
The old code still exists as a reference, but nothing uses it except the tests.

- libmappedfile would allow the serialization process to be parallelized.
  (WIP, not implemented yet.)
- Move Header and ShapeEntry into the cacheformat namespace
- Implement CacheWriter
- Update cubes.cpp to use the new CacheWriter
- Add Cube::copyout() helper. The idea is that callers copy cube data
  through this helper, in case the cube representation becomes
  something other than a plain XYZ array.

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/include/cube.hpp     |   7 +++
 cpp/include/hashes.hpp   |   2 +-
 cpp/include/newCache.hpp |  74 +++++++++++++++---------
 cpp/src/cubes.cpp        |  12 ++--
 cpp/src/newCache.cpp     | 119 +++++++++++++++++++++++++++++++++++++--
 5 files changed, 178 insertions(+), 36 deletions(-)

diff --git a/cpp/include/cube.hpp b/cpp/include/cube.hpp
index 72b9d97..e92e570 100644
--- a/cpp/include/cube.hpp
+++ b/cpp/include/cube.hpp
@@ -165,6 +165,13 @@ struct Cube {
     void print() const {
         for (auto &p : *this) std::printf("  (%2d %2d %2d)\n\r", p.x(), p.y(), p.z());
     }
+
+    /**
+     * Copy cube data into destination buffer.
+     */
+    void copyout(int num, XYZ* dest) const {
+        std::copy_n(begin(), num, dest);
+    }
 };
 
 static_assert(sizeof(Cube) == 8, "Unexpected sizeof(Cube) for Cube");
diff --git a/cpp/include/hashes.hpp b/cpp/include/hashes.hpp
index b154d2a..49462d2 100644
--- a/cpp/include/hashes.hpp
+++ b/cpp/include/hashes.hpp
@@ -101,7 +101,7 @@ struct Hashy {
 
     auto size() const {
         size_t sum = 0;
-        DEBUG_PRINTF("%ld maps by shape\n\r", byshape.size());
+        DEBUG1_PRINTF("%ld maps by shape\n\r", byshape.size());
         for (auto &set : byshape) {
             auto part = set.second.size();
             DEBUG1_PRINTF("bucket [%2d %2d %2d]: %ld\n", set.first.x(), set.first.y(), set.first.z(), part);
diff --git a/cpp/include/newCache.hpp b/cpp/include/newCache.hpp
index ca3d71f..242a273 100644
--- a/cpp/include/newCache.hpp
+++ b/cpp/include/newCache.hpp
@@ -4,11 +4,33 @@
 #include <cstring>
 #include <string>
 
+#include <deque>
+#include <future>
+
 #include "cube.hpp"
 #include "hashes.hpp"
 #include "mapped_file.hpp"
 
-class Workset;
+namespace cacheformat {
+    static constexpr uint32_t MAGIC = 0x42554350;
+    static constexpr uint32_t XYZ_SIZE = 3;
+    static constexpr uint32_t ALL_SHAPES = -1;
+
+    struct Header {
+        uint32_t magic = MAGIC;  // shoud be "PCUB" = 0x42554350
+        uint32_t n;              // we will never need 32bit but it is nicely aligned
+        uint32_t numShapes;      // defines length of the shapeTable
+        uint64_t numPolycubes;   // total number of polycubes
+    };
+    struct ShapeEntry {
+        uint8_t dim0;      // offset by -1
+        uint8_t dim1;      // offset by -1
+        uint8_t dim2;      // offset by -1
+        uint8_t reserved;  // for alignment
+        uint64_t offset;   // from beginning of file
+        uint64_t size;     // in bytes should be multiple of XYZ_SIZE
+    };
+};
 
 class CubeIterator {
    public:
@@ -52,7 +74,6 @@ class CubeIterator {
     friend bool operator<(const CubeIterator& a, const CubeIterator& b) { return a.m_ptr < b.m_ptr; };
     friend bool operator>(const CubeIterator& a, const CubeIterator& b) { return a.m_ptr > b.m_ptr; };
     friend bool operator!=(const CubeIterator& a, const CubeIterator& b) { return a.m_ptr != b.m_ptr; };
-    //friend class Workset;
 
    private:
     uint32_t n;
@@ -101,25 +122,6 @@ class CacheReader : public ICache {
     uint32_t numShapes() override { return header->numShapes; };
     operator bool() { return fileLoaded_; }
 
-    static constexpr uint32_t MAGIC = 0x42554350;
-    static constexpr uint32_t XYZ_SIZE = 3;
-    static constexpr uint32_t ALL_SHAPES = -1;
-
-    struct Header {
-        uint32_t magic = MAGIC;  // shoud be "PCUB" = 0x42554350
-        uint32_t n;              // we will never need 32bit but it is nicely aligned
-        uint32_t numShapes;      // defines length of the shapeTable
-        uint64_t numPolycubes;   // total number of polycubes
-    };
-    struct ShapeEntry {
-        uint8_t dim0;      // offset by -1
-        uint8_t dim1;      // offset by -1
-        uint8_t dim2;      // offset by -1
-        uint8_t reserved;  // for alignment
-        uint64_t offset;   // from beginning of file
-        uint64_t size;     // in bytes should be multiple of XYZ_SIZE
-    };
-
     // Do begin() and end() make sense for CacheReader
     // If the cache file provides data for more than single shape?
     // The data might not even be mapped contiguously to save memory.
@@ -138,15 +140,15 @@ class CacheReader : public ICache {
 
    private:
     std::shared_ptr<mapped::file> file_;
-    std::unique_ptr<const mapped::struct_region<Header>> header_;
-    std::unique_ptr<const mapped::array_region<ShapeEntry>> shapes_;
+    std::unique_ptr<const mapped::struct_region<cacheformat::Header>> header_;
+    std::unique_ptr<const mapped::array_region<cacheformat::ShapeEntry>> shapes_;
     std::unique_ptr<const mapped::array_region<XYZ>> xyz_;
 
     std::string path_;
     bool fileLoaded_;
-    const Header dummyHeader;
-    const Header* header;
-    const ShapeEntry* shapes;
+    const cacheformat::Header dummyHeader;
+    const cacheformat::Header* header;
+    const cacheformat::ShapeEntry* shapes;
 };
 
 class FlatCache : public ICache {
@@ -180,4 +182,24 @@ class FlatCache : public ICache {
     size_t size() override { return allXYZs.size() / n / sizeof(XYZ); }
 };
 
+class CacheWriter {
+protected:
+    // CacheWriter flushes the data in background.
+    std::deque<std::future<void>> m_flushes;
+public:
+    CacheWriter() {}
+    ~CacheWriter();
+
+    /**
+     * Capture snapshot of the Hashy and write cache file.
+     * The data may not be entirely flushed before save() returns.
+     */
+    void save(std::string path, Hashy &hashes, uint8_t n);
+
+    /**
+     * Complete all flushes immediately.
+     */
+    void flush();
+};
+
 #endif
diff --git a/cpp/src/cubes.cpp b/cpp/src/cubes.cpp
index ac04e12..7e5cf66 100644
--- a/cpp/src/cubes.cpp
+++ b/cpp/src/cubes.cpp
@@ -10,7 +10,6 @@
 #include <deque>
 #include <condition_variable>
 
-#include "cache.hpp"
 #include "cube.hpp"
 #include "hashes.hpp"
 #include "newCache.hpp"
@@ -216,7 +215,8 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
         hashes.insert(Cube{{XYZ(0, 0, 0)}}, XYZ(0, 0, 0));
         std::printf("%ld elements for %d\n\r", hashes.size(), n);
         if (write_cache) {
-            Cache::save(base_path + "cubes_" + std::to_string(n) + ".bin", hashes, n);
+            CacheWriter cw;
+            cw.save(base_path + "cubes_" + std::to_string(n) + ".bin", hashes, n);
         }
         return FlatCache(hashes, n);
     }
@@ -242,6 +242,7 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
         workers.emplace_back(i);
     }
 
+    CacheWriter cw;
 
     uint64_t totalSum = 0;
     auto start = std::chrono::steady_clock::now();
@@ -307,7 +308,7 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
         std::printf("  num: %lu\n\r", hashes.byshape[targetShape].size());
         totalSum += hashes.byshape[targetShape].size();
         if (write_cache && split_cache) {
-            Cache::save(base_path + "cubes_" + std::to_string(n) + "_" + std::to_string(targetShape.x()) + "-" + std::to_string(targetShape.y()) + "-" +
+            cw.save(base_path + "cubes_" + std::to_string(n) + "_" + std::to_string(targetShape.x()) + "-" + std::to_string(targetShape.y()) + "-" +
                             std::to_string(targetShape.z()) + ".bin",
                         hashes, n);
         }
@@ -323,8 +324,11 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
     workers.clear();
 
     if (write_cache && !split_cache) {
-        Cache::save(base_path + "cubes_" + std::to_string(n) + ".bin", hashes, n);
+        cw.save(base_path + "cubes_" + std::to_string(n) + ".bin", hashes, n);
     }
+
+    cw.flush();
+
     auto end = std::chrono::steady_clock::now();
     auto dt_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
     std::printf("took %.2f s\033[0K\n\r", dt_ms / 1000.f);
diff --git a/cpp/src/newCache.cpp b/cpp/src/newCache.cpp
index cd9726c..f7af6ed 100644
--- a/cpp/src/newCache.cpp
+++ b/cpp/src/newCache.cpp
@@ -41,16 +41,16 @@ int CacheReader::loadFile(const std::string path) {
     }
 
     // map the header struct
-    header_ = std::make_unique<const mapped::struct_region<Header>>(file_, 0);
+    header_ = std::make_unique<const mapped::struct_region<cacheformat::Header>>(file_, 0);
     header = header_->get();
 
-    if (header->magic != MAGIC) {
+    if (header->magic != cacheformat::MAGIC) {
         std::printf("error opening file: file not recognized\n");
         return 1;
     }
 
     // map the ShapeEntry array:
-    shapes_ = std::make_unique<const mapped::array_region<ShapeEntry>>(file_, header_->getEndSeek(), (*header_)->numShapes);
+    shapes_ = std::make_unique<const mapped::array_region<cacheformat::ShapeEntry>>(file_, header_->getEndSeek(), (*header_)->numShapes);
     shapes = shapes_->get();
 
     size_t datasize = 0;
@@ -84,8 +84,8 @@ ShapeRange CacheReader::getCubesByShape(uint32_t i) {
     for (unsigned int k = 0; k < i; ++k) {
         offset += shapes[k].size;
     }
-    auto index = offset / XYZ_SIZE;
-    auto num_xyz = shapes[i].size / XYZ_SIZE;
+    auto index = offset / cacheformat::XYZ_SIZE;
+    auto num_xyz = shapes[i].size / cacheformat::XYZ_SIZE;
     // pointers to Cube data:
     auto start = xyz_->get() + index;
     auto end = xyz_->get() + index + num_xyz;
@@ -106,3 +106,112 @@ void CacheReader::unload() {
 }
 
 CacheReader::~CacheReader() { unload(); }
+
+CacheWriter::CacheWriter::~CacheWriter()
+{
+    flush();
+}
+
+
+void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
+    if (hashes.size() == 0) return;
+
+    using namespace mapped;
+    using namespace cacheformat;
+
+    auto file_ = std::make_shared<file>();
+    if (file_->openrw(path.c_str(), 0)) {
+        std::printf("error opening file\n");
+        return;
+    }
+
+    auto header = std::make_unique<struct_region<Header>>(file_, 0);
+    (*header)->magic = cacheformat::MAGIC;
+    (*header)->n = n;
+    (*header)->numShapes = hashes.byshape.size();
+    (*header)->numPolycubes = hashes.size();
+
+    std::vector<XYZ> keys;
+    keys.reserve((*header)->numShapes);
+    for (auto &pair : hashes.byshape) keys.push_back(pair.first);
+    std::sort(keys.begin(), keys.end());
+
+    auto shapeEntry = std::make_unique<array_region<ShapeEntry>>(file_, header->getEndSeek(), (*header)->numShapes);
+
+    uint64_t offset = shapeEntry->getEndSeek();
+    size_t num_cubes = 0;
+    int i = 0;
+    for (auto &key : keys) {
+        auto& se = (*shapeEntry)[i++];
+        se.dim0 = key.x();
+        se.dim1 = key.y();
+        se.dim2 = key.z();
+        se.reserved = 0;
+        se.offset = offset;
+        auto count = hashes.byshape[key].size() ;
+        num_cubes += count;
+        se.size = count * XYZ_SIZE * n;
+        offset += se.size;
+    }
+
+    // put XYZs
+    // do this in parallel?
+    // it takes an long while to write out the file.
+    // note: we are at peak memory use in this function.
+
+    auto xyz = std::make_unique<array_region<XYZ>>(file_, (*shapeEntry)[0].offset, num_cubes * n);
+    auto put = xyz->get();
+
+    for (auto &key : keys) {
+        for (auto &subset : hashes.byshape[key].byhash) {
+            auto itr = subset.set.begin();
+            while(itr != subset.set.end()) {
+                static_assert(sizeof(XYZ) == XYZ_SIZE);
+                assert(itr->size() == n);
+                itr->copyout(n, put);
+                put += n;
+                ++itr;
+            }
+        }
+    }
+    // move the resources into lambda and async launch it.
+    // the file is finalized in background.
+    m_flushes.emplace_back(std::async(std::launch::async, [
+        file = std::move(file_),
+        header = std::move(header),
+        shapeEntry = std::move(shapeEntry),
+        xyz = std::move(xyz)]() mutable {
+            // flush.
+            header->flush();
+            shapeEntry->flush();
+            xyz->flush();
+            // Truncate file to proper size.
+            file->truncate(xyz->getEndSeek());
+            file->close();
+            xyz.reset();
+            shapeEntry.reset();
+            header.reset();
+            file.reset();
+    }));
+
+    // cleanup completed flushes. (don't wait)
+    auto rm = std::remove_if(m_flushes.begin(), m_flushes.end(), [](auto& fut) {
+        if(fut.wait_for(std::chrono::seconds(0)) == std::future_status::ready) {
+            fut.get();
+            return true;
+        }
+        return false;
+    });
+    m_flushes.erase(rm, m_flushes.end());
+
+    std::printf("saved %s, %d unfinished.\n\r", path.c_str(), (int)m_flushes.size());
+}
+
+void CacheWriter::flush()
+{
+    for(auto& fut : m_flushes) {
+        fut.get();
+    }
+    m_flushes.clear();
+}
+

From d850fdbea0f8ee5ad2c617c6ace86b7d98167626 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Mon, 14 Aug 2023 04:23:15 +0300
Subject: [PATCH 26/42] CacheWriter: Parallel serialization

- CacheWriter now uses a thread pool and copies the Hashy using
  worker threads. This would not be possible without libmapped_file.
  (N=13 now completes in less than 310 seconds, depending on the disk.)
- Add a nice progress bar

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/include/newCache.hpp |  70 ++++++++++-------
 cpp/src/cubes.cpp        |   4 +-
 cpp/src/newCache.cpp     | 164 ++++++++++++++++++++++++++++-----------
 3 files changed, 163 insertions(+), 75 deletions(-)

diff --git a/cpp/include/newCache.hpp b/cpp/include/newCache.hpp
index 242a273..888ff14 100644
--- a/cpp/include/newCache.hpp
+++ b/cpp/include/newCache.hpp
@@ -1,36 +1,38 @@
 #pragma once
 #ifndef OPENCUBES_NEWCACHE_HPP
 #define OPENCUBES_NEWCACHE_HPP
+#include <condition_variable>
 #include <cstring>
-#include <string>
-
 #include <deque>
-#include <future>
+#include <functional>
+#include <mutex>
+#include <string>
+#include <thread>
 
 #include "cube.hpp"
 #include "hashes.hpp"
 #include "mapped_file.hpp"
 
 namespace cacheformat {
-    static constexpr uint32_t MAGIC = 0x42554350;
-    static constexpr uint32_t XYZ_SIZE = 3;
-    static constexpr uint32_t ALL_SHAPES = -1;
-
-    struct Header {
-        uint32_t magic = MAGIC;  // shoud be "PCUB" = 0x42554350
-        uint32_t n;              // we will never need 32bit but it is nicely aligned
-        uint32_t numShapes;      // defines length of the shapeTable
-        uint64_t numPolycubes;   // total number of polycubes
-    };
-    struct ShapeEntry {
-        uint8_t dim0;      // offset by -1
-        uint8_t dim1;      // offset by -1
-        uint8_t dim2;      // offset by -1
-        uint8_t reserved;  // for alignment
-        uint64_t offset;   // from beginning of file
-        uint64_t size;     // in bytes should be multiple of XYZ_SIZE
-    };
+static constexpr uint32_t MAGIC = 0x42554350;
+static constexpr uint32_t XYZ_SIZE = 3;
+static constexpr uint32_t ALL_SHAPES = -1;
+
+struct Header {
+    uint32_t magic = MAGIC;  // shoud be "PCUB" = 0x42554350
+    uint32_t n;              // we will never need 32bit but it is nicely aligned
+    uint32_t numShapes;      // defines length of the shapeTable
+    uint64_t numPolycubes;   // total number of polycubes
+};
+struct ShapeEntry {
+    uint8_t dim0;      // offset by -1
+    uint8_t dim1;      // offset by -1
+    uint8_t dim2;      // offset by -1
+    uint8_t reserved;  // for alignment
+    uint64_t offset;   // from beginning of file
+    uint64_t size;     // in bytes should be multiple of XYZ_SIZE
 };
+};  // namespace cacheformat
 
 class CubeIterator {
    public:
@@ -183,18 +185,32 @@ class FlatCache : public ICache {
 };
 
 class CacheWriter {
-protected:
-    // CacheWriter flushes the data in background.
-    std::deque<std::future<void>> m_flushes;
-public:
-    CacheWriter() {}
+   protected:
+    std::mutex m_mtx;
+    std::condition_variable m_run;
+    std::condition_variable m_wait;
+    bool m_active = true;
+
+    // Jobs that flush and finalize the written file.
+    std::deque<std::function<void(void)>> m_flushes;
+
+    // Temporary copy jobs into the memory mapped file.
+    std::deque<std::function<void(void)>> m_copy;
+
+    // thread pool executing the jobs.
+    std::deque<std::thread> m_flushers;
+
+    void run();
+
+   public:
+    CacheWriter(int num_threads = 8);
     ~CacheWriter();
 
     /**
      * Capture snapshot of the Hashy and write cache file.
      * The data may not be entirely flushed before save() returns.
      */
-    void save(std::string path, Hashy &hashes, uint8_t n);
+    void save(std::string path, Hashy& hashes, uint8_t n);
 
     /**
      * Complete all flushes immediately.
diff --git a/cpp/src/cubes.cpp b/cpp/src/cubes.cpp
index 7e5cf66..1630bb6 100644
--- a/cpp/src/cubes.cpp
+++ b/cpp/src/cubes.cpp
@@ -215,7 +215,7 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
         hashes.insert(Cube{{XYZ(0, 0, 0)}}, XYZ(0, 0, 0));
         std::printf("%ld elements for %d\n\r", hashes.size(), n);
         if (write_cache) {
-            CacheWriter cw;
+            CacheWriter cw(1);
             cw.save(base_path + "cubes_" + std::to_string(n) + ".bin", hashes, n);
         }
         return FlatCache(hashes, n);
@@ -242,7 +242,7 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
         workers.emplace_back(i);
     }
 
-    CacheWriter cw;
+    CacheWriter cw(threads);
 
     uint64_t totalSum = 0;
     auto start = std::chrono::steady_clock::now();
diff --git a/cpp/src/newCache.cpp b/cpp/src/newCache.cpp
index f7af6ed..4d95792 100644
--- a/cpp/src/newCache.cpp
+++ b/cpp/src/newCache.cpp
@@ -1,8 +1,4 @@
-#include "../include/newCache.hpp"
-
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <unistd.h>
+#include "newCache.hpp"
 
 #include <iostream>
 
@@ -107,11 +103,54 @@ void CacheReader::unload() {
 
 CacheReader::~CacheReader() { unload(); }
 
-CacheWriter::CacheWriter::~CacheWriter()
-{
+CacheWriter::CacheWriter(int num_threads) {
+    for (int i = 0; i < num_threads; ++i) {
+        m_flushers.emplace_back(&CacheWriter::run, this);
+    }
+}
+
+CacheWriter::CacheWriter::~CacheWriter() {
     flush();
+    // stop the threads.
+    std::unique_lock lock(m_mtx);
+    m_active = false;
+    m_run.notify_all();
+    lock.unlock();
+    for (auto &thr : m_flushers) thr.join();
 }
 
+void CacheWriter::run() {
+    std::unique_lock lock(m_mtx);
+    while (m_active) {
+        // do copy jobs:
+        if (!m_copy.empty()) {
+            auto task = std::move(m_copy.front());
+            m_copy.pop_front();
+            lock.unlock();
+
+            task();
+
+            lock.lock();
+            continue;
+        }
+        // file flushes:
+        if (!m_flushes.empty()) {
+            auto task = std::move(m_flushes.front());
+            m_flushes.pop_front();
+            lock.unlock();
+
+            task();
+
+            lock.lock();
+            continue;
+        }
+        // notify that we are done here.
+        m_wait.notify_one();
+        // wait for jobs.
+        m_run.wait(lock);
+    }
+    m_wait.notify_one();
+}
 
 void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
     if (hashes.size() == 0) return;
@@ -125,7 +164,7 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
         return;
     }
 
-    auto header = std::make_unique<struct_region<Header>>(file_, 0);
+    auto header = std::make_shared<struct_region<Header>>(file_, 0);
     (*header)->magic = cacheformat::MAGIC;
     (*header)->n = n;
     (*header)->numShapes = hashes.byshape.size();
@@ -136,51 +175,91 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
     for (auto &pair : hashes.byshape) keys.push_back(pair.first);
     std::sort(keys.begin(), keys.end());
 
-    auto shapeEntry = std::make_unique<array_region<ShapeEntry>>(file_, header->getEndSeek(), (*header)->numShapes);
+    auto shapeEntry = std::make_shared<array_region<ShapeEntry>>(file_, header->getEndSeek(), (*header)->numShapes);
 
     uint64_t offset = shapeEntry->getEndSeek();
     size_t num_cubes = 0;
     int i = 0;
     for (auto &key : keys) {
-        auto& se = (*shapeEntry)[i++];
+        auto &se = (*shapeEntry)[i++];
         se.dim0 = key.x();
         se.dim1 = key.y();
         se.dim2 = key.z();
         se.reserved = 0;
         se.offset = offset;
-        auto count = hashes.byshape[key].size() ;
+        auto count = hashes.byshape[key].size();
         num_cubes += count;
         se.size = count * XYZ_SIZE * n;
         offset += se.size;
     }
 
     // put XYZs
-    // do this in parallel?
-    // it takes an long while to write out the file.
-    // note: we are at peak memory use in this function.
+    // Serialize large CubeSet(s) in parallel.
 
-    auto xyz = std::make_unique<array_region<XYZ>>(file_, (*shapeEntry)[0].offset, num_cubes * n);
+    auto xyz = std::make_shared<array_region<XYZ>>(file_, (*shapeEntry)[0].offset, num_cubes * n);
     auto put = xyz->get();
 
+    auto copyrange = [n](CubeSet::iterator itr, CubeSet::iterator end, XYZ *dest) -> void {
+        while (itr != end) {
+            static_assert(sizeof(XYZ) == XYZ_SIZE);
+            assert(itr->size() == n);
+            itr->copyout(n, dest);
+            dest += n;
+            ++itr;
+        }
+    };
+
+    auto time_start = std::chrono::steady_clock::now();
     for (auto &key : keys) {
         for (auto &subset : hashes.byshape[key].byhash) {
             auto itr = subset.set.begin();
-            while(itr != subset.set.end()) {
-                static_assert(sizeof(XYZ) == XYZ_SIZE);
-                assert(itr->size() == n);
-                itr->copyout(n, put);
-                put += n;
-                ++itr;
+
+            ptrdiff_t dist = subset.set.size();
+            // distribute if range is large enough.
+            auto skip = std::max(4096L, std::max(1L, dist / (signed)m_flushers.size()));
+            while (dist > skip) {
+                auto start = itr;
+                auto dest = put;
+
+                auto inc = std::min(dist, skip);
+                std::advance(itr, inc);
+                put += n * inc;
+                dist = std::distance(itr, subset.set.end());
+
+                auto done = 100.0f * (std::distance(xyz->get(), put) / float(num_cubes * n));
+                std::printf("writing data %5.2f%% ...  \r", done);
+                std::flush(std::cout);
+
+                std::lock_guard lock(m_mtx);
+                m_copy.emplace_back(std::bind(copyrange, start, itr, dest));
+                m_run.notify_all();
+            }
+            // copy remainder, if any.
+            if (dist) {
+                std::lock_guard lock(m_mtx);
+                m_copy.emplace_back(std::bind(copyrange, itr, subset.set.end(), put));
+                m_run.notify_all();
+                put += n * dist;
+
+                auto done = 100.0f * (std::distance(xyz->get(), put) / float(num_cubes * n));
+                std::printf("writing data %5.2f%% ...  \r", done);
+                std::flush(std::cout);
             }
         }
     }
-    // move the resources into lambda and async launch it.
-    // the file is finalized in background.
-    m_flushes.emplace_back(std::async(std::launch::async, [
-        file = std::move(file_),
-        header = std::move(header),
-        shapeEntry = std::move(shapeEntry),
-        xyz = std::move(xyz)]() mutable {
+
+    // sanity check:
+    assert(put == (*xyz).get() + num_cubes * n);
+
+    // sync up.
+    std::unique_lock lock(m_mtx);
+    while (!m_copy.empty()) {
+        m_wait.wait(lock);
+    }
+
+    // move the resources into flush job.
+    m_flushes.emplace_back(std::bind(
+        [](auto &&file, auto &&header, auto &&shapeEntry, auto &&xyz) -> void {
             // flush.
             header->flush();
             shapeEntry->flush();
@@ -188,30 +267,23 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
             // Truncate file to proper size.
             file->truncate(xyz->getEndSeek());
             file->close();
+            file.reset();
             xyz.reset();
             shapeEntry.reset();
             header.reset();
-            file.reset();
-    }));
+        },
+        std::move(file_), std::move(header), std::move(shapeEntry), std::move(xyz)));
+    m_run.notify_all();
 
-    // cleanup completed flushes. (don't wait)
-    auto rm = std::remove_if(m_flushes.begin(), m_flushes.end(), [](auto& fut) {
-        if(fut.wait_for(std::chrono::seconds(0)) == std::future_status::ready) {
-            fut.get();
-            return true;
-        }
-        return false;
-    });
-    m_flushes.erase(rm, m_flushes.end());
+    auto time_end = std::chrono::steady_clock::now();
+    auto dt_ms = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
 
-    std::printf("saved %s, %d unfinished.\n\r", path.c_str(), (int)m_flushes.size());
+    std::printf("saved %s, took %.2f s\n\r", path.c_str(), dt_ms / 1000.f);
 }
 
-void CacheWriter::flush()
-{
-    for(auto& fut : m_flushes) {
-        fut.get();
+void CacheWriter::flush() {
+    std::unique_lock lock(m_mtx);
+    while (!m_flushes.empty()) {
+        m_wait.wait(lock);
     }
-    m_flushes.clear();
 }
-

From 5e0d24567aed37118035c23bc8e083c9232f810e Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Mon, 14 Aug 2023 04:37:48 +0300
Subject: [PATCH 27/42] Remove include/cache.hpp src/cache.cpp

The old cache code has been deprecated since CacheWriter arrived.
Its only user was tests/src/test_cache.cpp, so drop that test case
because it doesn't have any impact on the main cubes anymore.

- Delete include/cache.hpp src/cache.cpp source files.
  Hopefully they will not be missed. :-)

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/CMakeLists.txt           |   1 -
 cpp/include/cache.hpp        |  35 --------
 cpp/src/cache.cpp            | 163 -----------------------------------
 cpp/tests/src/test_cache.cpp |  10 ---
 4 files changed, 209 deletions(-)
 delete mode 100644 cpp/include/cache.hpp
 delete mode 100644 cpp/src/cache.cpp
 delete mode 100644 cpp/tests/src/test_cache.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 05e50f0..78e91a2 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -44,7 +44,6 @@ ConfigureTarget(mapped_file)
 # Source files
 add_library(CubeObjs OBJECT
 	"src/cubes.cpp"
-	"src/cache.cpp"
 	"src/rotations.cpp"
 	"src/newCache.cpp"
 )
diff --git a/cpp/include/cache.hpp b/cpp/include/cache.hpp
deleted file mode 100644
index 6c3480d..0000000
--- a/cpp/include/cache.hpp
+++ /dev/null
@@ -1,35 +0,0 @@
-#pragma once
-#ifndef OPENCUBES_CACHE_HPP
-#define OPENCUBES_CACHE_HPP
-#include <string>
-
-#include "hashes.hpp"
-#include "utils.hpp"
-
-struct Cache {
-    static constexpr uint32_t MAGIC = 0x42554350;
-    static constexpr uint32_t XYZ_SIZE = 3;
-    static constexpr uint32_t ALL_SHAPES = -1;
-    struct Header {
-        uint32_t magic = MAGIC;  // shoud be "PCUB" = 0x42554350
-        uint32_t n;              // we will never need 32bit but it is nicely aligned
-        uint32_t numShapes;      // defines length of the shapeTable
-        uint64_t numPolycubes;   // total number of polycubes
-    };
-    struct ShapeEntry {
-        uint8_t dim0;      // offset by -1
-        uint8_t dim1;      // offset by -1
-        uint8_t dim2;      // offset by -1
-        uint8_t reserved;  // for alignment
-        uint64_t offset;   // from beginning of file
-        uint64_t size;     // in bytes should be multiple of XYZ_SIZE
-    };
-
-    static void save(std::string path, Hashy& hashes, uint8_t n);
-    static Hashy load(std::string path, uint32_t extractShape = ALL_SHAPES);
-
-    int filedesc;
-    void* mmap_ptr;
-};
-
-#endif
diff --git a/cpp/src/cache.cpp b/cpp/src/cache.cpp
deleted file mode 100644
index 071ff0f..0000000
--- a/cpp/src/cache.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-#include "cache.hpp"
-
-#include <algorithm>
-#include <fstream>
-#include <limits>
-#include <string>
-#include <unordered_set>
-
-#include "utils.hpp"
-
-/*
-====================
-cache file header
-====================
-
-uint32_t magic = "PCUB"
-uint32_t n = cache file for n cubes in a polycube
-uint32_t numShapes = number of different shapes in cachefile
--------
-
-====================
-shapetable:
-====================
-shapeEntry {
-    uint8_t dim0 // offset by -1
-    uint8_t dim1 // offset by -1
-    uint8_t dim2 // offset by -1
-    uint8_t reserved
-    uint64_t offset in file
-}
-shapeEntry[numShapes]
-
-
-====================
-XYZ data
-====================
-
-*/
-
-void Cache::save(std::string path, Hashy &hashes, uint8_t n) {
-    if (hashes.size() == 0) return;
-    std::ofstream ofs(path, std::ios::binary);
-    Header header;
-    header.magic = MAGIC;
-    header.n = n;
-    header.numShapes = hashes.byshape.size();
-    header.numPolycubes = hashes.size();
-    ofs.write((const char *)&header, sizeof(header));
-
-    std::vector<XYZ> keys;
-    keys.reserve(header.numShapes);
-    for (auto &pair : hashes.byshape) keys.push_back(pair.first);
-    std::sort(keys.begin(), keys.end());
-    uint64_t offset = sizeof(Header) + header.numShapes * sizeof(ShapeEntry);
-    for (auto &key : keys) {
-        ShapeEntry se;
-        se.dim0 = key.x();
-        se.dim1 = key.y();
-        se.dim2 = key.z();
-        se.reserved = 0;
-        se.offset = offset;
-        se.size = hashes.byshape[key].size() * XYZ_SIZE * n;
-        offset += se.size;
-        ofs.write((const char *)&se, sizeof(ShapeEntry));
-    }
-    // put XYZs
-    for (auto &key : keys) {
-        for (auto &subset : hashes.byshape[key].byhash)
-            for (const auto &c : subset.set) {
-                if constexpr (sizeof(XYZ) == XYZ_SIZE) {
-                    ofs.write((const char *)c.data(), sizeof(XYZ) * c.size());
-                } else {
-                    for (const auto &p : c) {
-                        ofs.write((const char *)p.data, XYZ_SIZE);
-                    }
-                }
-            }
-    }
-
-    std::printf("saved %s\n\r", path.c_str());
-}
-
-Hashy Cache::load(std::string path, uint32_t extractShape) {
-    Hashy cubes;
-    auto ifs = std::ifstream(path, std::ios::binary);
-    if (!ifs.is_open()) return cubes;
-    Header header;
-    if (!ifs.read((char *)&header, sizeof(header))) {
-        return cubes;
-    }
-    // check magic
-    if (header.magic != MAGIC) {
-        return cubes;
-    }
-#ifdef CACHE_LOAD_HEADER_ONLY
-    std::printf("loading cache file \"%s\" for N = %u", path.c_str(), header.n);
-    std::printf(", %u shapes, %lu XYZs\n\r", header.numShapes, header.numPolycubes);
-#endif
-    auto cubeSize = XYZ_SIZE * header.n;
-    DEBUG_PRINTF("cubeSize: %u\n\r", cubeSize);
-
-    for (uint32_t i = 0; i < header.numShapes; ++i) {
-        ShapeEntry shapeEntry;
-        if (!ifs.read((char *)&shapeEntry, sizeof(shapeEntry))) {
-            std::printf("ERROR reading ShapeEntry %u\n\r", i);
-            exit(-1);
-        }
-        if (ALL_SHAPES != extractShape && i != extractShape) continue;
-#ifdef CACHE_PRINT_SHAPEENTRIES
-        std::printf("ShapeEntry %3u: [%2d %2d %2d] offset: 0x%08lx size: 0x%08lx (%ld polycubes)\n\r", i, shapeEntry.dim0, shapeEntry.dim1, shapeEntry.dim2,
-                    shapeEntry.offset, shapeEntry.size, shapeEntry.size / cubeSize);
-#endif
-        if (shapeEntry.size % cubeSize != 0) {
-            std::printf("ERROR shape block is not divisible by cubeSize!\n\r");
-            exit(-1);
-        }
-#ifndef CACHE_LOAD_HEADER_ONLY
-        // remember pos in file
-        auto pos = ifs.tellg();
-
-        // read XYZ contents
-        ifs.seekg(shapeEntry.offset);
-        const uint32_t CHUNK_SIZE = 512 * XYZ_SIZE;
-        uint8_t buf[CHUNK_SIZE] = {0};
-        uint64_t buf_offset = 0;
-        uint32_t numCubes = shapeEntry.size / cubeSize;
-        XYZ shape(shapeEntry.dim0, shapeEntry.dim1, shapeEntry.dim2);
-        uint64_t readsize = shapeEntry.size - buf_offset;
-        if (readsize > CHUNK_SIZE) readsize = CHUNK_SIZE;
-        if (!ifs.read((char *)&buf, readsize)) {
-            std::printf("ERROR reading XYZs for Shape %u\n\r", i);
-            exit(-1);
-        }
-        for (uint32_t j = 0; j < numCubes; ++j) {
-            Cube next(header.n);
-            for (uint32_t k = 0; k < header.n; ++k) {
-                // check if buf contains next XYZ
-                uint64_t curr_offset = j * cubeSize + k * XYZ_SIZE;
-                if (curr_offset >= buf_offset + CHUNK_SIZE) {
-                    // std::printf("reload buffer\n\r");
-                    buf_offset += CHUNK_SIZE;
-                    readsize = shapeEntry.size - buf_offset;
-                    if (readsize > CHUNK_SIZE) readsize = CHUNK_SIZE;
-                    if (!ifs.read((char *)&buf, readsize)) {
-                        std::printf("ERROR reading XYZs for Shape %u\n\r", i);
-                        exit(-1);
-                    }
-                }
-
-                next.data()[k].data[0] = buf[curr_offset - buf_offset + 0];
-                next.data()[k].data[1] = buf[curr_offset - buf_offset + 1];
-                next.data()[k].data[2] = buf[curr_offset - buf_offset + 2];
-            }
-            cubes.insert(next, shape);
-        }
-
-        // restore pos
-        ifs.seekg(pos);
-#endif
-    }
-    std::printf("  loaded %lu cubes\n\r", cubes.size());
-    return cubes;
-}
diff --git a/cpp/tests/src/test_cache.cpp b/cpp/tests/src/test_cache.cpp
deleted file mode 100644
index ae10cbf..0000000
--- a/cpp/tests/src/test_cache.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-#include <gtest/gtest.h>
-
-#include "cache.hpp"
-
-TEST(CacheTests, TestCacheLoadDoesNotThrow) { EXPECT_NO_THROW(Cache::load("./test_data.bin")); }
-
-TEST(CacheTests, TestCacheSaveDoesNotThrow) {
-    auto data = Cache::load("./test_data.bin");
-    EXPECT_NO_THROW(Cache::save("./temp.bin", data, 255));
-}
\ No newline at end of file

From 95c6a07641e690ab96de3463d286d7cc185b10e8 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Mon, 14 Aug 2023 09:56:50 +0300
Subject: [PATCH 28/42] CacheWriter: Fix-up synchronization

CacheWriter didn't properly wait for queued job(s) to complete.
Fix this with a counter that is incremented when a job is queued and
decremented *after* the task has run.

Signed-off-by: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/include/newCache.hpp | 2 ++
 cpp/src/newCache.cpp     | 9 +++++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/cpp/include/newCache.hpp b/cpp/include/newCache.hpp
index 888ff14..c24a06b 100644
--- a/cpp/include/newCache.hpp
+++ b/cpp/include/newCache.hpp
@@ -192,9 +192,11 @@ class CacheWriter {
     bool m_active = true;
 
     // Jobs that flush and finalize the written file.
+    size_t m_num_flushes = 0;
     std::deque<std::function<void(void)>> m_flushes;
 
     // Temporary copy jobs into the memory mapped file.
+    size_t m_num_copys = 0;
     std::deque<std::function<void(void)>> m_copy;
 
     // thread pool executing the jobs.
diff --git a/cpp/src/newCache.cpp b/cpp/src/newCache.cpp
index 4d95792..93f15b3 100644
--- a/cpp/src/newCache.cpp
+++ b/cpp/src/newCache.cpp
@@ -131,6 +131,7 @@ void CacheWriter::run() {
             task();
 
             lock.lock();
+            --m_num_copys;
             continue;
         }
         // file flushes:
@@ -142,6 +143,7 @@ void CacheWriter::run() {
             task();
 
             lock.lock();
+            --m_num_flushes;
             continue;
         }
         // notify that we are done here.
@@ -232,12 +234,14 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
 
                 std::lock_guard lock(m_mtx);
                 m_copy.emplace_back(std::bind(copyrange, start, itr, dest));
+                ++m_num_copys;
                 m_run.notify_all();
             }
             // copy remainder, if any.
             if (dist) {
                 std::lock_guard lock(m_mtx);
                 m_copy.emplace_back(std::bind(copyrange, itr, subset.set.end(), put));
+                ++m_num_copys;
                 m_run.notify_all();
                 put += n * dist;
 
@@ -253,7 +257,7 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
 
     // sync up.
     std::unique_lock lock(m_mtx);
-    while (!m_copy.empty()) {
+    while (m_num_copys) {
         m_wait.wait(lock);
     }
 
@@ -273,6 +277,7 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
             header.reset();
         },
         std::move(file_), std::move(header), std::move(shapeEntry), std::move(xyz)));
+    ++m_num_flushes;
     m_run.notify_all();
 
     auto time_end = std::chrono::steady_clock::now();
@@ -283,7 +288,7 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
 
 void CacheWriter::flush() {
     std::unique_lock lock(m_mtx);
-    while (!m_flushes.empty()) {
+    while (m_num_flushes) {
         m_wait.wait(lock);
     }
 }

From e5a7bce4811be3add531d38a8be652c1d75053d9 Mon Sep 17 00:00:00 2001
From: JATothrim <jarmo.tiitto@gmail.com>
Date: Thu, 17 Aug 2023 22:52:27 +0300
Subject: [PATCH 29/42] Update Readme.md to reflect the state of the C++
 implementation.

The C++ implementation has gained the split cache files and their
associated command line parameters since Readme.md was last updated.
Document the `./cubes` program usage and how to use the split cache files.

Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/Readme.md | 52 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 48 insertions(+), 4 deletions(-)

diff --git a/cpp/Readme.md b/cpp/Readme.md
index ce7de8c..7e34fba 100644
--- a/cpp/Readme.md
+++ b/cpp/Readme.md
@@ -1,27 +1,71 @@
 # C++ implementation of opencubes
 - uses list representation of coordinates with ones
 - hashfunction for coordinate is simple concatination of bytes
-- can split problem into threads, but performance can be improoved
+- can split problem into threads, but performance can be improved
 
 ## usage:
 ```bash
 ./cubes -n N
 ```
-options:
+### options:
 ```
+-n    --cube_size
+the size of polycube to generate up to
+This parameter is required.
+
 -t    --threads
 the number of threads to use while generating
 This parameter is optional. The default value is '1'.
 
 -c    --use_cache
-whether to load cache files
+whether to load cache files.
+The last N-1 run must have used -w parameter and that process
+must have completed without errors. The cache file
+must be present under the cache folder. (-f parameter)
 This parameter is optional. The default value is '0'.
 
 -w    --write_cache
-wheather to save cache files
+whether to save cache files
+This parameter is optional. The default value is '0'.
+
+-s    --split_cache
+whether to save separated cache files per output shape.
+requires -w parameter to take affect.
+No combined cache file is saved when -s is present.
 This parameter is optional. The default value is '0'.
+
+-u    --use_split_cache
+whether to load separated cache files per output shape.
+The last N-1 run must have used -s parameter and that process
+must have completed without errors. The split cache file(s)
+must be present under the cache folder. (-f parameter)
+This parameter is optional. The default value is '0'.
+
+-f    --cache_file_folder
+where to store cache files.
+This parameter is optional. The default value is './cache/'.
 ```
 
+### split cache usage:
+Starting with N=9 and beyond it makes sense to use the disk cache system.
+To generate starting cache run:
+```bash
+./cubes -n 9 -w -s
+```
+
+The above saves the results into the cache folder (specified with the -f parameter)
+as split cache files. The next N=10 run can continue processing from where the last N=9 process stopped:
+```bash
+./cubes -n 10 -w -s -u
+```
+The split cache file mode attempts to minimize memory usage.
+All following runs can use above command by incrementing the N by one each time.
+
+If required, you can merge the split cache files
+back into a single file on the last run by dropping the `-s` parameter.
+Merging the split cache this way, however, uses vastly more memory.
+(A tool should be developed to export/merge the split cache files as a standard cube format file.)
+
 ## building (cmake)
 To build a release version (with optimisations , default)
 ```bash

From 9658905fcab738ae87eb7976a54ef55508300f25 Mon Sep 17 00:00:00 2001
From: JATothrim <jarmo.tiitto@gmail.com>
Date: Wed, 19 Jul 2023 20:18:30 +0300
Subject: [PATCH 30/42] Add build time configure options.

- Imported commit v2 for next branch.
- Current `git rev-list -n1 HEAD`, used compiler and build type
  and critical settings are embedded into the cubes binary.
- `cubes -v` now prints how it was built.
- CUBES_MAX_N constant now available from "config.hpp"
- CONFIG_PACK_CUBE_ADDR now available from "config.hpp"
- New options can be added into "config.hpp.in"
- Add anti-goof measure for the read-only config.hpp
  The config defines can be changed at cmake configure time.

Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/CMakeLists.txt | 27 ++++++++++++++++++++++++++-
 cpp/config.hpp.in  | 18 ++++++++++++++++++
 cpp/program.cpp    |  7 ++++++-
 3 files changed, 50 insertions(+), 2 deletions(-)
 create mode 100644 cpp/config.hpp.in

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 78e91a2..9da6b04 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -3,11 +3,36 @@ project(cubes CXX)
 
 # default to release build because speed maters.
 if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "Release")
+    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "CMAKE_BUILD_TYPE: Release, Debug or RelWithDebInfo" FORCE)
 endif()
 
+if(NOT BUILD_CUBES_MAX_N)
+	set(BUILD_CUBES_MAX_N 20 CACHE STRING "Limit of maximum N Polycubes to be computed" FORCE)
+endif()
+
+if(NOT BUILD_PACK_CUBE_ADDR)
+	set(BUILD_PACK_CUBE_ADDR 1 CACHE BOOL "Pack Cube struct XYZ memory address into 56-bit field." FORCE)
+endif()
+
+# Try extract current HEAD commit-id in git
+find_package(Git)
+if(GIT_FOUND)
+	execute_process(
+		COMMAND ${GIT_EXECUTABLE} rev-list -n1 HEAD
+		WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+		OUTPUT_STRIP_TRAILING_WHITESPACE
+		RESULT_VARIABLE RESULT
+		OUTPUT_VARIABLE CONFIG_GIT_VERSION)
+	message(STATUS "Set ${CONFIG_GIT_VERSION} to build version info")
+endif()
+
+# generate config.hpp header in build directory.
+set(CONFIG_IS_READONLY "Warning: this file is overwritten during build. Do not edit.")
+configure_file("config.hpp.in" "config.hpp")
+
 include_directories("include")
 include_directories("libraries")
+include_directories("${PROJECT_BINARY_DIR}")
 
 macro(ConfigureTarget Target)
 	# Enable C++17
diff --git a/cpp/config.hpp.in b/cpp/config.hpp.in
new file mode 100644
index 0000000..b70f440
--- /dev/null
+++ b/cpp/config.hpp.in
@@ -0,0 +1,18 @@
+#pragma once
+#ifndef OPENCUBES_CONFIG_HPP
+#define OPENCUBES_CONFIG_HPP
+
+// @CONFIG_IS_READONLY@
+
+// Version info embedded into the build
+#define CONFIG_VERSION "@CONFIG_GIT_VERSION@"
+#define CONFIG_BUILDTYPE "@CMAKE_BUILD_TYPE@"
+#define CONFIG_COMPILERID "@CMAKE_CXX_COMPILER_ID@ @CMAKE_CXX_COMPILER_VERSION@"
+
+// Enable Cube struct pointer compaction
+#define CONFIG_PACK_CUBE_ADDR @BUILD_PACK_CUBE_ADDR@
+
+// Maximum Polycubes N that may be computed
+#define CUBES_MAX_N @BUILD_CUBES_MAX_N@
+
+#endif
diff --git a/cpp/program.cpp b/cpp/program.cpp
index d9f0169..5c37ab0 100644
--- a/cpp/program.cpp
+++ b/cpp/program.cpp
@@ -1,14 +1,16 @@
 #include <iostream>
 
 #include "cmdparser.hpp"
+#include "config.hpp"
 #include "cubes.hpp"
 
 void configure_arguments(cli::Parser& parser) {
-    parser.set_required<int>("n", "cube_size", "the size of polycube to generate up to");
+    parser.set_optional<int>("n", "cube_size", 1, "the size of polycube to generate up to");
     parser.set_optional<int>("t", "threads", 1, "the number of threads to use while generating");
     parser.set_optional<bool>("c", "use_cache", false, "whether to load cache files");
     parser.set_optional<bool>("w", "write_cache", false, "wheather to save cache files");
     parser.set_optional<bool>("s", "split_cache", false, "wheather to save in sparate cache files per output shape");
+    parser.set_optional<bool>("v", "version", false, "print build version info");
     parser.set_optional<bool>("u", "use_split_cache", false, "use separate cachefile by input shape");
     parser.set_optional<std::string>("f", "cache_file_folder", "./cache/", "where to store cache files");
 }
@@ -17,6 +19,9 @@ int main(int argc, char** argv) {
     cli::Parser parser(argc, argv);
     configure_arguments(parser);
     parser.run_and_exit_if_error();
+    if (parser.get<bool>("v")) {
+        std::printf("Built from %s, %s, %s\n", CONFIG_VERSION, CONFIG_BUILDTYPE, CONFIG_COMPILERID);
+    }
     gen(parser.get<int>("n"), parser.get<int>("t"), parser.get<bool>("c"), parser.get<bool>("w"), parser.get<bool>("s"), parser.get<bool>("u"), parser.get<std::string>("f"));
     return 0;
 }

From b12bb50a7acd5ab8ccb4a4709ab07b10228410f6 Mon Sep 17 00:00:00 2001
From: JATothrim <jarmo.tiitto@gmail.com>
Date: Mon, 21 Aug 2023 18:01:38 +0300
Subject: [PATCH 31/42] Provide configure option for the Cube struct
 compaction.

- CUBES_PACK_CUBE_XYZ_ADDR CMake option.
  By default do still enable compaction of Cube struct into 8-bytes.
  If the hack does not work on some system, this can be set to OFF
  to revert the hack at configure time.
- Add assert into Cube::copyout()

Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/CMakeLists.txt   |  6 ++---
 cpp/config.hpp.in    |  2 +-
 cpp/include/cube.hpp | 54 +++++++++++++++++++++++++++++++-------------
 3 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 9da6b04..68344fe 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -7,11 +7,11 @@ if(NOT CMAKE_BUILD_TYPE)
 endif()
 
 if(NOT BUILD_CUBES_MAX_N)
-	set(BUILD_CUBES_MAX_N 20 CACHE STRING "Limit of maximum N Polycubes to be computed" FORCE)
+	set(BUILD_CUBES_MAX_N 20 CACHE STRING "Limit of maximum N Polycubes to be computed")
 endif()
 
-if(NOT BUILD_PACK_CUBE_ADDR)
-	set(BUILD_PACK_CUBE_ADDR 1 CACHE BOOL "Pack Cube struct XYZ memory address into 56-bit field." FORCE)
+if(NOT CUBES_PACK_CUBE_XYZ_ADDR)
+	set(CUBES_PACK_CUBE_XYZ_ADDR ON CACHE BOOL "Pack Cube struct XYZ memory address into 56-bit field.")
 endif()
 
 # Try extract current HEAD commit-id in git
diff --git a/cpp/config.hpp.in b/cpp/config.hpp.in
index b70f440..695addc 100644
--- a/cpp/config.hpp.in
+++ b/cpp/config.hpp.in
@@ -10,7 +10,7 @@
 #define CONFIG_COMPILERID "@CMAKE_CXX_COMPILER_ID@ @CMAKE_CXX_COMPILER_VERSION@"
 
 // Enable Cube struct pointer compaction
-#define CONFIG_PACK_CUBE_ADDR @BUILD_PACK_CUBE_ADDR@
+#cmakedefine01 CUBES_PACK_CUBE_XYZ_ADDR
 
 // Maximum Polycubes N that may be computed
 #define CUBES_MAX_N @BUILD_CUBES_MAX_N@
diff --git a/cpp/include/cube.hpp b/cpp/include/cube.hpp
index e92e570..a13a2e0 100644
--- a/cpp/include/cube.hpp
+++ b/cpp/include/cube.hpp
@@ -3,12 +3,13 @@
 #define OPENCUBES_CUBE_HPP
 
 #include <algorithm>
+#include <cassert>
 #include <cstdint>
 #include <memory>
 #include <unordered_set>
 #include <vector>
-#include <atomic>
 
+#include "config.hpp"
 #include "utils.hpp"
 
 struct XYZ {
@@ -49,44 +50,66 @@ struct Cube {
     // cube memory is stored two ways:
     // normal, new'd buffer: is_shared == false
     // shared, external memory: is_shared == true
-
+#if CUBES_PACK_CUBE_XYZ_ADDR == 1
     struct bits_t {
         uint64_t is_shared : 1;
         uint64_t size : 7;   // MAX 127
         uint64_t addr : 56;  // low 56-bits of memory address.
     };
+    static_assert(sizeof(bits_t) == sizeof(void *));
+#else
+    struct bits_t {
+        uint64_t addr;
+        uint8_t is_shared : 1;
+        uint8_t size : 7;  // MAX 127
+    };
+#endif
     // fields
     bits_t fields;
-
-    static_assert(sizeof(bits_t) == sizeof(void*));
     // extract the pointer from bits_t
     static XYZ *get(bits_t key) {
         // pointer bit-hacking:
         uint64_t addr = key.addr;
+#if CUBES_PACK_CUBE_XYZ_ADDR == 1
+// todo: on x86-64 depending if 5-level-paging is enabled
+// either 47-bit or 56-bit should be replicated to the high
+// part of the address. Don't know how to do this check yet,
+// so the high 8-bits is left zeroed.
+// If we get segfaults dereferencing get(fields)
+// then CUBES_PACK_CUBE_XYZ_ADDR must be disabled.
+#endif
         return reinterpret_cast<XYZ *>(addr);
     }
 
     static bits_t put(bool is_shared, int size, XYZ *addr) {
-        // mask off top byte from the memory address to fit it into bits_t::addr
+#if CUBES_PACK_CUBE_XYZ_ADDR == 1
+        // pack the memory address into 56-bits
         // on x86-64 it is not used by the hardware (yet).
         // This hack actually saves 8 bytes because previously
         // the uint8_t caused padding to 16 bytes.
-        // @note if we get segfaults dereferencing get(fields)
-        // then this is the problem and this hack must be undone.
         uint64_t tmp = reinterpret_cast<uint64_t>((void *)addr);
+        assert((tmp & ~0xffffffffffffff) == 0 && "BUG: CUBES_PACK_CUBE_XYZ_ADDR should be disabled");
         tmp &= 0xffffffffffffff;
         bits_t bits;
         bits.addr = tmp;
         bits.is_shared = is_shared;
         bits.size = size;
         return bits;
+#else
+        bits_t bits;
+        bits.addr = reinterpret_cast<uint64_t>((void *)addr);
+        bits.is_shared = is_shared;
+        bits.size = size;
+        return bits;
+#endif
     }
+
    public:
     // Empty cube
     Cube() : fields{put(0, 0, nullptr)} {}
 
     // Cube with N capacity
-    explicit Cube(uint8_t N) : fields{put(0,N, new XYZ[N])} {}
+    explicit Cube(uint8_t N) : fields{put(0, N, new XYZ[N])} {}
 
     // Construct from pieces
     Cube(std::initializer_list<XYZ> il) : Cube(il.size()) { std::copy(il.begin(), il.end(), begin()); }
@@ -97,7 +120,7 @@ struct Cube {
     // Construct from external source.
     // Cube shares this the memory until modified.
     // Caller guarantees the memory given will live longer than *this
-    Cube(const XYZ *start, uint8_t n) : fields{put(1,n,const_cast<XYZ*>(start))} {}
+    Cube(const XYZ *start, uint8_t n) : fields{put(1, n, const_cast<XYZ *>(start))} {}
 
     // Copy ctor.
     Cube(const Cube &copy) : Cube(copy.size()) { std::copy(copy.begin(), copy.end(), begin()); }
@@ -131,13 +154,9 @@ struct Cube {
 
     size_t size() const { return fields.size; }
 
-    XYZ *data() {
-		return get(fields);
-	}
+    XYZ *data() { return get(fields); }
 
-	const XYZ *data() const {
-		return get(fields);
-	}
+    const XYZ *data() const { return get(fields); }
 
     XYZ *begin() { return data(); }
 
@@ -169,12 +188,15 @@ struct Cube {
     /**
      * Copy cube data into destination buffer.
      */
-    void copyout(int num, XYZ* dest) const {
+    void copyout(int num, XYZ *dest) const {
+        assert(num <= size());
         std::copy_n(begin(), num, dest);
     }
 };
 
+#if CUBES_PACK_CUBE_XYZ_ADDR == 1
 static_assert(sizeof(Cube) == 8, "Unexpected sizeof(Cube) for Cube");
+#endif
 static_assert(std::is_move_assignable_v<Cube>, "Cube must be moveable");
 static_assert(std::is_swappable_v<Cube>, "Cube must swappable");
 

From 37536552066612544e6c000bdf39188cf3a66b70 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Tue, 15 Aug 2023 23:48:23 +0300
Subject: [PATCH 32/42] Hashy refactor

The Hashy code is somewhat tangled, and there is a known possible
data race in `Hashy::insert()`.
This issue cannot be permanently fixed without hiding `Hashy::byshape`
under protected/private and preventing direct access to the member.

Replacements to the direct member access will come in later changes.

- Move Subhashy and Subsubhashy out from Hashy class.

Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/include/hashes.hpp | 81 +++++++++++++++++++++---------------------
 1 file changed, 41 insertions(+), 40 deletions(-)

diff --git a/cpp/include/hashes.hpp b/cpp/include/hashes.hpp
index 49462d2..8ed5c04 100644
--- a/cpp/include/hashes.hpp
+++ b/cpp/include/hashes.hpp
@@ -25,54 +25,55 @@ struct HashCube {
 
 using CubeSet = std::unordered_set<Cube, HashCube, std::equal_to<Cube>>;
 
-struct Hashy {
-    struct Subsubhashy {
-        CubeSet set;
-        mutable std::shared_mutex set_mutex;
+struct Subsubhashy {
+    CubeSet set;
+    mutable std::shared_mutex set_mutex;
 
-        template <typename CubeT>
-        void insert(CubeT &&c) {
-            std::lock_guard lock(set_mutex);
-            set.emplace(std::forward<CubeT>(c));
-        }
+    template <typename CubeT>
+    void insert(CubeT &&c) {
+        std::lock_guard lock(set_mutex);
+        set.emplace(std::forward<CubeT>(c));
+    }
 
-        bool contains(const Cube &c) const {
-            std::shared_lock lock(set_mutex);
-            auto itr = set.find(c);
-            if(itr != set.end()) {
-				return true;
-			}
-            return false;
+    bool contains(const Cube &c) const {
+        std::shared_lock lock(set_mutex);
+        auto itr = set.find(c);
+        if (itr != set.end()) {
+            return true;
         }
+        return false;
+    }
 
-        auto size() const {
-            std::shared_lock lock(set_mutex);
-            return set.size();
-        }
-    };
-    template <int NUM>
-    struct Subhashy {
-        std::array<Subsubhashy, NUM> byhash;
+    auto size() const {
+        std::shared_lock lock(set_mutex);
+        return set.size();
+    }
+};
 
-        template <typename CubeT>
-        void insert(CubeT &&c) {
-            HashCube hash;
-            auto idx = hash(c) % NUM;
-            auto &set = byhash[idx];
-            if (!set.contains(c)) set.insert(std::forward<CubeT>(c));
-            // printf("new size %ld\n\r", byshape[shape].size());
-        }
+template <int NUM>
+struct Subhashy {
+    std::array<Subsubhashy, NUM> byhash;
 
-        auto size() const {
-            size_t sum = 0;
-            for (auto &set : byhash) {
-                auto part = set.size();
-                sum += part;
-            }
-            return sum;
+    template <typename CubeT>
+    void insert(CubeT &&c) {
+        HashCube hash;
+        auto idx = hash(c) % NUM;
+        auto &set = byhash[idx];
+        if (!set.contains(c)) set.insert(std::forward<CubeT>(c));
+        // printf("new size %ld\n\r", byshape[shape].size());
+    }
+
+    auto size() const {
+        size_t sum = 0;
+        for (auto &set : byhash) {
+            auto part = set.size();
+            sum += part;
         }
-    };
+        return sum;
+    }
+};
 
+struct Hashy {
     std::map<XYZ, Subhashy<32>> byshape;
 
     static std::vector<XYZ> generateShapes(int n) {

From cda5b3a2e69f10dc0b0b77f65e2607c455e07670 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Wed, 16 Aug 2023 00:03:58 +0300
Subject: [PATCH 33/42] Hashy refactor: SubsubHashy

- Make Subsubhashy a class to note its members aren't directly accessible.
- Hide members under protected
- Discover class users and fix them.
  Mainly iterating the SubsubHashy.

Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/include/hashes.hpp   | 15 ++++++++++++++-
 cpp/include/newCache.hpp |  2 +-
 cpp/src/cubes.cpp        |  3 +--
 cpp/src/newCache.cpp     |  8 ++++----
 4 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/cpp/include/hashes.hpp b/cpp/include/hashes.hpp
index 8ed5c04..6c543cb 100644
--- a/cpp/include/hashes.hpp
+++ b/cpp/include/hashes.hpp
@@ -25,10 +25,12 @@ struct HashCube {
 
 using CubeSet = std::unordered_set<Cube, HashCube, std::equal_to<Cube>>;
 
-struct Subsubhashy {
+class Subsubhashy {
+   protected:
     CubeSet set;
     mutable std::shared_mutex set_mutex;
 
+   public:
     template <typename CubeT>
     void insert(CubeT &&c) {
         std::lock_guard lock(set_mutex);
@@ -48,6 +50,17 @@ struct Subsubhashy {
         std::shared_lock lock(set_mutex);
         return set.size();
     }
+
+    void clear() {
+        std::lock_guard lock(set_mutex);
+        set.clear();
+        set.reserve(1);
+    }
+
+    auto begin() const { return set.begin(); }
+    auto end() const { return set.end(); }
+    auto begin() { return set.begin(); }
+    auto end() { return set.end(); }
 };
 
 template <int NUM>
diff --git a/cpp/include/newCache.hpp b/cpp/include/newCache.hpp
index c24a06b..453bb9e 100644
--- a/cpp/include/newCache.hpp
+++ b/cpp/include/newCache.hpp
@@ -167,7 +167,7 @@ class FlatCache : public ICache {
         for (auto& [shape, set] : hashes.byshape) {
             auto begin = allXYZs.data() + allXYZs.size();
             for (auto& subset : set.byhash) {
-                for (auto& cube : subset.set)
+                for (auto& cube : subset)
                     // allXYZs.emplace_back(allXYZs.end(), subset.set.begin(), subset.set.end());
                     std::copy(cube.begin(), cube.end(), std::back_inserter(allXYZs));
             }
diff --git a/cpp/src/cubes.cpp b/cpp/src/cubes.cpp
index 1630bb6..737e398 100644
--- a/cpp/src/cubes.cpp
+++ b/cpp/src/cubes.cpp
@@ -314,8 +314,7 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
         }
         if (split_cache) {
             for (auto &subset : hashes.byshape[targetShape].byhash) {
-                subset.set.clear();
-                subset.set.reserve(1);
+                subset.clear();
             }
         }
     }
diff --git a/cpp/src/newCache.cpp b/cpp/src/newCache.cpp
index 93f15b3..26b0514 100644
--- a/cpp/src/newCache.cpp
+++ b/cpp/src/newCache.cpp
@@ -214,9 +214,9 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
     auto time_start = std::chrono::steady_clock::now();
     for (auto &key : keys) {
         for (auto &subset : hashes.byshape[key].byhash) {
-            auto itr = subset.set.begin();
+            auto itr = subset.begin();
 
-            ptrdiff_t dist = subset.set.size();
+            ptrdiff_t dist = subset.size();
             // distribute if range is large enough.
             auto skip = std::max(4096L, std::max(1L, dist / (signed)m_flushers.size()));
             while (dist > skip) {
@@ -226,7 +226,7 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
                 auto inc = std::min(dist, skip);
                 std::advance(itr, inc);
                 put += n * inc;
-                dist = std::distance(itr, subset.set.end());
+                dist = std::distance(itr, subset.end());
 
                 auto done = 100.0f * (std::distance(xyz->get(), put) / float(num_cubes * n));
                 std::printf("writing data %5.2f%% ...  \r", done);
@@ -240,7 +240,7 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
             // copy remainder, if any.
             if (dist) {
                 std::lock_guard lock(m_mtx);
-                m_copy.emplace_back(std::bind(copyrange, itr, subset.set.end(), put));
+                m_copy.emplace_back(std::bind(copyrange, itr, subset.end(), put));
                 ++m_num_copys;
                 m_run.notify_all();
                 put += n * dist;

From e57e436ea01c53118b0ec582cf7596406c9ec199 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Wed, 16 Aug 2023 00:10:39 +0300
Subject: [PATCH 34/42] Hashy refactor: SubHashy

- Make Subhashy a class to note its members aren't directly accessible.
- Hide members under protected
- Discover class users and fix them.

Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/include/hashes.hpp   | 10 ++++++++--
 cpp/include/newCache.hpp |  2 +-
 cpp/src/cubes.cpp        |  2 +-
 cpp/src/newCache.cpp     |  2 +-
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/cpp/include/hashes.hpp b/cpp/include/hashes.hpp
index 6c543cb..a0e9d4c 100644
--- a/cpp/include/hashes.hpp
+++ b/cpp/include/hashes.hpp
@@ -64,9 +64,10 @@ class Subsubhashy {
 };
 
 template <int NUM>
-struct Subhashy {
+class Subhashy {
+   protected:
     std::array<Subsubhashy, NUM> byhash;
-
+   public:
     template <typename CubeT>
     void insert(CubeT &&c) {
         HashCube hash;
@@ -84,6 +85,11 @@ struct Subhashy {
         }
         return sum;
     }
+
+    auto begin() const { return byhash.begin(); }
+    auto end() const { return byhash.end(); }
+    auto begin() { return byhash.begin(); }
+    auto end() { return byhash.end(); }
 };
 
 struct Hashy {
diff --git a/cpp/include/newCache.hpp b/cpp/include/newCache.hpp
index 453bb9e..1a0a8e8 100644
--- a/cpp/include/newCache.hpp
+++ b/cpp/include/newCache.hpp
@@ -166,7 +166,7 @@ class FlatCache : public ICache {
         // std::printf("Flatcache %d %p %p\n", n, (void*)allXYZs.data(), (void*)shapes.data());
         for (auto& [shape, set] : hashes.byshape) {
             auto begin = allXYZs.data() + allXYZs.size();
-            for (auto& subset : set.byhash) {
+            for (auto& subset : set) {
                 for (auto& cube : subset)
                     // allXYZs.emplace_back(allXYZs.end(), subset.set.begin(), subset.set.end());
                     std::copy(cube.begin(), cube.end(), std::back_inserter(allXYZs));
diff --git a/cpp/src/cubes.cpp b/cpp/src/cubes.cpp
index 737e398..b8b0779 100644
--- a/cpp/src/cubes.cpp
+++ b/cpp/src/cubes.cpp
@@ -313,7 +313,7 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
                         hashes, n);
         }
         if (split_cache) {
-            for (auto &subset : hashes.byshape[targetShape].byhash) {
+            for (auto &subset : hashes.byshape[targetShape]) {
                 subset.clear();
             }
         }
diff --git a/cpp/src/newCache.cpp b/cpp/src/newCache.cpp
index 26b0514..fff4e8d 100644
--- a/cpp/src/newCache.cpp
+++ b/cpp/src/newCache.cpp
@@ -213,7 +213,7 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
 
     auto time_start = std::chrono::steady_clock::now();
     for (auto &key : keys) {
-        for (auto &subset : hashes.byshape[key].byhash) {
+        for (auto &subset : hashes.byshape[key]) {
             auto itr = subset.begin();
 
             ptrdiff_t dist = subset.size();

From 67ae09a13288f712bc20455f5423e644dcac6964 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Wed, 16 Aug 2023 00:32:59 +0300
Subject: [PATCH 35/42] Hashy refactor: Hashy class

- Finally fix the potential data-race in Hashy::insert():
  insert() uses the at() to lookup/create the shape and it is thread-safe.
- Make Hashy a class to note its members aren't directly accessible.
- Hide members under protected
- Discover class users and fix them.
- Added begin(), end(), numShapes() and at() replacing direct member access.

Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/include/hashes.hpp   | 34 +++++++++++++++++++++++++++++++---
 cpp/include/newCache.hpp |  4 ++--
 cpp/src/cubes.cpp        | 10 +++++-----
 cpp/src/newCache.cpp     |  8 ++++----
 4 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/cpp/include/hashes.hpp b/cpp/include/hashes.hpp
index a0e9d4c..7234bd3 100644
--- a/cpp/include/hashes.hpp
+++ b/cpp/include/hashes.hpp
@@ -67,6 +67,7 @@ template <int NUM>
 class Subhashy {
    protected:
     std::array<Subsubhashy, NUM> byhash;
+
    public:
     template <typename CubeT>
     void insert(CubeT &&c) {
@@ -92,9 +93,12 @@ class Subhashy {
     auto end() { return byhash.end(); }
 };
 
-struct Hashy {
+class Hashy {
+   protected:
     std::map<XYZ, Subhashy<32>> byshape;
+    mutable std::shared_mutex set_mutex;
 
+   public:
     static std::vector<XYZ> generateShapes(int n) {
         std::vector<XYZ> out;
         for (int x = 0; x < n; ++x)
@@ -109,17 +113,31 @@ struct Hashy {
 
     void init(int n) {
         // create all subhashy which will be needed for N
+        std::lock_guard lock(set_mutex);
         for (auto s : generateShapes(n)) byshape[s].size();
         std::printf("%ld sets by shape for N=%d\n\r", byshape.size(), n);
     }
 
+    Subhashy<32> &at(XYZ shape) {
+        std::shared_lock lock(set_mutex);
+        auto itr = byshape.find(shape);
+        if (itr != byshape.end()) {
+            return itr->second;
+        }
+        lock.unlock();
+        // Not sure if this is supposed to happen normally
+        // if init() creates all subhashys required.
+        std::lock_guard elock(set_mutex);
+        return byshape[shape];
+    }
+
     template <typename CubeT>
     void insert(CubeT &&c, XYZ shape) {
-        auto &set = byshape[shape];
-        set.insert(std::forward<CubeT>(c));
+        at(shape).insert(std::forward<CubeT>(c));
     }
 
     auto size() const {
+        std::shared_lock lock(set_mutex);
         size_t sum = 0;
         DEBUG1_PRINTF("%ld maps by shape\n\r", byshape.size());
         for (auto &set : byshape) {
@@ -129,5 +147,15 @@ struct Hashy {
         }
         return sum;
     }
+
+    int numShapes() const {
+        std::shared_lock lock(set_mutex);
+        return byshape.size();
+    }
+
+    auto begin() const { return byshape.begin(); }
+    auto end() const { return byshape.end(); }
+    auto begin() { return byshape.begin(); }
+    auto end() { return byshape.end(); }
 };
 #endif
diff --git a/cpp/include/newCache.hpp b/cpp/include/newCache.hpp
index 1a0a8e8..1d62940 100644
--- a/cpp/include/newCache.hpp
+++ b/cpp/include/newCache.hpp
@@ -162,9 +162,9 @@ class FlatCache : public ICache {
     FlatCache() {}
     FlatCache(Hashy& hashes, uint8_t n) : n(n) {
         allXYZs.reserve(hashes.size() * n);
-        shapes.reserve(hashes.byshape.size());
+        shapes.reserve(hashes.numShapes());
         // std::printf("Flatcache %d %p %p\n", n, (void*)allXYZs.data(), (void*)shapes.data());
-        for (auto& [shape, set] : hashes.byshape) {
+        for (auto& [shape, set] : hashes) {
             auto begin = allXYZs.data() + allXYZs.size();
             for (auto& subset : set) {
                 for (auto& cube : subset)
diff --git a/cpp/src/cubes.cpp b/cpp/src/cubes.cpp
index b8b0779..6b60085 100644
--- a/cpp/src/cubes.cpp
+++ b/cpp/src/cubes.cpp
@@ -246,11 +246,11 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
 
     uint64_t totalSum = 0;
     auto start = std::chrono::steady_clock::now();
-    uint32_t totalOutputShapes = hashes.byshape.size();
+    uint32_t totalOutputShapes = hashes.numShapes();
     uint32_t outShapeCount = 0;
 
     auto prevShapes = Hashy::generateShapes(n - 1);
-    for (auto &tup : hashes.byshape) {
+    for (auto &tup : hashes) {
         outShapeCount++;
         XYZ targetShape = tup.first;
         std::printf("process output shape %3d/%d [%2d %2d %2d]\n\r", outShapeCount, totalOutputShapes, targetShape.x(), targetShape.y(), targetShape.z());
@@ -305,15 +305,15 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
         for (auto& thr : workers) {
             thr.sync();
         }
-        std::printf("  num: %lu\n\r", hashes.byshape[targetShape].size());
-        totalSum += hashes.byshape[targetShape].size();
+        std::printf("  num: %lu\n\r", hashes.at(targetShape).size());
+        totalSum += hashes.at(targetShape).size();
         if (write_cache && split_cache) {
             cw.save(base_path + "cubes_" + std::to_string(n) + "_" + std::to_string(targetShape.x()) + "-" + std::to_string(targetShape.y()) + "-" +
                             std::to_string(targetShape.z()) + ".bin",
                         hashes, n);
         }
         if (split_cache) {
-            for (auto &subset : hashes.byshape[targetShape]) {
+            for (auto &subset : hashes.at(targetShape)) {
                 subset.clear();
             }
         }
diff --git a/cpp/src/newCache.cpp b/cpp/src/newCache.cpp
index fff4e8d..d54b057 100644
--- a/cpp/src/newCache.cpp
+++ b/cpp/src/newCache.cpp
@@ -169,12 +169,12 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
     auto header = std::make_shared<struct_region<Header>>(file_, 0);
     (*header)->magic = cacheformat::MAGIC;
     (*header)->n = n;
-    (*header)->numShapes = hashes.byshape.size();
+    (*header)->numShapes = hashes.numShapes();
     (*header)->numPolycubes = hashes.size();
 
     std::vector<XYZ> keys;
     keys.reserve((*header)->numShapes);
-    for (auto &pair : hashes.byshape) keys.push_back(pair.first);
+    for (auto &pair : hashes) keys.push_back(pair.first);
     std::sort(keys.begin(), keys.end());
 
     auto shapeEntry = std::make_shared<array_region<ShapeEntry>>(file_, header->getEndSeek(), (*header)->numShapes);
@@ -189,7 +189,7 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
         se.dim2 = key.z();
         se.reserved = 0;
         se.offset = offset;
-        auto count = hashes.byshape[key].size();
+        auto count = hashes.at(key).size();
         num_cubes += count;
         se.size = count * XYZ_SIZE * n;
         offset += se.size;
@@ -213,7 +213,7 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
 
     auto time_start = std::chrono::steady_clock::now();
     for (auto &key : keys) {
-        for (auto &subset : hashes.byshape[key]) {
+        for (auto &subset : hashes.at(key)) {
             auto itr = subset.begin();
 
             ptrdiff_t dist = subset.size();

From b79f161bab09fb6e74f964c537d69d9742b4a4dc Mon Sep 17 00:00:00 2001
From: JATothrim <jarmo.tiitto@gmail.com>
Date: Sat, 19 Aug 2023 14:23:52 +0300
Subject: [PATCH 36/42] libmappedfile: Provide standalone I/O operations for
 file

Implement a few basic operations in mapped::file so that mapped::region
is not needed for these:
- readAt() and writeAt()
- copyAt() is the most interesting because the data copy is
  done by the operating system.

Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/libraries/mapped_file.cpp | 40 +++++++++++++++++++++++++++++++++++
 cpp/libraries/mapped_file.hpp | 23 ++++++++++++++++++++
 2 files changed, 63 insertions(+)

diff --git a/cpp/libraries/mapped_file.cpp b/cpp/libraries/mapped_file.cpp
index e7261dc..f0e4b0f 100644
--- a/cpp/libraries/mapped_file.cpp
+++ b/cpp/libraries/mapped_file.cpp
@@ -148,6 +148,46 @@ int file::truncate(seekoff_t newsize) {
     return 0;
 }
 
+int file::readAt(seekoff_t fpos, len_t size, void* dataout) const
+{
+    ssize_t rd = pread(fd, dataout, size, fpos);
+    if (rd != (signed)size) {
+        std::fprintf(stderr, "Error reading data from file:%s\n", std::strerror(errno));
+        return -1;
+    }
+    return 0;
+}
+
+int file::writeAt(seekoff_t fpos, len_t size, const void* datain)
+{
+    std::lock_guard lock(mut);
+
+    ssize_t rd = pwrite(fd, datain, size, fpos);
+    if (rd != (signed)size) {
+        std::fprintf(stderr, "Error writing data into file:%s\n", std::strerror(errno));
+        return -1;
+    }
+
+    fd_size = std::max(fd_size, fpos+size);
+    return 0;
+}
+
+int file::copyAt(std::shared_ptr<file> other, seekoff_t other_fpos, len_t size, seekoff_t dest_fpos)
+{
+    off64_t srcp = other_fpos;
+    off64_t dstp = dest_fpos;
+    ssize_t cpy = ::copy_file_range(other->fd, &srcp, fd, &dstp, size, 0);
+    if (cpy != (signed)size) {
+        std::fprintf(stderr, "Error copying file data:%s\n", std::strerror(errno));
+        return -1;
+    }
+
+    std::lock_guard lock(mut);
+    fd_size = std::max(fd_size, dest_fpos+size);
+    return 0;
+}
+
+
 /**
  * Mapped region POSIX/Linux compatible implementation.
  */
diff --git a/cpp/libraries/mapped_file.hpp b/cpp/libraries/mapped_file.hpp
index b86657a..ce3c2e3 100644
--- a/cpp/libraries/mapped_file.hpp
+++ b/cpp/libraries/mapped_file.hpp
@@ -538,6 +538,29 @@ class file : public std::enable_shared_from_this<file> {
      */
     int truncate(seekoff_t newsize);
 
+    /**
+     * Read @size bytes starting at file offset @fpos
+     * @note copies file bytes [fpos, fpos+size) into [dataout, dataout+size)
+     * @return non-zero if error occurred.
+     */
+    int readAt(seekoff_t fpos, len_t size, void* dataout) const;
+
+    /**
+     * Write @size bytes starting at file offset @fpos
+     * @note copies [datain, datain+size) into file bytes [fpos, fpos+size)
+     * @note the file size after writeAt() is std::max(size(), fpos+size)
+     * @return non-zero if error occurred.
+     */
+    int writeAt(seekoff_t fpos, len_t size, const void* datain);
+
+    /**
+     * Copy @size bytes starting at file offset @other_fpos
+     * of the @other file to offset @dest_fpos in this file.
+     * @note copies from other:[other_fpos, other_fpos+size) into this:[dest_fpos, dest_fpos+size)
+     * @note if other is same as *this the destination range cannot overlap with the source range.
+     */
+    int copyAt(std::shared_ptr<file> other, seekoff_t other_fpos, len_t size, seekoff_t dest_fpos);
+
     /**
      * Current length of the file
      * The file EOF (end-of-file) is at this position.

From 64278c8e1bfd81cca15d2d21a2613fee38947670 Mon Sep 17 00:00:00 2001
From: Jarmo Tiitto <jarmo.tiitto@gmail.com>
Date: Wed, 16 Aug 2023 22:20:25 +0300
Subject: [PATCH 37/42] Hashy CubeSwapper

Implement a way to temporarily dump the cube data to disk storage
in order to save system memory.

For `./cubes -n 13 -w -s -u` run

heaptrack tool reports:
- total runtime: 26min 18s
- peak RSS: 2.4 Gb
- peak heap memory: 978 Mb

This confirms that only the std::unordered_set<> internal
nodes (and the lookup array) are kept in memory.
Slow down is expected as accessing an element reads it from the disk.

The swap files are named as `storage_<number>.bin` in the cache folder.
These files are normally deleted as soon as they are no longer needed.

Important!!
The process can open so many files simultaneously
that the system NOFILE limit is reached.
This limit should be raised with `ulimit -n 128000` to avoid terminating
the program. The number of open file handles required is at least:
<maximum number of shapes for N> * 32

- CubeSwapSet is a specialized std::unordered_set<> that stores the cube data in a file.
- CubeStorage acts as a pseudo-allocator for the cube data.
- CubePtr is the key type inserted into CubeSwapSet.
  It is only a 64-bit offset into the backing file, and
  each CubePtr is owned by the CubeStorage that created it.
- CubePtr::get(const CubeStorage&) reads out the Cube from the storage.
  Hashy users are adapted to use it where needed.
- Clearing Hashy is now quite fast because there is no memory to be
  freed for CubePtrs. SubsubHashy::clear() simply deletes the data
  and the backing file.
- Compiling in C++20 mode enables a speed-up by allowing
  SubsubHashy::contains() to work with both Cube and CubePtr types.

Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/CMakeLists.txt          |   1 +
 cpp/include/cubeSwapSet.hpp | 178 ++++++++++++++++++++++++++++++++++++
 cpp/include/hashes.hpp      |  86 +++++++++++++----
 cpp/include/newCache.hpp    |   5 +-
 cpp/src/cubeSwapSet.cpp     | 121 ++++++++++++++++++++++++
 cpp/src/cubes.cpp           |  45 ++++-----
 cpp/src/newCache.cpp        |  11 ++-
 7 files changed, 393 insertions(+), 54 deletions(-)
 create mode 100644 cpp/include/cubeSwapSet.hpp
 create mode 100644 cpp/src/cubeSwapSet.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 68344fe..ad05812 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -71,6 +71,7 @@ add_library(CubeObjs OBJECT
 	"src/cubes.cpp"
 	"src/rotations.cpp"
 	"src/newCache.cpp"
+	"src/cubeSwapSet.cpp"
 )
 ConfigureTarget(CubeObjs)
 
diff --git a/cpp/include/cubeSwapSet.hpp b/cpp/include/cubeSwapSet.hpp
new file mode 100644
index 0000000..b7a2e5f
--- /dev/null
+++ b/cpp/include/cubeSwapSet.hpp
@@ -0,0 +1,178 @@
+#pragma once
+#ifndef OPENCUBES_CUBE_DISKSWAP_SET_HPP
+#define OPENCUBES_CUBE_DISKSWAP_SET_HPP
+
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <filesystem>
+
+#include "cube.hpp"
+#include "mapped_file.hpp"
+
+/**
+ * Implement std::unordered_set<> that stores element data in a file.
+ *
+ * Cubes stored with size N in the set have constant cost of RAM memory:
+ * Only the std::unordered_set<> itself and its internal nodes are stored in RAM.
+ * The element *data* (i.e. XYZ data) is stored in the file.
+ * The performance cost is that each time the element is accessed
+ * the data has to be read back from the file.
+ * (Iterating the entire CubeSwapSet involves reading the entire backing file)
+ *
+ * Clearing the CubeSwapSet does not release the backing file space managed by CubeStorage.
+ * Call to CubeStorage::discard() is required after clearing or destructing
+ * the CubeSwapSet instance to cleanup the file.
+ * Elements cannot be removed one-by-one.
+ */
+class CubeStorage;
+
+/**
+ * Overlay that reads the cube data from the backing file.
+ * CubePtr needs its associated CubeStorage instance to be able to
+ * access its contents with CubePtr::get()
+ * The associated CubeStorage owning the CubePtr
+ * should always be available where CubePtr is used.
+ */
+class CubePtr {
+   protected:
+    mapped::seekoff_t m_seek = 0;
+
+   public:
+    explicit CubePtr(mapped::seekoff_t offset) : m_seek(offset) {}
+    CubePtr(const CubePtr& c) : m_seek(c.m_seek) {}
+
+    /**
+     * Get the Cube pointed by this instance.
+     */
+    Cube get(const CubeStorage& storage) const;
+
+    template <typename Itr>
+    void copyout(const CubeStorage& storage, size_t n, Itr out) const {
+        auto tmp = get(storage);
+        std::copy_n(tmp.begin(), n, out);
+    }
+
+    mapped::seekoff_t seek() const { return m_seek; }
+};
+
+/**
+ * Stateful comparator for CubePtr
+ */
+class CubePtrEqual {
+   protected:
+    const CubeStorage* m_storage = nullptr;
+   public:
+	// C++20 feature:
+    using is_transparent = void;
+
+    CubePtrEqual(const CubeStorage* ctx) : m_storage(ctx) {}
+    CubePtrEqual(const CubePtrEqual& ctx) : m_storage(ctx.m_storage) {}
+
+    bool operator()(const CubePtr& a, const CubePtr& b) const { return a.get(*m_storage) == b.get(*m_storage); }
+
+    bool operator()(const Cube& a, const CubePtr& b) const { return a == b.get(*m_storage); }
+
+    bool operator()(const CubePtr& a, const Cube& b) const { return a.get(*m_storage) == b; }
+};
+
+class CubePtrHash {
+   protected:
+    const CubeStorage* m_storage = nullptr;
+   public:
+	// C++20 feature:
+    using is_transparent = void;
+    using transparent_key_equal = CubePtrEqual;
+
+    CubePtrHash(const CubeStorage* ctx) : m_storage(ctx) {}
+    CubePtrHash(const CubePtrHash& ctx) : m_storage(ctx.m_storage) {}
+
+    size_t operator()(const Cube& x) const {
+        std::size_t seed = x.size();
+        for (auto& p : x) {
+            auto x = HashXYZ()(p);
+            seed ^= x + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+        }
+        return seed;
+    }
+
+    size_t operator()(const CubePtr& x) const {
+        auto cube = x.get(*m_storage);
+        std::size_t seed = cube.size();
+        for (auto& p : cube) {
+            auto x = HashXYZ()(p);
+            seed ^= x + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+        }
+        return seed;
+    }
+};
+
+class CubeStorage {
+   protected:
+    std::mutex m_mtx;
+    std::filesystem::path m_fpath;
+    std::shared_ptr<mapped::file> m_file;
+    std::unique_ptr<mapped::region> m_map;
+
+    static std::atomic<int> m_init_num;
+    const size_t m_cube_size;
+    mapped::seekoff_t m_prev_seek = 0;
+    mapped::seekoff_t m_alloc_seek = 0;
+
+   public:
+    /**
+     * Initialize Cube file storage
+     * @param path directory in which to store the backing file.
+     * @param n The storage is reserved in n sized chunks.
+     *   This should be equal to Cube::size() that are passed into allocate()
+     *   as no other allocation size is supported.
+     * @note the backing file creation is delayed until allocate() is called first time.
+     */
+    CubeStorage(std::filesystem::path path, size_t n);
+    ~CubeStorage();
+
+    // not copyable
+    CubeStorage(const CubeStorage&) = delete;
+    CubeStorage& operator=(const CubeStorage&) = delete;
+    // move constructible: but only if no allocations exists
+    CubeStorage(CubeStorage&& mv);
+    CubeStorage& operator=(CubeStorage&& mv) = delete;
+
+    size_t cubeSize() const { return m_cube_size; }
+
+    /**
+     * Store Cube data into the backing file.
+     * Returns CubePtr that can be inserted into CubeSwapSet.
+     * @note cube.size() must be equal to this->cubeSize()
+     */
+    CubePtr allocate(const Cube& cube);
+
+    /**
+     * Revert the effect of last allocate()
+     */
+    void cancel_allocation();
+
+    /**
+     * Retrieve the cube data from the backing file.
+     */
+    Cube read(const CubePtr& x) const;
+
+    /**
+     * Drop all stored data.
+     * Shrinks the backing file to zero size and deletes it.
+     */
+    void discard();
+};
+
+/**
+ * CubeStorage enabled std::unordered_set<>
+ *
+ * The CubeSwapSet must be constructed with already initialized
+ * stateful instances of CubePtrEqual and CubePtrHash functors
+ * that resolve the CubePtr instance using the CubeStorage instance.
+ */
+using CubeSwapSet = std::unordered_set<CubePtr, CubePtrHash, CubePtrEqual>;
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/hashes.hpp b/cpp/include/hashes.hpp
index 7234bd3..79bedfb 100644
--- a/cpp/include/hashes.hpp
+++ b/cpp/include/hashes.hpp
@@ -3,12 +3,15 @@
 #define OPENCUBES_HASHES_HPP
 #include <array>
 #include <cstdio>
+#include <deque>
+#include <filesystem>
 #include <map>
 #include <shared_mutex>
 #include <unordered_set>
 #include <vector>
 
 #include "cube.hpp"
+#include "cubeSwapSet.hpp"
 #include "utils.hpp"
 
 struct HashCube {
@@ -27,24 +30,30 @@ using CubeSet = std::unordered_set<Cube, HashCube, std::equal_to<Cube>>;
 
 class Subsubhashy {
    protected:
-    CubeSet set;
+    CubeStorage set_storage;
+    CubeSwapSet set;
     mutable std::shared_mutex set_mutex;
 
    public:
+    explicit Subsubhashy(std::filesystem::path path, size_t n) : set_storage(path, n), set(1, CubePtrHash(&set_storage), CubePtrEqual(&set_storage)) {}
+
     template <typename CubeT>
     void insert(CubeT &&c) {
         std::lock_guard lock(set_mutex);
-        set.emplace(std::forward<CubeT>(c));
+        auto [itr, isnew] = set.emplace(set_storage.allocate(std::forward<CubeT>(c)));
+        if (!isnew) {
+            set_storage.cancel_allocation();
+        }
     }
 
+#if __cplusplus > 201703L
+// todo: need C++17 equivalent for *generic*
+// contains() or find() that accepts both Cube and CubePtr types
     bool contains(const Cube &c) const {
         std::shared_lock lock(set_mutex);
-        auto itr = set.find(c);
-        if (itr != set.end()) {
-            return true;
-        }
-        return false;
+        return set.contains<Cube>(c);
     }
+#endif
 
     auto size() const {
         std::shared_lock lock(set_mutex);
@@ -57,27 +66,45 @@ class Subsubhashy {
         set.reserve(1);
     }
 
+    // Get CubeStorage instance.
+    // [this->begin(), this->end()] iterated CubePtr's
+    // Can be resolved with CubePtr::get(this->storage())
+    // that returns copy of the data as Cube.
+    const CubeStorage &storage() const { return set_storage; }
+
     auto begin() const { return set.begin(); }
     auto end() const { return set.end(); }
     auto begin() { return set.begin(); }
     auto end() { return set.end(); }
 };
 
-template <int NUM>
 class Subhashy {
    protected:
-    std::array<Subsubhashy, NUM> byhash;
+    std::deque<Subsubhashy> byhash;
 
    public:
+    Subhashy(int NUM, size_t N, std::filesystem::path path) {
+        for (int i = 0; i < NUM; ++i) {
+            byhash.emplace_back(path, N);
+        }
+    }
+
     template <typename CubeT>
     void insert(CubeT &&c) {
         HashCube hash;
-        auto idx = hash(c) % NUM;
+        auto idx = hash(c) % byhash.size();
         auto &set = byhash[idx];
-        if (!set.contains(c)) set.insert(std::forward<CubeT>(c));
+#if __cplusplus > 201703L
+        if (set.contains(c)) return;
+#endif
+        set.insert(std::forward<CubeT>(c));
         // printf("new size %ld\n\r", byshape[shape].size());
     }
 
+    void clear() {
+        for (auto &set : byhash) set.clear();
+    }
+
     auto size() const {
         size_t sum = 0;
         for (auto &set : byhash) {
@@ -95,7 +122,9 @@ class Subhashy {
 
 class Hashy {
    protected:
-    std::map<XYZ, Subhashy<32>> byshape;
+    std::map<XYZ, Subhashy> byshape;
+    std::filesystem::path base_path;
+    int N;
     mutable std::shared_mutex set_mutex;
 
    public:
@@ -111,24 +140,41 @@ class Hashy {
         return out;
     }
 
+    explicit Hashy(std::string path = ".") : base_path(path) {}
+
     void init(int n) {
         // create all subhashy which will be needed for N
-        std::lock_guard lock(set_mutex);
-        for (auto s : generateShapes(n)) byshape[s].size();
+        N = n;
+        for (auto s : generateShapes(n)) {
+            initSubHashy(n, s);
+        }
         std::printf("%ld sets by shape for N=%d\n\r", byshape.size(), n);
     }
 
-    Subhashy<32> &at(XYZ shape) {
+    Subhashy &initSubHashy(int n, XYZ s) {
+        assert(N == n);
+
+        auto itr = byshape.find(s);
+        if (itr == byshape.end()) {
+            auto [itr, isnew] = byshape.emplace(s, Subhashy(32, n, base_path));
+            assert(isnew);
+            itr->second.size();
+            return itr->second;
+        } else {
+            return itr->second;
+        }
+    }
+
+    Subhashy &at(XYZ shape) {
         std::shared_lock lock(set_mutex);
         auto itr = byshape.find(shape);
         if (itr != byshape.end()) {
             return itr->second;
         }
-        lock.unlock();
-        // Not sure if this is supposed to happen normally
-        // if init() creates all subhashys required.
-        std::lock_guard elock(set_mutex);
-        return byshape[shape];
+        // should never get here...
+        std::printf("BUG: missing shape [%2d %2d %2d]:\n\r", shape.x(), shape.y(), shape.z());
+        std::abort();
+        return *((Subhashy *)0);
     }
 
     template <typename CubeT>
diff --git a/cpp/include/newCache.hpp b/cpp/include/newCache.hpp
index 1d62940..b9705ce 100644
--- a/cpp/include/newCache.hpp
+++ b/cpp/include/newCache.hpp
@@ -167,9 +167,8 @@ class FlatCache : public ICache {
         for (auto& [shape, set] : hashes) {
             auto begin = allXYZs.data() + allXYZs.size();
             for (auto& subset : set) {
-                for (auto& cube : subset)
-                    // allXYZs.emplace_back(allXYZs.end(), subset.set.begin(), subset.set.end());
-                    std::copy(cube.begin(), cube.end(), std::back_inserter(allXYZs));
+                for (auto& cubeptr : subset)
+                    cubeptr.copyout(subset.storage(), n, std::back_inserter(allXYZs));
             }
             auto end = allXYZs.data() + allXYZs.size();
             // std::printf("  SR %p %p\n", (void*)begin, (void*)end);
diff --git a/cpp/src/cubeSwapSet.cpp b/cpp/src/cubeSwapSet.cpp
new file mode 100644
index 0000000..1f391ff
--- /dev/null
+++ b/cpp/src/cubeSwapSet.cpp
@@ -0,0 +1,121 @@
+#include "cubeSwapSet.hpp"
+
+#include <filesystem>
+
+std::atomic<int> CubeStorage::m_init_num(0);
+
+CubeStorage::CubeStorage(std::filesystem::path path, size_t n) : m_cube_size(n) {
+    // Generate file name:
+    m_fpath = path / ("storage_" + std::to_string(m_init_num.fetch_add(1)) + ".bin");
+}
+
+CubeStorage::~CubeStorage() { discard(); }
+
+CubeStorage::CubeStorage(CubeStorage&& mv)
+    : m_fpath(std::move(mv.m_fpath)), m_file(std::move(mv.m_file)), m_map(std::move(mv.m_map)), m_cube_size(mv.m_cube_size), m_alloc_seek(mv.m_alloc_seek) {
+    // no allocations can exist in the moved from object:
+    assert(m_alloc_seek == 0);
+}
+
+CubePtr CubeStorage::allocate(const Cube& cube) {
+    std::lock_guard lock(m_mtx);
+
+    if (!m_file) {
+        using namespace mapped;
+        // file not open yet.
+        m_file = std::make_shared<file>();
+        if (m_file->openrw(m_fpath.c_str(), 0, file::CREATE | file::RESIZE | file::FSTUNE)) {
+            std::printf("CubeStorage::allocate() ERROR: Failed to create backing file: %s\n", m_fpath.c_str());
+            std::abort();
+        }
+        // Map some data.
+        // todo: mapped::file could provide following:
+        // m_file->readAt(offset,size,datain)
+        // m_file->writeAt(offset,size,dataout)
+        // so that we don't need this mapping for I/O.
+        // However the mapped::region::readAt() will be faster if
+        // the area fits in the region window and is accessed multiple times.
+        m_map = std::make_unique<region>(m_file, 0, PAGE_SIZE);
+    }
+
+    if (m_cube_size != cube.size()) {
+        std::printf("CubeStorage::allocate() ERROR: Cube size different than initialized");
+        std::abort();
+    }
+
+    m_map->writeAt(m_alloc_seek, m_cube_size * sizeof(XYZ), cube.data());
+
+    auto fpos = m_alloc_seek;
+    m_prev_seek = m_alloc_seek;
+    m_alloc_seek += m_cube_size * sizeof(XYZ);
+
+    return CubePtr(fpos);
+}
+
+void CubeStorage::cancel_allocation() {
+    std::lock_guard lock(m_mtx);
+    // last allocation was mistake.
+    if (m_alloc_seek >= m_cube_size * sizeof(XYZ)) m_alloc_seek -= m_cube_size * sizeof(XYZ);
+
+    // allocate() -> cancel_allocation() must be serialized:
+    assert(m_alloc_seek == m_prev_seek);
+}
+
+Cube CubeStorage::read(const CubePtr& x) const {
+    // todo: How to speed up:
+    // Option 1:
+    // Memory-map the file in 2 MiB aligned chunks:
+    // This would speed up reading the same data multiple times.
+    // Chunk is mapped by rounding down the x.seek() to multiple of 2MiB
+    // and creating 2MiB sized mapping at that file offset.
+    // Caching the last file offset used we could detect
+    // when we have do do jump() to the next "reading window".
+    // -Plus: let the kernel do the caching for us.
+    // -Plus: no memory overhead.
+    // -Minus: if implemented with just single memory-map per CubeStorage
+    //         threads can fight about what chunk is currently mapped.
+    // Option 2:
+    // Implement fine-grained read-cache with:
+    // std::unordered_map<fileoffset, Cube>
+    // And begin evicting them once the cache is full using
+    // cache eviction policy. (E.g. least-recently-used LRU)
+    // The cache should be made to be thread local
+    // so it won't interfere with other workers.
+    // -Plus: We decide how much data to keep in memory
+    // -Plus: No need to remap the memory.
+    // -Minus: complicated to implement.
+    Cube tmp(m_cube_size);
+    m_map->readAt(x.seek(), m_cube_size * sizeof(XYZ), tmp.data());
+    return tmp;
+}
+
+void CubeStorage::discard() {
+    std::lock_guard lock(m_mtx);
+
+    if (m_file) {
+        // avoid flushing any more data to disk:
+        m_map->discard(0, m_map->regionSize());
+        m_map.reset();
+        m_file->truncate(0);
+        m_file.reset();
+        m_alloc_seek = 0;
+
+        // Try remove the file created...
+        std::error_code ec;
+        auto stat = std::filesystem::status(m_fpath, ec);
+        if (!ec && std::filesystem::is_regular_file(stat)) {
+            if (!std::filesystem::remove(m_fpath, ec)) {
+                std::printf("WARN: failed to remove file: %s", m_fpath.c_str());
+            }
+        } else {
+            std::printf("WARN: failed to get file status: %s", m_fpath.c_str());
+        }
+    }
+}
+
+Cube CubePtr::get(const CubeStorage& storage) const {
+    // CubePtr::get() is really just an convenience function...
+    // However this cannot be implemented in the header file because
+    // CubeStorage definition is not known.
+    return storage.read(*this);
+}
diff --git a/cpp/src/cubes.cpp b/cpp/src/cubes.cpp
index 6b60085..89b4e12 100644
--- a/cpp/src/cubes.cpp
+++ b/cpp/src/cubes.cpp
@@ -2,13 +2,13 @@
 
 #include <algorithm>
 #include <chrono>
+#include <condition_variable>
 #include <cstdint>
+#include <deque>
 #include <filesystem>
 #include <iostream>
 #include <mutex>
 #include <thread>
-#include <deque>
-#include <condition_variable>
 
 #include "cube.hpp"
 #include "hashes.hpp"
@@ -29,11 +29,7 @@ struct Workset {
     XYZ targetShape, shape, expandDim;
     bool notSameShape;
     Workset(Hashy &hashes, XYZ targetShape, XYZ shape, XYZ expandDim, bool notSameShape)
-        : hashes(hashes)
-        , targetShape(targetShape)
-        , shape(shape)
-        , expandDim(expandDim)
-        , notSameShape(notSameShape) {}
+        : hashes(hashes), targetShape(targetShape), shape(shape), expandDim(expandDim), notSameShape(notSameShape) {}
 
     void setRange(ShapeRange &data) {
         _begin_total = data.begin();
@@ -139,7 +135,7 @@ struct Workset {
 struct Worker {
     std::shared_ptr<Workset> ws;
     int id;
-    int state = 3; // 1 == completed/waiting for job, 2 == processing, 3 == job assigned.
+    int state = 3;  // 1 == completed/waiting for job, 2 == processing, 3 == job assigned.
     std::mutex mtx;
     std::condition_variable cond;
     std::condition_variable cond2;
@@ -156,7 +152,7 @@ struct Worker {
 
     void launch(std::shared_ptr<Workset> ws_) {
         std::unique_lock lock(mtx);
-        while(state > 1) {
+        while (state != 1) {
             cond2.wait(lock);
         }
         ws = ws_;
@@ -166,7 +162,7 @@ struct Worker {
 
     void sync() {
         std::unique_lock lock(mtx);
-        while(state > 1) {
+        while (state != 1) {
             cond2.wait(lock);
         }
         ws.reset();
@@ -175,13 +171,11 @@ struct Worker {
     void run() {
         std::unique_lock lock(mtx);
         std::printf("thread nro. %d started.\n", id);
-        while(state) {
+        while (state) {
             state = 1;
             cond2.notify_one();
-            while(state == 1)
-                cond.wait(lock);
-            if(!state)
-                return;
+            while (state == 1) cond.wait(lock);
+            if (!state) return;
             state = 2;
             // std::printf("start %d\n", id);
             auto subset = ws->getPart();
@@ -207,7 +201,7 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
     if (!std::filesystem::is_directory(base_path)) {
         std::filesystem::create_directory(base_path);
     }
-    Hashy hashes;
+    Hashy hashes(base_path);
     if (n < 1)
         return {};
     else if (n == 1) {
@@ -248,12 +242,13 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
     auto start = std::chrono::steady_clock::now();
     uint32_t totalOutputShapes = hashes.numShapes();
     uint32_t outShapeCount = 0;
-
     auto prevShapes = Hashy::generateShapes(n - 1);
-    for (auto &tup : hashes) {
+
+    for (const auto &tup : hashes) {
         outShapeCount++;
         XYZ targetShape = tup.first;
         std::printf("process output shape %3d/%d [%2d %2d %2d]\n\r", outShapeCount, totalOutputShapes, targetShape.x(), targetShape.y(), targetShape.z());
+
         for (uint32_t sid = 0; sid < prevShapes.size(); ++sid) {
             auto &shape = prevShapes[sid];
             int diffx = targetShape.x() - shape.x();
@@ -289,7 +284,7 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
             ws->setRange(s);
 
             // Wait for jobs to complete.
-            for (auto& thr : workers) {
+            for (auto &thr : workers) {
                 thr.sync();
             }
             std::printf("  shape %d %d %d\n\r", shape.x(), shape.y(), shape.z());
@@ -297,25 +292,23 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
             // Because the workset is held by shared_ptr
             // main thread can do above preparation work in parallel
             // while the jobs are running.
-            for (auto& thr : workers) {
+            for (auto &thr : workers) {
                 thr.launch(ws);
             }
         }
         // Wait for jobs to complete.
-        for (auto& thr : workers) {
+        for (auto &thr : workers) {
             thr.sync();
         }
         std::printf("  num: %lu\n\r", hashes.at(targetShape).size());
         totalSum += hashes.at(targetShape).size();
         if (write_cache && split_cache) {
             cw.save(base_path + "cubes_" + std::to_string(n) + "_" + std::to_string(targetShape.x()) + "-" + std::to_string(targetShape.y()) + "-" +
-                            std::to_string(targetShape.z()) + ".bin",
-                        hashes, n);
+                        std::to_string(targetShape.z()) + ".bin",
+                    hashes, n);
         }
         if (split_cache) {
-            for (auto &subset : hashes.at(targetShape)) {
-                subset.clear();
-            }
+            hashes.at(targetShape).clear();
         }
     }
 
diff --git a/cpp/src/newCache.cpp b/cpp/src/newCache.cpp
index d54b057..cfb078a 100644
--- a/cpp/src/newCache.cpp
+++ b/cpp/src/newCache.cpp
@@ -1,4 +1,5 @@
 #include "newCache.hpp"
+#include "cubeSwapSet.hpp"
 
 #include <iostream>
 
@@ -201,11 +202,11 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
     auto xyz = std::make_shared<array_region<XYZ>>(file_, (*shapeEntry)[0].offset, num_cubes * n);
     auto put = xyz->get();
 
-    auto copyrange = [n](CubeSet::iterator itr, CubeSet::iterator end, XYZ *dest) -> void {
+    auto copyrange = [n](const CubeStorage& storage, CubeSwapSet::iterator itr, CubeSwapSet::iterator end, XYZ *dest) -> void {
         while (itr != end) {
             static_assert(sizeof(XYZ) == XYZ_SIZE);
-            assert(itr->size() == n);
-            itr->copyout(n, dest);
+            assert(storage.cubeSize() == n);
+            itr->copyout(storage, n, dest);
             dest += n;
             ++itr;
         }
@@ -233,14 +234,14 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
                 std::flush(std::cout);
 
                 std::lock_guard lock(m_mtx);
-                m_copy.emplace_back(std::bind(copyrange, start, itr, dest));
+                m_copy.emplace_back(std::bind(copyrange, std::ref(subset.storage()), start, itr, dest));
                 ++m_num_copys;
                 m_run.notify_all();
             }
             // copy remainder, if any.
             if (dist) {
                 std::lock_guard lock(m_mtx);
-                m_copy.emplace_back(std::bind(copyrange, itr, subset.end(), put));
+                m_copy.emplace_back(std::bind(copyrange, std::ref(subset.storage()), itr, subset.end(), put));
                 ++m_num_copys;
                 m_run.notify_all();
                 put += n * dist;

From ea703294761622e4a5f7d4faae1243dcb9f9c9ea Mon Sep 17 00:00:00 2001
From: JATothrim <jarmo.tiitto@gmail.com>
Date: Sat, 19 Aug 2023 17:18:48 +0300
Subject: [PATCH 38/42] CubeSwapper: I/O optimizations

- Thread-local read-cache for CubeStorage:
  The read-cache is private to each thread that calls CubeStorage::read().
  Within a thread, the cache is shared by all CubeStorage instances.
  Entries are evicted from the cache with an LRU (least-recently-used) policy.

- Massive CacheWriter optimizations:
  The file written by CubeStorage is extremely useful for CacheWriter.
  CacheWriter now uses mapped::file::copyAt() to merge the
  CubeStorage file into the saved cache-file as-is.
  This completely bypasses iterating the CubeSwapSet Cube-by-Cube
  and lets CacheWriter::save() return without waiting for the data
  copy to actually complete.
  Once the copy job is complete, the source CubeStorage file is deleted.
  CubeStorage::discard() now simply drops its reference to the old file.

Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/include/cubeSwapSet.hpp | 162 ++++++++++++++++++----------
 cpp/include/hashes.hpp      |  27 +++--
 cpp/src/cubeSwapSet.cpp     | 207 ++++++++++++++++++++++++------------
 cpp/src/newCache.cpp        | 125 +++++++++++-----------
 4 files changed, 318 insertions(+), 203 deletions(-)

diff --git a/cpp/include/cubeSwapSet.hpp b/cpp/include/cubeSwapSet.hpp
index b7a2e5f..4a5e6f3 100644
--- a/cpp/include/cubeSwapSet.hpp
+++ b/cpp/include/cubeSwapSet.hpp
@@ -2,39 +2,50 @@
 #ifndef OPENCUBES_CUBE_DISKSWAP_SET_HPP
 #define OPENCUBES_CUBE_DISKSWAP_SET_HPP
 
+#include <filesystem>
 #include <memory>
 #include <mutex>
-#include <string>
-#include <unordered_map>
+#include <atomic>
 #include <unordered_set>
-#include <filesystem>
 
 #include "cube.hpp"
 #include "mapped_file.hpp"
 
 /**
- * Implement std::unordered_set<> that stores element data in a file.
+ * CubeSwapSet: Implement std::unordered_set<> that offloads XYZ data into a file:
  *
- * Cubes stored with size N in the set have constant cost of RAM memory:
- * Only the std::unordered_set<> itself and the internals nodes are stored in RAM.
+ * Cubes stored in the set have reduced cost of memory:
+ * Only the std::unordered_set<> itself and the internal nodes are stored in RAM.
  * The element *data* (i.e. XYZ data) is stored in the file.
- * The performance cost is that each time the element is accessed
- * the data has to be read back from the file.
- * (Iterating the entire CubeSwapSet involves reading the entire backing file)
+ * The performance cost is that each time the set element is accessed
+ * the data is read back from the file.
+ * (Iterating the entire CubeSwapSet involves reading the entire file)
  *
- * Clearing the CubeSwapSet does not release the backing file space managed by CubeStorage.
- * Call to CubeStorage::discard() is required after clearing or destructing
- * the CubeSwapSet instance to cleanup the file.
- * Elements cannot be removed one-by-one.
+ * Features:
+ * - XYZ data is recorded sequentially into the file and
+ *   the Cube size is not saved in the storage file.
+ * - Cube XYZ data length is constant in CubeStorage instance.
+ * - Clearing the CubeSwapSet does not release the file managed by CubeStorage.
+ *   (CubePtr(s) cannot be erased from CubeStorage)
+ * - CubeStorage::read(const CubePtr&) caches up to 1024 Cubes for each thread.
+ *   This read-cache is maintained by any thread that calls CubePtr::get().
+ *   CubeStorage::discard() is used to begin writing the XYZ data at new file instance.
+ * - CacheWriter utilizes the file instance from CubeStorage:
+ *   the CubeSwapSet is not iterated through at all by CacheWriter
+ *   and instead CubeStorage::getFile() is assigned into a copy job and then
+ *   copied into the cache-file with mapped::file::copyAt().
+ *   The source storage file is deleted once the copy is completed.
+ *   This provides wait-free saving of the cache-file and uses
+ *   minimal amount of system memory.
  */
 class CubeStorage;
 
 /**
- * Overlay that reads the cube data from the backing file.
- * CubePtr needs its associated CubeStorage instance to be able to
- * access its contents with CubePtr::get()
- * The associated CubeStorage owning the CubePtr
- * should always be available where CubePtr is used.
+ * CubePtr: "File Pointer to Cube" that reads the cube data from file.
+ * CubePtr needs CubeStorage instance to be able to access
+ * its contents with CubePtr::get().
+ * The associated CubeStorage should always be available
+ * in context where CubePtr(s) data is accessed.
  */
 class CubePtr {
    protected:
@@ -46,13 +57,23 @@ class CubePtr {
 
     /**
      * Get the Cube pointed by this instance.
+     * @note The Cube is cached in the thread-local read-cache.
+     * @warn
+     *  The Cube object is local to calling thread and shall
+     *  not be passed into other threads.
+     */
+    const Cube& get(const CubeStorage& storage) const;
+
+    /**
+     * Raw data copy. By-passes the thread-local cache.
      */
-    Cube get(const CubeStorage& storage) const;
+    void copyout(const CubeStorage& storage, size_t n, XYZ* out) const;
 
     template <typename Itr>
     void copyout(const CubeStorage& storage, size_t n, Itr out) const {
-        auto tmp = get(storage);
-        std::copy_n(tmp.begin(), n, out);
+        std::vector<XYZ> buff(n);
+        copyout(storage, n, buff.data());
+        std::copy_n(buff.begin(), n, out);
     }
 
     mapped::seekoff_t seek() const { return m_seek; }
@@ -64,42 +85,36 @@ class CubePtr {
 class CubePtrEqual {
    protected:
     const CubeStorage* m_storage = nullptr;
+
    public:
-	// C++20 feature:
+    // C++20 feature:
     using is_transparent = void;
 
     CubePtrEqual(const CubeStorage* ctx) : m_storage(ctx) {}
     CubePtrEqual(const CubePtrEqual& ctx) : m_storage(ctx.m_storage) {}
 
-    bool operator()(const CubePtr& a, const CubePtr& b) const { return a.get(*m_storage) == b.get(*m_storage); }
-
-    bool operator()(const Cube& a, const CubePtr& b) const { return a == b.get(*m_storage); }
-
-    bool operator()(const CubePtr& a, const Cube& b) const { return a.get(*m_storage) == b; }
+    bool operator()(const CubePtr& a, const CubePtr& b) const {
+        // todo: there is possibility that
+        // a.get() returned cube is *deleted* from the cache by b.get()
+        // The read-cache size must be at least 3 to avoid this.
+        return a.get(*m_storage) == b.get(*m_storage);
+    }
 };
 
 class CubePtrHash {
    protected:
     const CubeStorage* m_storage = nullptr;
+
    public:
-	// C++20 feature:
+    // C++20 feature:
     using is_transparent = void;
     using transparent_key_equal = CubePtrEqual;
 
     CubePtrHash(const CubeStorage* ctx) : m_storage(ctx) {}
     CubePtrHash(const CubePtrHash& ctx) : m_storage(ctx.m_storage) {}
 
-    size_t operator()(const Cube& x) const {
-        std::size_t seed = x.size();
-        for (auto& p : x) {
-            auto x = HashXYZ()(p);
-            seed ^= x + 0x9e3779b9 + (seed << 6) + (seed >> 2);
-        }
-        return seed;
-    }
-
     size_t operator()(const CubePtr& x) const {
-        auto cube = x.get(*m_storage);
+        auto& cube = x.get(*m_storage);
         std::size_t seed = cube.size();
         for (auto& p : cube) {
             auto x = HashXYZ()(p);
@@ -111,24 +126,24 @@ class CubePtrHash {
 
 class CubeStorage {
    protected:
-    std::mutex m_mtx;
+    mutable std::mutex m_mtx;
     std::filesystem::path m_fpath;
     std::shared_ptr<mapped::file> m_file;
-    std::unique_ptr<mapped::region> m_map;
 
     static std::atomic<int> m_init_num;
+    int m_storage_version = 0;
     const size_t m_cube_size;
-    mapped::seekoff_t m_prev_seek = 0;
-    mapped::seekoff_t m_alloc_seek = 0;
+
+    mapped::seekoff_t m_alloc_seek;
 
    public:
     /**
      * Initialize Cube file storage
-     * @param fname directory where to store the backing file.
-     * @param n The storage is reserved in n sized chunks.
-     *   This should be equal to Cube::size() that are passed into allocate()
-     *   as no other allocation size is supported.
-     * @note the backing file creation is delayed until allocate() is called first time.
+     * @param path directory where to write the storage file.
+     * @param n The storage is written in n sized chunks of XYZ structs.
+     *   This should be equal to Cube::size() that are passed into local()
+     *   Different sized Cubes in same CubeStorage instance will not work.
+     * @note the file creation is delayed until commit() is called first time.
      */
     CubeStorage(std::filesystem::path path, size_t n);
     ~CubeStorage();
@@ -136,32 +151,63 @@ class CubeStorage {
     // not copyable
     CubeStorage(const CubeStorage&) = delete;
     CubeStorage& operator=(const CubeStorage&) = delete;
-    // move constructible: but only if no allocations exists
+    // move constructible: but only if no allocations exists in mv
     CubeStorage(CubeStorage&& mv);
     CubeStorage& operator=(CubeStorage&& mv) = delete;
 
     size_t cubeSize() const { return m_cube_size; }
 
     /**
-     * Store Cube data into the backing file.
-     * Returns CubePtr that can be inserted into CubeSwapSet.
-     * @note cube.size() must be equal to this->cubeSize()
+     * Make thread local CubePtr instance.
+     * @note
+     *  Other thread cannot access the returned CubePtr until commit() is called.
+     */
+    CubePtr local(const Cube& cube) const;
+
+    /**
+     * Publish the last local() returned CubePtr.
+     * commit() writes this the data into the file storage
+     * making it visible to all threads.
+     */
+    void commit();
+
+    /**
+     * Discard the last local() returned CubePtr.
+     */
+    void drop() const;
+
+    /**
+     * Retrieve the cube data from the backing file
+     * and cache the result for the caller thread.
+     */
+    const Cube& read(const CubePtr& x) const;
+
+    /**
+     * Copy the cube data from the storage into destination buffer.
+     */
+    void copydata(const CubePtr& x, size_t n, XYZ* destination) const;
+
+    /**
+     * Explicitly clear the calling thread's read-cache.
+     * @note this will initialize callers read-cache instance
+     *  if the thread has not used the read-cache yet.
+     *  So only call this from thread that has used to read().
      */
-    CubePtr allocate(const Cube& cube);
+    void resetReadCache() const;
 
     /**
-     * Revert the effect of last allocate()
+     * Get the file name CubeStorage is using.
      */
-    void cancel_allocation();
+    std::filesystem::path fileName() const { return m_fpath; }
 
     /**
-     * Retrieve the cube data from the backing file.
+     * Get the mapped::file instance.
+     * @note this can be null if nothing has been written to the storage yet.
      */
-    Cube read(const CubePtr& x) const;
+    std::shared_ptr<mapped::file> getFile() const { return m_file; }
 
     /**
      * Drop all stored data.
-     * Shrinks the backing file to zero size and deletes it.
      */
     void discard();
 };
@@ -171,7 +217,7 @@ class CubeStorage {
  *
  * The CubeSwapSet must be constructed with already initialized
  * stateful instances of CubePtrEqual and CubePtrHash functors
- * that resolve the CubePtr instance using the CubeStorage instance.
+ * that resolve the CubePtr(s) using the CubeStorage instance.
  */
 using CubeSwapSet = std::unordered_set<CubePtr, CubePtrHash, CubePtrEqual>;
 
diff --git a/cpp/include/hashes.hpp b/cpp/include/hashes.hpp
index 79bedfb..fcbab4e 100644
--- a/cpp/include/hashes.hpp
+++ b/cpp/include/hashes.hpp
@@ -26,7 +26,7 @@ struct HashCube {
     }
 };
 
-using CubeSet = std::unordered_set<Cube, HashCube, std::equal_to<Cube>>;
+// using CubeSet = std::unordered_set<Cube, HashCube, std::equal_to<Cube>>;
 
 class Subsubhashy {
    protected:
@@ -40,20 +40,22 @@ class Subsubhashy {
     template <typename CubeT>
     void insert(CubeT &&c) {
         std::lock_guard lock(set_mutex);
-        auto [itr, isnew] = set.emplace(set_storage.allocate(std::forward<CubeT>(c)));
-        if (!isnew) {
-            set_storage.cancel_allocation();
+        auto cptr = set_storage.local(std::forward<CubeT>(c));
+        auto [itr, isnew] = set.emplace(cptr);
+        if (isnew) {
+            set_storage.commit();
+        } else {
+            set_storage.drop();
         }
     }
 
-#if __cplusplus > 201703L
-// todo: need C++17 equivalent for *generic*
-// contains() or find() that accepts both Cube and CubePtr types
     bool contains(const Cube &c) const {
         std::shared_lock lock(set_mutex);
-        return set.contains<Cube>(c);
+        auto cptr = set_storage.local(c);
+        auto itr = set.find(cptr);
+        set_storage.drop();
+        return itr != set.end();
     }
-#endif
 
     auto size() const {
         std::shared_lock lock(set_mutex);
@@ -64,12 +66,10 @@ class Subsubhashy {
         std::lock_guard lock(set_mutex);
         set.clear();
         set.reserve(1);
+        set_storage.discard();
     }
 
     // Get CubeStorage instance.
-    // [this->begin(), this->end()] iterated CubePtr's
-    // Can be resolved with CubePtr::get(this->storage())
-    // that returns copy of the data as Cube.
     const CubeStorage &storage() const { return set_storage; }
 
     auto begin() const { return set.begin(); }
@@ -94,9 +94,8 @@ class Subhashy {
         HashCube hash;
         auto idx = hash(c) % byhash.size();
         auto &set = byhash[idx];
-#if __cplusplus > 201703L
+
         if (set.contains(c)) return;
-#endif
         set.insert(std::forward<CubeT>(c));
         // printf("new size %ld\n\r", byshape[shape].size());
     }
diff --git a/cpp/src/cubeSwapSet.cpp b/cpp/src/cubeSwapSet.cpp
index 1f391ff..a4c8e4c 100644
--- a/cpp/src/cubeSwapSet.cpp
+++ b/cpp/src/cubeSwapSet.cpp
@@ -1,6 +1,56 @@
 #include "cubeSwapSet.hpp"
 
 #include <filesystem>
+#include <list>
+#include <unordered_map>
+
+/**
+ * thread-local read-cache for Cube(s)
+ */
+class ThreadCache {
+   public:
+    static ThreadCache& get();
+
+    struct entry {
+        // read-cache "key"
+        const CubeStorage* storage;
+        mapped::seekoff_t seek;
+        int version;
+
+        friend bool operator==(const entry& a, const entry& b) { return std::tie(a.storage, a.seek, a.version) == std::tie(b.storage, b.seek, b.version); }
+    };
+
+    struct state {
+        // cached data.
+        Cube cube;
+        std::list<entry>::iterator lru;
+    };
+
+    struct entry_hash {
+        size_t operator()(const entry& x) const {
+            size_t seed = uintptr_t(x.storage);
+            seed ^= x.seek + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+            seed ^= x.version + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+            return seed;
+        };
+    };
+
+    // Least-recently-used, LRU eviction policy list.
+    std::list<entry> lru;
+    // trick: make map with reference_wrapper<entry>
+    // as key so we don't need to duplicate the data from the lru list.
+    // surprisingly C++17 cache.find(entry) works.
+    std::unordered_map<std::reference_wrapper<const entry>, state, entry_hash, std::equal_to<entry>> cache;
+
+    bool local_enabled = false;
+    mapped::seekoff_t local_seek = -1;
+    Cube local;
+};
+
+ThreadCache& ThreadCache::get() {
+    static thread_local ThreadCache instance;
+    return instance;
+}
 
 std::atomic<int> CubeStorage::m_init_num(0);
 
@@ -12,12 +62,20 @@ CubeStorage::CubeStorage(std::filesystem::path path, size_t n) : m_cube_size(n)
 CubeStorage::~CubeStorage() { discard(); }
 
 CubeStorage::CubeStorage(CubeStorage&& mv)
-    : m_fpath(std::move(mv.m_fpath)), m_file(std::move(mv.m_file)), m_map(std::move(mv.m_map)), m_cube_size(mv.m_cube_size), m_alloc_seek(mv.m_alloc_seek) {
+    : m_fpath(std::move(mv.m_fpath)), m_file(std::move(mv.m_file)), m_cube_size(mv.m_cube_size), m_alloc_seek(mv.m_alloc_seek) {
     // no allocations can exist in the moved from object:
     assert(m_alloc_seek == 0);
 }
 
-CubePtr CubeStorage::allocate(const Cube& cube) {
+CubePtr CubeStorage::local(const Cube& cube) const {
+    auto& ctx = ThreadCache::get();
+    ctx.local = cube;
+    ctx.local_seek = m_alloc_seek;
+    ctx.local_enabled = true;
+    return CubePtr(ctx.local_seek);
+}
+
+void CubeStorage::commit() {
     std::lock_guard lock(m_mtx);
 
     if (!m_file) {
@@ -25,97 +83,110 @@ CubePtr CubeStorage::allocate(const Cube& cube) {
         // file not open yet.
         m_file = std::make_shared<file>();
         if (m_file->openrw(m_fpath.c_str(), 0, file::CREATE | file::RESIZE | file::FSTUNE)) {
-            std::printf("CubeStorage::allocate() ERROR: Failed to create backing file: %s\n", m_fpath.c_str());
+            std::printf("CubeStorage::allocate() ERROR: Failed to create file: %s\n", m_fpath.c_str());
             std::abort();
         }
-        // Map some data.
-        // todo: mapped::file could provide following:
-        // m_file->readAt(offset,size,datain)
-        // m_file->writeAt(offset,size,dataout)
-        // so that we don't need this mapping for I/O.
-        // However the mapped::region::readAt() will be faster if
-        // the area fits in the region window and is accessed multiple times.
-        m_map = std::make_unique<region>(m_file, 0, PAGE_SIZE);
     }
 
-    if (m_cube_size != cube.size()) {
-        std::printf("CubeStorage::allocate() ERROR: Cube size different than initialized");
-        std::abort();
-    }
-
-    m_map->writeAt(m_alloc_seek, m_cube_size * sizeof(XYZ), cube.data());
+    auto& ctx = ThreadCache::get();
+    assert(ctx.local_enabled);
+    assert(ctx.local_seek == m_alloc_seek);
+    ctx.local_enabled = false;
 
-    auto fpos = m_alloc_seek;
-    m_prev_seek = m_alloc_seek;
+    m_file->writeAt(m_alloc_seek, m_cube_size * sizeof(XYZ), ctx.local.data());
     m_alloc_seek += m_cube_size * sizeof(XYZ);
+}
 
-    return CubePtr(fpos);
+void CubeStorage::drop() const {
+    auto& ctx = ThreadCache::get();
+    assert(ctx.local_enabled);
+    ctx.local_enabled = false;
+    ctx.local_seek = -1;
 }
 
-void CubeStorage::cancel_allocation() {
-    std::lock_guard lock(m_mtx);
-    // last allocation was mistake.
-    if (m_alloc_seek >= m_cube_size * sizeof(XYZ)) m_alloc_seek -= m_cube_size * sizeof(XYZ);
+const Cube& CubeStorage::read(const CubePtr& x) const {
+    // Get thread's cache instance:
+    auto& ctx = ThreadCache::get();
+
+    // Check if x is actually the object returned by local():
+    if (ctx.local_enabled && x.seek() == ctx.local_seek) {
+        assert(ctx.local.size() == m_cube_size);
+        return ctx.local;
+    }
 
-    // allocate() -> cancel_allocation() must be serialized:
-    assert(m_alloc_seek == m_prev_seek);
+    ThreadCache::entry key{this, x.seek(), m_storage_version};
+    auto itr = ctx.cache.find(key);
+    if (itr != ctx.cache.end()) {
+        // cache-hit.
+        // LRU policy simply moves the element at back of the list:
+        if (std::next(itr->second.lru) != ctx.lru.end()) {
+            ctx.lru.splice(itr->second.lru, ctx.lru, ctx.lru.end());
+        }
+        return itr->second.cube;
+    } else {
+        // cache-miss.
+        // Evict entry at front if read-cache is full:
+        if (ctx.cache.size() >= 1024) {
+            auto rm = ctx.cache.find(ctx.lru.front());
+            ctx.cache.erase(rm);
+            ctx.lru.pop_front();
+        }
+
+        // Read Cube data
+        Cube tmp(m_cube_size);
+        m_file->readAt(x.seek(), m_cube_size * sizeof(XYZ), tmp.data());
+
+        // Move it into an new read-cache entry:
+        auto nitr = ctx.lru.insert(ctx.lru.end(), key);
+        auto [itr, ok] = ctx.cache.emplace(std::ref(*nitr), ThreadCache::state{std::move(tmp), nitr});
+        assert(ok);
+        return itr->second.cube;
+    }
+}
+
+void CubeStorage::resetReadCache() const {
+    auto& ctx = ThreadCache::get();
+    ctx.cache.clear();
+    ctx.lru.clear();
 }
 
-Cube CubeStorage::read(const CubePtr& x) const {
-    // todo: How to speed up:
-    // Option 1:
-    // Memory-map the file in 2 MiB aligned chunks:
-    // This would speed up reading the same data multiple times.
-    // Chunk is mapped by rounding down the x.seek() to multiple of 2MiB
-    // and creating 2MiB sized mapping at that file offset.
-    // Caching the last file offset used we could detect
-    // when we have do do jump() to the next "reading window".
-    // -Plus: let the kernel do the caching for us.
-    // -Plus: no memory overhead.
-    // -Minus: if implemented with just single memory-map per CubeStorage
-    //         threads can fight about what chunk is currently mapped.
-    // Option 2:
-    // Implement fine-grained read-cache with:
-    // std::unordered_map<fileoffset, Cube>
-    // And begin evicting them once the cache is full using
-    // cache eviction policy. (E.g. least-recently-used LRU)
-    // The cache should be made to be thread local
-    // so it won't interfere with other workers.
-    // -Plus: We decide how much data to keep in memory
-    // -Plus: No need to remap the memory.
-    // -Minus: complicated to implement.
-    Cube tmp(m_cube_size);
-    m_map->readAt(x.seek(), m_cube_size * sizeof(XYZ), tmp.data());
-    return tmp;
+void CubeStorage::copydata(const CubePtr& x, size_t n, XYZ* destination) const {
+    // copydata() doesn't use thread's read-cache
+    // so local() cannot be active:
+    assert(!ThreadCache::get().local_enabled);
+    m_file->readAt(x.seek(), n * sizeof(XYZ), destination);
 }
 
 void CubeStorage::discard() {
     std::lock_guard lock(m_mtx);
 
     if (m_file) {
-        // avoid flushing any more data to disk:
-        m_map->discard(0, m_map->regionSize());
-        m_map.reset();
-        m_file->truncate(0);
+        // The backing file is kept intact
+        // so that CacheWriter can process it.
         m_file.reset();
         m_alloc_seek = 0;
-
-        // Try remove the file created...
-        std::error_code ec;
-        auto stat = std::filesystem::status(m_fpath, ec);
-        if (!ec && std::filesystem::is_regular_file(stat)) {
-            if (!std::filesystem::remove(m_fpath, ec)) {
-                std::printf("WARN: failed to remove file: %s", m_fpath.c_str());
-            }
-        } else {
-            std::printf("WARN: failed to get file status: %s", m_fpath.c_str());
-        }
+        // Thread read-cache problem:
+        // discard() must cause eviction of all entries for each
+        // thread's read cache that point into this.
+        // This done by incrementing m_storage_version:
+        // the entries can't simply be found as they are
+        // made with m_storage_version - 1 value.
+        // The entries are eventually evicted by
+        // the read-cache this way.
+        ++m_storage_version;
     }
 }
 
-Cube CubePtr::get(const CubeStorage& storage) const {
+const Cube& CubePtr::get(const CubeStorage& storage) const {
     // CubePtr::get() is really just an convenience function...
     // However this cannot be implemented in the header file because
     // CubeStorage definition is not known.
     return storage.read(*this);
 }
+
+void CubePtr::copyout(const CubeStorage& storage, size_t n, XYZ* out) const {
+    // CubePtr::copyout() is really just an convenience function...
+    // However this cannot be implemented in the header file because
+    // CubeStorage definition is not known.
+    storage.copydata(*this, n, out);
+}
\ No newline at end of file
diff --git a/cpp/src/newCache.cpp b/cpp/src/newCache.cpp
index cfb078a..8ae2f5b 100644
--- a/cpp/src/newCache.cpp
+++ b/cpp/src/newCache.cpp
@@ -1,8 +1,9 @@
 #include "newCache.hpp"
-#include "cubeSwapSet.hpp"
 
 #include <iostream>
 
+#include "cubeSwapSet.hpp"
+
 CacheReader::CacheReader() : path_(""), fileLoaded_(false), dummyHeader{0, 0, 0, 0}, header(&dummyHeader), shapes(nullptr) {}
 
 void CacheReader::printHeader() {
@@ -167,18 +168,24 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
         return;
     }
 
+    // Write header:
     auto header = std::make_shared<struct_region<Header>>(file_, 0);
     (*header)->magic = cacheformat::MAGIC;
     (*header)->n = n;
     (*header)->numShapes = hashes.numShapes();
     (*header)->numPolycubes = hashes.size();
+    header->flush();
 
     std::vector<XYZ> keys;
     keys.reserve((*header)->numShapes);
     for (auto &pair : hashes) keys.push_back(pair.first);
     std::sort(keys.begin(), keys.end());
 
+    // Write shape table:
     auto shapeEntry = std::make_shared<array_region<ShapeEntry>>(file_, header->getEndSeek(), (*header)->numShapes);
+    header.reset();
+
+    static_assert(XYZ_SIZE == sizeof(XYZ), "XYZ_SIZE differs from sizeof(XYZ)");
 
     uint64_t offset = shapeEntry->getEndSeek();
     size_t num_cubes = 0;
@@ -195,89 +202,79 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
         se.size = count * XYZ_SIZE * n;
         offset += se.size;
     }
+    shapeEntry->flush();
 
     // put XYZs
-    // Serialize large CubeSet(s) in parallel.
-
-    auto xyz = std::make_shared<array_region<XYZ>>(file_, (*shapeEntry)[0].offset, num_cubes * n);
-    auto put = xyz->get();
-
-    auto copyrange = [n](const CubeStorage& storage, CubeSwapSet::iterator itr, CubeSwapSet::iterator end, XYZ *dest) -> void {
-        while (itr != end) {
-            static_assert(sizeof(XYZ) == XYZ_SIZE);
-            assert(storage.cubeSize() == n);
-            itr->copyout(storage, n, dest);
-            dest += n;
-            ++itr;
+    // Schedule merging of the cache file.
+    // CubeSwapSet enables massive optimizations in how
+    // CacheWriter can merge the SubsubHashy's data into the final cache file:
+    // - copystorage lambda takes the source file and it's file name from the
+    //   SubsubHashy::storage() returned CubeStorage.
+    // - mapped::file::copyAt() is used to efficiently copy the source file contents into this cache file
+    // - Finally the copystorage lambda *deletes* the source storage file
+    // The main program does not need to wait for this process to complete.
+
+    // copystorage takes shared ownership of the file_
+    auto copystorage = [n, file = file_](std::shared_ptr<mapped::file> src, std::filesystem::path rmname, size_t num, mapped::seekoff_t dest) -> void {
+        file->copyAt(src, 0, num * n * sizeof(XYZ), dest);
+        src.reset();
+
+        // Try remove the source storage file.
+        std::error_code ec;
+        auto stat = std::filesystem::status(rmname, ec);
+        if (!ec && std::filesystem::is_regular_file(stat)) {
+            if (!std::filesystem::remove(rmname, ec)) {
+                std::printf("WARN: failed to remove file: %s", rmname.c_str());
+            }
+        } else {
+            std::printf("WARN: failed to get file status: %s", rmname.c_str());
         }
     };
 
+    mapped::seekoff_t fileEnd = shapeEntry->getEndSeek();
     auto time_start = std::chrono::steady_clock::now();
-    for (auto &key : keys) {
-        for (auto &subset : hashes.at(key)) {
-            auto itr = subset.begin();
-
-            ptrdiff_t dist = subset.size();
-            // distribute if range is large enough.
-            auto skip = std::max(4096L, std::max(1L, dist / (signed)m_flushers.size()));
-            while (dist > skip) {
-                auto start = itr;
-                auto dest = put;
-
-                auto inc = std::min(dist, skip);
-                std::advance(itr, inc);
-                put += n * inc;
-                dist = std::distance(itr, subset.end());
-
-                auto done = 100.0f * (std::distance(xyz->get(), put) / float(num_cubes * n));
-                std::printf("writing data %5.2f%% ...  \r", done);
-                std::flush(std::cout);
-
-                std::lock_guard lock(m_mtx);
-                m_copy.emplace_back(std::bind(copyrange, std::ref(subset.storage()), start, itr, dest));
-                ++m_num_copys;
-                m_run.notify_all();
-            }
-            // copy remainder, if any.
-            if (dist) {
-                std::lock_guard lock(m_mtx);
-                m_copy.emplace_back(std::bind(copyrange, std::ref(subset.storage()), itr, subset.end(), put));
+    for (size_t i = 0; i < keys.size(); ++i) {
+        auto put = (*shapeEntry)[i].offset;
+        for (auto &subset : hashes.at(keys[i])) {
+            ptrdiff_t num = subset.size();
+            if (num) {
+                // By pass iterating the Subsubhashy entirely
+                // and copy the data from CubeStorage file *directly* into this file.
+                // the Cube data does end up in different order than when copying one-by-one.
+                // But we don't care as the order is random already.
+                // the copy job also deletes the CubeStorage::fileName() file from the disk
+                // once the data copy completes.
+                std::unique_lock lock(m_mtx);
+                m_copy.emplace_back(std::bind(copystorage, subset.storage().getFile(), subset.storage().fileName(), num, put));
                 ++m_num_copys;
                 m_run.notify_all();
-                put += n * dist;
-
-                auto done = 100.0f * (std::distance(xyz->get(), put) / float(num_cubes * n));
-                std::printf("writing data %5.2f%% ...  \r", done);
+                std::printf("scheduled copy jobs: %*d ...  \r", 3, (int)m_num_copys);
                 std::flush(std::cout);
             }
+            put += num * n * XYZ_SIZE;
         }
+        fileEnd = std::max(fileEnd, put);
     }
+    shapeEntry.reset();
 
-    // sanity check:
-    assert(put == (*xyz).get() + num_cubes * n);
-
-    // sync up.
+    // sync up a bit.
+    // don't allow the copy job queue to grow indefinitely
+    // if the disk can't keep up.
     std::unique_lock lock(m_mtx);
-    while (m_num_copys) {
+    while (m_num_copys > m_flushers.size()) {
+        std::printf("waiting for %*d copy jobs to complete ...  \r", 3, (int)m_num_copys);
+        std::flush(std::cout);
         m_wait.wait(lock);
     }
 
-    // move the resources into flush job.
+    // move the file into flush job.
     m_flushes.emplace_back(std::bind(
-        [](auto &&file, auto &&header, auto &&shapeEntry, auto &&xyz) -> void {
-            // flush.
-            header->flush();
-            shapeEntry->flush();
-            xyz->flush();
-            // Truncate file to proper size.
-            file->truncate(xyz->getEndSeek());
+        [fileEnd](auto &&file) -> void {
+            file->truncate(fileEnd);
             file->close();
             file.reset();
-            xyz.reset();
-            shapeEntry.reset();
-            header.reset();
         },
-        std::move(file_), std::move(header), std::move(shapeEntry), std::move(xyz)));
+        std::move(file_)));
     ++m_num_flushes;
     m_run.notify_all();
 
@@ -290,6 +287,8 @@ void CacheWriter::save(std::string path, Hashy &hashes, uint8_t n) {
 void CacheWriter::flush() {
     std::unique_lock lock(m_mtx);
     while (m_num_flushes) {
+        std::printf("%*d copy jobs total remaining on %*d files  ...  \r", 3, (int)m_num_copys, 2, (int)m_num_flushes);
+        std::flush(std::cout);
         m_wait.wait(lock);
     }
 }

From 2f86284a5d8a65966b0abd451908a38c26514533 Mon Sep 17 00:00:00 2001
From: JATothrim <jarmo.tiitto@gmail.com>
Date: Fri, 25 Aug 2023 18:52:55 +0300
Subject: [PATCH 39/42] CubeStorage: Memory map 2 MiB area at end of the file.

- Memory map 2 MiB region at end of the backing file.
  This consumes additional 2 MiB of RAM per CubeStorage instance but
  reduces the number of file::truncate() and system calls issued
  by a large factor.
  The mapped region also speeds up CubeStorage::read() if the CubePtr
  falls into the mapped area as mapped::region::readAt() can simply
  memcpy the data.
- Reduce Subsubhashy::insert() write-lock scope.
  If the entry is dropped (because another thread inserted it first)
  unlock immediately before CubeStorage::drop() is called.

Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/include/cubeSwapSet.hpp | 13 ++++++++++---
 cpp/include/hashes.hpp      |  3 ++-
 cpp/src/cubeSwapSet.cpp     | 34 +++++++++++++++++++++++++++-------
 3 files changed, 39 insertions(+), 11 deletions(-)

diff --git a/cpp/include/cubeSwapSet.hpp b/cpp/include/cubeSwapSet.hpp
index 4a5e6f3..b3395aa 100644
--- a/cpp/include/cubeSwapSet.hpp
+++ b/cpp/include/cubeSwapSet.hpp
@@ -134,8 +134,13 @@ class CubeStorage {
     int m_storage_version = 0;
     const size_t m_cube_size;
 
+    mapped::seekoff_t m_reserved_end;
+    // End of committed data.
     mapped::seekoff_t m_alloc_seek;
 
+    // m_file_head: 2 MiB memory mapped area at end of the file.
+    std::unique_ptr<mapped::region> m_file_head;
+
    public:
     /**
      * Initialize Cube file storage
@@ -160,7 +165,9 @@ class CubeStorage {
     /**
      * Make thread local CubePtr instance.
      * @note
-     *  Other thread cannot access the returned CubePtr until commit() is called.
+     *  Other thread(s) cannot access the returned CubePtr until commit() is called.
+     *  This requires that external lock is held for the data structure
+     *  if CubePtr is made visible to other thread(s) until this thread calls commit()
      */
     CubePtr local(const Cube& cube) const;
 
@@ -189,9 +196,9 @@ class CubeStorage {
 
     /**
      * Explicitly clear the calling thread's read-cache.
-     * @note this will initialize callers read-cache instance
+     * @note this will *initialize* callers read-cache instance
      *  if the thread has not used the read-cache yet.
-     *  So only call this from thread that has used to read().
+     *  Only call this from thread that has used to read() previously.
      */
     void resetReadCache() const;
 
diff --git a/cpp/include/hashes.hpp b/cpp/include/hashes.hpp
index fcbab4e..cc838ab 100644
--- a/cpp/include/hashes.hpp
+++ b/cpp/include/hashes.hpp
@@ -39,12 +39,13 @@ class Subsubhashy {
 
     template <typename CubeT>
     void insert(CubeT &&c) {
-        std::lock_guard lock(set_mutex);
+        std::unique_lock lock(set_mutex);
         auto cptr = set_storage.local(std::forward<CubeT>(c));
         auto [itr, isnew] = set.emplace(cptr);
         if (isnew) {
             set_storage.commit();
         } else {
+            lock.unlock();
             set_storage.drop();
         }
     }
diff --git a/cpp/src/cubeSwapSet.cpp b/cpp/src/cubeSwapSet.cpp
index a4c8e4c..7d5819b 100644
--- a/cpp/src/cubeSwapSet.cpp
+++ b/cpp/src/cubeSwapSet.cpp
@@ -76,7 +76,7 @@ CubePtr CubeStorage::local(const Cube& cube) const {
 }
 
 void CubeStorage::commit() {
-    std::lock_guard lock(m_mtx);
+    std::unique_lock lock(m_mtx);
 
     if (!m_file) {
         using namespace mapped;
@@ -86,15 +86,32 @@ void CubeStorage::commit() {
             std::printf("CubeStorage::allocate() ERROR: Failed to create file: %s\n", m_fpath.c_str());
             std::abort();
         }
+
+        // memory map 2 MiB chunk for writing.
+        // This also works as "pre-read-cache" for read():
+        // Any CubePtr(s) in this window even if they
+        // are not yet in thread's read-cache have fast readAt().
+        m_file_head = std::make_unique<mapped::region>(m_file, 0, 2 * 1024 * 1024);
+    }
+    auto datasize = m_cube_size * sizeof(XYZ);
+    auto write_fpos = m_alloc_seek;
+
+    if(m_reserved_end < m_alloc_seek + datasize) {
+        // advance the backing file m_file_head to next 2 MiB chunk.
+        m_reserved_end += 2 * 1024 * 1024;
+        m_file_head->flushJump(m_reserved_end);
     }
+    // advance write offset:
+    m_alloc_seek = write_fpos + datasize;
+    // allow parallel m_file_head->writeAt() calls:
+    lock.unlock();
 
     auto& ctx = ThreadCache::get();
     assert(ctx.local_enabled);
     assert(ctx.local_seek == m_alloc_seek);
     ctx.local_enabled = false;
 
-    m_file->writeAt(m_alloc_seek, m_cube_size * sizeof(XYZ), ctx.local.data());
-    m_alloc_seek += m_cube_size * sizeof(XYZ);
+    m_file_head->writeAt(write_fpos, datasize, ctx.local.data());
 }
 
 void CubeStorage::drop() const {
@@ -134,7 +151,7 @@ const Cube& CubeStorage::read(const CubePtr& x) const {
 
         // Read Cube data
         Cube tmp(m_cube_size);
-        m_file->readAt(x.seek(), m_cube_size * sizeof(XYZ), tmp.data());
+        m_file_head->readAt(x.seek(), m_cube_size * sizeof(XYZ), tmp.data());
 
         // Move it into an new read-cache entry:
         auto nitr = ctx.lru.insert(ctx.lru.end(), key);
@@ -154,7 +171,7 @@ void CubeStorage::copydata(const CubePtr& x, size_t n, XYZ* destination) const {
     // copydata() doesn't use thread's read-cache
     // so local() cannot be active:
     assert(!ThreadCache::get().local_enabled);
-    m_file->readAt(x.seek(), n * sizeof(XYZ), destination);
+    m_file_head->readAt(x.seek(), n * sizeof(XYZ), destination);
 }
 
 void CubeStorage::discard() {
@@ -163,11 +180,14 @@ void CubeStorage::discard() {
     if (m_file) {
         // The backing file is kept intact
         // so that CacheWriter can process it.
+        m_file_head->flush();
+        m_file_head.reset();
         m_file.reset();
         m_alloc_seek = 0;
+        m_reserved_end = 0;
         // Thread read-cache problem:
-        // discard() must cause eviction of all entries for each
-        // thread's read cache that point into this.
+        // discard() must cause eviction of all read-cache
+        // entries for each thread's read cache that point into this.
         // This done by incrementing m_storage_version:
         // the entries can't simply be found as they are
         // made with m_storage_version - 1 value.

From 55e15f9f2358f335dbfa972288a8676fee8c1da1 Mon Sep 17 00:00:00 2001
From: JATothrim <jarmo.tiitto@gmail.com>
Date: Fri, 25 Aug 2023 23:11:55 +0300
Subject: [PATCH 40/42] Fix-up asserts and debug build.

Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/include/cube.hpp    | 2 +-
 cpp/src/cubeSwapSet.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cube.hpp b/cpp/include/cube.hpp
index a13a2e0..5abf4c7 100644
--- a/cpp/include/cube.hpp
+++ b/cpp/include/cube.hpp
@@ -189,7 +189,7 @@ struct Cube {
      * Copy cube data into destination buffer.
      */
     void copyout(int num, XYZ *dest) const {
-        assert(num <= size());
+        assert(num <= (signed)size());
         std::copy_n(begin(), num, dest);
     }
 };
diff --git a/cpp/src/cubeSwapSet.cpp b/cpp/src/cubeSwapSet.cpp
index 7d5819b..979e7ab 100644
--- a/cpp/src/cubeSwapSet.cpp
+++ b/cpp/src/cubeSwapSet.cpp
@@ -108,7 +108,7 @@ void CubeStorage::commit() {
 
     auto& ctx = ThreadCache::get();
     assert(ctx.local_enabled);
-    assert(ctx.local_seek == m_alloc_seek);
+    assert(ctx.local_seek == write_fpos);
     ctx.local_enabled = false;
 
     m_file_head->writeAt(write_fpos, datasize, ctx.local.data());

From 713b0630ebbe65a16d893991a19795bc14168d70 Mon Sep 17 00:00:00 2001
From: JATothrim <jarmo.tiitto@gmail.com>
Date: Fri, 25 Aug 2023 23:23:58 +0300
Subject: [PATCH 41/42] CacheReader: Abstract CubeIterator interface

Surprisingly, N=14 is not possible with 16 GiB of memory
because at a certain point of progress the OS begins to swap
*something* and the process grinds to a halt.
This happens even if *there is free memory available*, so something is
going haywire.

I found out that the culprit may be that large (+3 GiB)
CacheReader memory mappings are being swapped out from memory.
The OS is trying to keep the previously accessed memory in system memory
to our detriment.
For -t K threads we only need to have K Cubes from the cache file
in memory at once.

The only way out of this problem is to not memory map the entire cache file
at once and instead read it Cube-by-Cube.
I think @nsch0e would have wanted to implement reading this way from
the beginning, but he was missing the `mapped::file::readAt()`
that works with absolute file offsets and can read the file in parallel.

Currently FlatCache and CacheReader use the same CubeIterator
and ShapeRange types.
This is a problem for implementing a better CubeIterator that reads
the Cubes one-by-one from a file, because any changes to
these would break FlatCache, which doesn't use cache files.

Start by adding abstract interfaces for CubeIterator and ShapeRange.

- ICubeIterator base class interface for Cube iterators
- CubeIterator the current implementation for ICubeIterator.
- CacheIterator type-erased proxy.
  This is needed to avoid disrupting the CubeIterator class users
  too much and make the type-erased iterator work in practice.
- IShapeRange base class interface.
- Make ICache::getCubesByShape() return reference to the IShapeRange.
- Adapt CubeIterator users to use CacheIterator instead.

Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/include/newCache.hpp | 165 +++++++++++++++++++++++++++++++--------
 cpp/src/cubes.cpp        |  18 ++---
 cpp/src/newCache.cpp     |  49 +++++++-----
 3 files changed, 172 insertions(+), 60 deletions(-)

diff --git a/cpp/include/newCache.hpp b/cpp/include/newCache.hpp
index b9705ce..c24cde7 100644
--- a/cpp/include/newCache.hpp
+++ b/cpp/include/newCache.hpp
@@ -34,7 +34,47 @@ struct ShapeEntry {
 };
 };  // namespace cacheformat
 
-class CubeIterator {
+/**
+ * newCache.hpp: provide two versions of the cache:
+ *
+ * - FlatCache implements "memory-only" cache and is constructed from Hashy.
+ *   It is needed for boot-strapping the cache files and computing
+ *   cubes without writing any data into disk.
+ *   FlatCache::getCubesByShape() return ShapeRange that points into the Cube data in memory.
+ *   ShapeRange then provides the Cube range as CubeIterator(s).
+ *
+ * - CacheReader implements the actual cache file system.
+ *   CacheReader::getCubesByShape() return FileShapeRange that
+ *   defines subset shape range from the cache file.
+ *   FileShapeRange then provides the Cube range as CubeFileIterator(s).
+ */
+class ICubeIterator {
+   public:
+    using iterator_category = std::forward_iterator_tag;
+    using difference_type = std::ptrdiff_t;
+    using value_type = Cube;
+    using pointer = Cube*;    // or also value_type*
+    using reference = Cube&;  // or also value_type&
+
+    virtual ~ICubeIterator(){};
+
+    virtual std::unique_ptr<ICubeIterator> clone() const = 0;
+
+    virtual const value_type operator*() const = 0;
+    virtual uint64_t seek() const = 0;
+    virtual ICubeIterator& operator++() = 0;
+    virtual ICubeIterator& operator+=(int incr) = 0;
+
+    friend bool operator==(const ICubeIterator& a, const ICubeIterator& b) { return a.seek() == b.seek(); };
+    friend bool operator<(const ICubeIterator& a, const ICubeIterator& b) { return a.seek() < b.seek(); };
+    friend bool operator>(const ICubeIterator& a, const ICubeIterator& b) { return a.seek() > b.seek(); };
+    friend bool operator!=(const ICubeIterator& a, const ICubeIterator& b) { return a.seek() != b.seek(); };
+};
+
+/**
+ * Iterator for Cubes stored in some memory area.
+ */
+class CubeIterator : public ICubeIterator {
    public:
     using iterator_category = std::forward_iterator_tag;
     using difference_type = std::ptrdiff_t;
@@ -48,19 +88,22 @@ class CubeIterator {
     // invalid iterator (can't deference)
     explicit CubeIterator() : n(0), m_ptr(nullptr) {}
 
+    std::unique_ptr<ICubeIterator> clone() const override { return std::make_unique<CubeIterator>(*this); }
+
     // derefecence
-    const value_type operator*() const { return Cube(m_ptr, n); }
+    const value_type operator*() const override { return Cube(m_ptr, n); }
+
     // pointer operator->() { return (pointer)m_ptr; }
 
-    const XYZ* data() const { return m_ptr; }
+    uint64_t seek() const override { return (uint64_t)m_ptr; }
 
     // Prefix increment
-    CubeIterator& operator++() {
+    ICubeIterator& operator++() override {
         m_ptr += n;
         return *this;
     }
 
-    CubeIterator& operator+=(int incr) {
+    ICubeIterator& operator+=(int incr) override {
         m_ptr += n * incr;
         return *this;
     }
@@ -82,19 +125,88 @@ class CubeIterator {
     const XYZ* m_ptr;
 };
 
-class ShapeRange {
+/**
+ * To avoid complicating the use of the ICubeIterator
+ * CacheIterator provides type-erased wrapper that can be copied.
+ */
+class CacheIterator {
+   public:
+    using iterator_category = std::forward_iterator_tag;
+    using difference_type = std::ptrdiff_t;
+    using value_type = Cube;
+    using pointer = Cube*;    // or also value_type*
+    using reference = Cube&;  // or also value_type&
+
+    CacheIterator() {}
+
+    template <typename Itr>
+    explicit CacheIterator(Itr&& init) : proxy(std::make_unique<std::decay_t<Itr>>(std::forward<Itr>(init))) {}
+
+    CacheIterator(const CacheIterator& copy) {
+        if (copy.proxy) {
+            proxy = copy.proxy->clone();
+        }
+    }
+    CacheIterator& operator=(const CacheIterator& x) {
+        CacheIterator tmp(x);
+        std::swap(proxy, tmp.proxy);
+        return *this;
+    }
+    CacheIterator(CacheIterator&& copy) =default;
+    CacheIterator& operator=(CacheIterator&& x) =default;
+
+    const value_type operator*() const { return **proxy; }
+
+    uint64_t seek() const { return proxy->seek(); }
+
+    CacheIterator& operator++() {
+        ++(*proxy);
+        return *this;
+    }
+    CacheIterator& operator+=(int incr) {
+        (*proxy) += incr;
+        return *this;
+    }
+
+    CacheIterator operator++(int) {
+        CacheIterator tmp = *this;
+        ++(*this);
+        return tmp;
+    }
+
+    friend bool operator==(const CacheIterator& a, const CacheIterator& b) { return a.seek() == b.seek(); };
+    friend bool operator<(const CacheIterator& a, const CacheIterator& b) { return a.seek() < b.seek(); };
+    friend bool operator>(const CacheIterator& a, const CacheIterator& b) { return a.seek() > b.seek(); };
+    friend bool operator!=(const CacheIterator& a, const CacheIterator& b) { return a.seek() != b.seek(); };
+
+   private:
+    std::unique_ptr<ICubeIterator> proxy;
+};
+
+class IShapeRange {
+   public:
+    IShapeRange(){};
+    virtual ~IShapeRange() {}
+
+    virtual CacheIterator begin() const = 0;
+    virtual CacheIterator end() const = 0;
+    virtual XYZ& shape() = 0;
+    virtual size_t size() const = 0;
+};
+
+class ShapeRange : public IShapeRange {
    public:
     ShapeRange(const XYZ* start, const XYZ* stop, uint64_t _cubeLen, XYZ _shape)
-        : b(_cubeLen, start), e(_cubeLen, stop), size_(std::distance(start, stop) / _cubeLen), shape_(_shape) {}
+        : b(CubeIterator(_cubeLen, start)), e(CubeIterator(_cubeLen, stop)), size_(std::distance(start, stop) / _cubeLen), shape_(_shape) {}
 
-    CubeIterator begin() { return b; }
-    CubeIterator end() { return e; }
+    CacheIterator begin() const override { return b; }
+    CacheIterator end() const override { return e; }
 
-    XYZ& shape() { return shape_; }
-    auto size() const { return size_; }
+    XYZ& shape() override { return shape_; }
+    size_t size() const override { return size_; }
 
    private:
-    CubeIterator b, e;
+    CacheIterator b, e;
     uint64_t size_;
     XYZ shape_;
 };
@@ -102,7 +214,7 @@ class ShapeRange {
 class ICache {
    public:
     virtual ~ICache(){};
-    virtual ShapeRange getCubesByShape(uint32_t i) = 0;
+    virtual IShapeRange& getCubesByShape(uint32_t i) = 0;
     virtual uint32_t numShapes() = 0;
     virtual size_t size() = 0;
 };
@@ -124,21 +236,8 @@ class CacheReader : public ICache {
     uint32_t numShapes() override { return header->numShapes; };
     operator bool() { return fileLoaded_; }
 
-    // Do begin() and end() make sense for CacheReader
-    // If the cache file provides data for more than single shape?
-    // The data might not even be mapped contiguously to save memory.
-    /*CubeIterator begin() {
-        const uint8_t* start = filePointer + shapes[0].offset;
-        return CubeIterator(header->n, (const XYZ*)start);
-    }
-
-    CubeIterator end() {
-        const uint8_t* stop = filePointer + shapes[0].offset + header->numPolycubes * header->n * XYZ_SIZE;
-        return CubeIterator(header->n, (const XYZ*)stop);
-    }*/
-
     // get shapes at index [0, numShapes()[
-    ShapeRange getCubesByShape(uint32_t i) override;
+    IShapeRange& getCubesByShape(uint32_t i) override;
 
    private:
     std::shared_ptr<mapped::file> file_;
@@ -146,6 +245,8 @@ class CacheReader : public ICache {
     std::unique_ptr<const mapped::array_region<cacheformat::ShapeEntry>> shapes_;
     std::unique_ptr<const mapped::array_region<XYZ>> xyz_;
 
+    std::vector<ShapeRange> shapeRanges;
+
     std::string path_;
     bool fileLoaded_;
     const cacheformat::Header dummyHeader;
@@ -167,16 +268,18 @@ class FlatCache : public ICache {
         for (auto& [shape, set] : hashes) {
             auto begin = allXYZs.data() + allXYZs.size();
             for (auto& subset : set) {
-                for (auto& cubeptr : subset)
-                    cubeptr.copyout(subset.storage(), n, std::back_inserter(allXYZs));
+                for (auto& cubeptr : subset) cubeptr.copyout(subset.storage(), n, std::back_inserter(allXYZs));
             }
             auto end = allXYZs.data() + allXYZs.size();
             // std::printf("  SR %p %p\n", (void*)begin, (void*)end);
             shapes.emplace_back(begin, end, n, shape);
         }
+
+        // Add dummy shape range at back:
+        shapes.emplace_back(nullptr, nullptr, n, XYZ(0, 0, 0));
     }
-    ShapeRange getCubesByShape(uint32_t i) override {
-        if (i >= shapes.size()) return ShapeRange{nullptr, nullptr, 0, XYZ(0, 0, 0)};
+    IShapeRange& getCubesByShape(uint32_t i) override {
+        if (i >= shapes.size() - 1) return shapes.back();
         return shapes[i];
     };
     uint32_t numShapes() override { return shapes.size(); };
diff --git a/cpp/src/cubes.cpp b/cpp/src/cubes.cpp
index 89b4e12..ec34936 100644
--- a/cpp/src/cubes.cpp
+++ b/cpp/src/cubes.cpp
@@ -22,27 +22,27 @@ struct Workset {
     std::mutex mu;
 
     CacheReader cr;
-    CubeIterator _begin_total;
-    CubeIterator _begin;
-    CubeIterator _end;
+    CacheIterator _begin_total;
+    CacheIterator _begin;
+    CacheIterator _end;
     Hashy &hashes;
     XYZ targetShape, shape, expandDim;
     bool notSameShape;
     Workset(Hashy &hashes, XYZ targetShape, XYZ shape, XYZ expandDim, bool notSameShape)
         : hashes(hashes), targetShape(targetShape), shape(shape), expandDim(expandDim), notSameShape(notSameShape) {}
 
-    void setRange(ShapeRange &data) {
+    void setRange(IShapeRange &data) {
         _begin_total = data.begin();
         _begin = data.begin();
         _end = data.end();
     }
 
     struct Subset {
-        CubeIterator _begin, _end;
+        CacheIterator _begin, _end;
         bool valid;
         float percent;
-        auto begin() { return _begin; }
-        auto end() { return _end; }
+        CacheIterator begin() { return _begin; }
+        CacheIterator end() { return _end; }
     };
 
     Subset getPart() {
@@ -50,7 +50,7 @@ struct Workset {
         auto a = _begin;
         _begin += 500;
         if (_begin > _end) _begin = _end;
-        return {a, _begin, a < _end, 100 * float(std::distance(_begin_total.data(), a.data())) / std::distance(_begin_total.data(), _end.data())};
+        return {a, _begin, a < _end, 100 * float(a.seek() - _begin_total.seek() + 1) / (_end.seek() - _begin_total.seek() + 1)};
     }
 
     void expand(const Cube &c) {
@@ -275,7 +275,7 @@ FlatCache gen(int n, int threads, bool use_cache, bool write_cache, bool split_c
                 base = &ws->cr;
                 // cr.printHeader();
             }
-            auto s = base->getCubesByShape(sid);
+            auto& s = base->getCubesByShape(sid);
             if (shape != s.shape()) {
                 std::printf("ERROR caches shape does not match expected shape!\n");
                 exit(-1);
diff --git a/cpp/src/newCache.cpp b/cpp/src/newCache.cpp
index 8ae2f5b..e2d7800 100644
--- a/cpp/src/newCache.cpp
+++ b/cpp/src/newCache.cpp
@@ -51,48 +51,57 @@ int CacheReader::loadFile(const std::string path) {
     shapes_ = std::make_unique<const mapped::array_region<cacheformat::ShapeEntry>>(file_, header_->getEndSeek(), (*header_)->numShapes);
     shapes = shapes_->get();
 
+    // Initialize ShapeRanges
     size_t datasize = 0;
     for (unsigned int i = 0; i < header->numShapes; ++i) {
         datasize += shapes[i].size;
     }
 
-    // map rest of the file as XYZ data:
     if (file_->size() != shapes_->getEndSeek() + datasize) {
         std::printf("warn: file size does not match expected value\n");
     }
+
     xyz_ = std::make_unique<const mapped::array_region<XYZ>>(file_, shapes_->getEndSeek(), datasize);
 
+    // Initialize shapeRanges array:
+    size_t offset = 0;
+    for (unsigned int i = 0; i < header->numShapes; ++i) {
+        if (shapes[i].size) {
+            auto index = offset / cacheformat::XYZ_SIZE;
+            auto num_xyz = shapes[i].size / cacheformat::XYZ_SIZE;
+            auto start = xyz_->get() + index;
+            auto end = xyz_->get() + index + num_xyz;
+
+            shapeRanges.emplace_back(start, end, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2));
+        } else {
+            // table entry has no data.
+            // shapes[i].offset may have bogus value.
+            shapeRanges.emplace_back(nullptr, nullptr, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2));
+        }
+
+        offset += shapes[i].size;
+    }
+
+    // Add dummy entry at back:
+    shapeRanges.emplace_back(nullptr, nullptr, header->n, XYZ(0, 0, 0));
+
     fileLoaded_ = true;
 
     return 0;
 }
 
-ShapeRange CacheReader::getCubesByShape(uint32_t i) {
+IShapeRange &CacheReader::getCubesByShape(uint32_t i) {
     if (i >= header->numShapes) {
-        return ShapeRange{nullptr, nullptr, 0, XYZ(0, 0, 0)};
-    }
-    if (shapes[i].size <= 0) {
-        return ShapeRange{nullptr, nullptr, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2)};
+        return shapeRanges.back();
     }
-    // get section start
-    // note: shapes[i].offset may have bogus offset
-    // if any earlier shape table entry was empty before i
-    // so we ignore the offset here.
-    size_t offset = 0;
-    for (unsigned int k = 0; k < i; ++k) {
-        offset += shapes[k].size;
-    }
-    auto index = offset / cacheformat::XYZ_SIZE;
-    auto num_xyz = shapes[i].size / cacheformat::XYZ_SIZE;
-    // pointers to Cube data:
-    auto start = xyz_->get() + index;
-    auto end = xyz_->get() + index + num_xyz;
-    return ShapeRange{start, end, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2)};
+
+    return shapeRanges[i];
 }
 
 void CacheReader::unload() {
     // unload file from memory
     if (fileLoaded_) {
+        shapeRanges.clear();
         xyz_.reset();
         shapes_.reset();
         header_.reset();

From 37d51e5ee567a8c9a6cf36c0ef441aa7f342205e Mon Sep 17 00:00:00 2001
From: JATothrim <jarmo.tiitto@gmail.com>
Date: Sat, 26 Aug 2023 03:36:26 +0300
Subject: [PATCH 42/42] CacheReader: Implement cache file reading one Cube at
 time

- Remove CacheReader XYZ mapping.
- Add CubeReadIterator that reads Cubes one at time.
- FileShapeRange takes the cache file and offsets into the file
- Update CacheReader::loadFile() to initialize array of
  FileShapeRange from the cache file.

Result is celebration hooray for computing N=14 first time
with less than 9 GiB of RSS:

```process output shape  99/101 [ 3  5  5]
  shape 2 5 5
  shape 3 4 5
  num: 588828
saved ./cache/cubes_14_3-5-5.bin, took 0.01 s
process output shape 100/101 [ 4  4  4]
  shape 3 4 4
  shape 4 4 4
  num: 3341560
saved ./cache/cubes_14_4-4-4.bin, took 0.11 s
process output shape 101/101 [ 4  4  5]
  shape 3 4 5
  shape 4 4 4
  num: 752858
saved ./cache/cubes_14_4-4-5.bin, took 0.02 s
took 7231.83 s
num total cubes: 1039496297```

My NVMe disk was not particularly happy with
`output shape  80/101 [ 2  3  4]`, which produced a +8 GiB file at the end.
The disk throttled badly after reaching 60*C...
But it did complete eventually at a reasonable pace, and
memory usage dropped below 7 GiB for the rest of the run.

N=15 will require more tuning of the CubeStorage read-cache and
a more parallel file system.
btrfs does not look to be very good at this job,
as writing the storage files in parallel reduces the program to
near single-threaded speed.

Signed-off-by: JATothrim <jarmo.tiitto@gmail.com>
---
 cpp/include/newCache.hpp | 78 ++++++++++++++++++++++++++++++++++++++--
 cpp/src/newCache.cpp     | 26 +++++++-------
 2 files changed, 88 insertions(+), 16 deletions(-)

diff --git a/cpp/include/newCache.hpp b/cpp/include/newCache.hpp
index c24cde7..20fd660 100644
--- a/cpp/include/newCache.hpp
+++ b/cpp/include/newCache.hpp
@@ -8,6 +8,7 @@
 #include <mutex>
 #include <string>
 #include <thread>
+#include <memory>
 
 #include "cube.hpp"
 #include "hashes.hpp"
@@ -125,6 +126,61 @@ class CubeIterator : public ICubeIterator {
     const XYZ* m_ptr;
 };
 
+// Forward iterator over the cubes of a cache file: holds a file offset and
+// produces a Cube by value (via read()) each time it is dereferenced.
+class CubeReadIterator : public ICubeIterator {
+   public:
+    using iterator_category = std::forward_iterator_tag;
+    using difference_type = std::ptrdiff_t;
+    using value_type = Cube;
+    using pointer = Cube*;    // or also value_type*
+    using reference = Cube&;  // or also value_type&
+
+    // Iterator at `offset` within `file`; every cube spans `_n` XYZ entries.
+    CubeReadIterator(std::shared_ptr<mapped::file> file, uint32_t _n, mapped::seekoff_t offset) : n(_n), m_seek(offset), m_file(file) {}
+
+    // Invalid iterator: no file is attached, so it must not be dereferenced.
+    explicit CubeReadIterator() : n(0), m_seek(-1) {}
+
+    std::unique_ptr<ICubeIterator> clone() const override { return std::make_unique<CubeReadIterator>(*this); }
+
+    // Dereference: returns the cube at the current offset by value.
+    const value_type operator*() const override { return read(); }
+    uint64_t seek() const override { return (uint64_t)m_seek; }  // current offset
+
+    // Prefix increment: step over one cube.
+    ICubeIterator& operator++() override {
+        m_seek += n * sizeof(XYZ);
+        return *this;
+    }
+
+    // Skip `incr` cubes; widened first so `n * incr` is not computed in 32 bits.
+    ICubeIterator& operator+=(int incr) override {
+        m_seek += static_cast<mapped::seekoff_t>(incr) * n * sizeof(XYZ);
+        return *this;
+    }
+
+    // Postfix increment: returns a copy of the iterator taken before stepping.
+    CubeReadIterator operator++(int) {
+        CubeReadIterator tmp = *this;
+        ++(*this);
+        return tmp;
+    }
+
+    friend bool operator==(const CubeReadIterator& a, const CubeReadIterator& b) { return a.m_seek == b.m_seek; }
+    friend bool operator<(const CubeReadIterator& a, const CubeReadIterator& b) { return a.m_seek < b.m_seek; }
+    friend bool operator>(const CubeReadIterator& a, const CubeReadIterator& b) { return a.m_seek > b.m_seek; }
+    friend bool operator!=(const CubeReadIterator& a, const CubeReadIterator& b) { return a.m_seek != b.m_seek; }
+
+   private:
+    uint32_t n;
+    mapped::seekoff_t m_seek;
+    std::shared_ptr<mapped::file> m_file;
+
+    // Dereferencing is implemented by read(), defined out of line.
+    Cube read() const;
+};
+
 /**
  * To avoid complicating the use of the ICubeIterator
  * CacheIterator provides type-erased wrapper that can be copied.
@@ -211,6 +267,25 @@ class ShapeRange : public IShapeRange {
     XYZ shape_;
 };
 
+// Range over the cubes of one shape, stored between offsets start and stop of a cache file.
+class FileShapeRange : public IShapeRange {
+   public:
+    // size_ counts cubes: offset span / per-cube stride (the step CubeReadIterator uses).
+    FileShapeRange(std::shared_ptr<mapped::file> file, mapped::seekoff_t start, mapped::seekoff_t stop, uint64_t _cubeLen, XYZ _shape)
+        : b(CubeReadIterator(file, _cubeLen, start)), e(CubeReadIterator(file, _cubeLen, stop)),
+          size_((stop - start) / (_cubeLen * sizeof(XYZ))), shape_(_shape) {}
+
+    CacheIterator begin() const override { return b; }
+    CacheIterator end() const override { return e; }
+    XYZ& shape() override { return shape_; }
+    size_t size() const override { return size_; }
+
+   private:
+    CacheIterator b, e;
+    uint64_t size_;
+    XYZ shape_;
+};
+
 class ICache {
    public:
     virtual ~ICache(){};
@@ -243,9 +318,8 @@ class CacheReader : public ICache {
     std::shared_ptr<mapped::file> file_;
     std::unique_ptr<const mapped::struct_region<cacheformat::Header>> header_;
     std::unique_ptr<const mapped::array_region<cacheformat::ShapeEntry>> shapes_;
-    std::unique_ptr<const mapped::array_region<XYZ>> xyz_;
 
-    std::vector<ShapeRange> shapeRanges;
+    std::vector<FileShapeRange> shapeRanges;
 
     std::string path_;
     bool fileLoaded_;
diff --git a/cpp/src/newCache.cpp b/cpp/src/newCache.cpp
index e2d7800..9e0c54e 100644
--- a/cpp/src/newCache.cpp
+++ b/cpp/src/newCache.cpp
@@ -61,35 +61,34 @@ int CacheReader::loadFile(const std::string path) {
         std::printf("warn: file size does not match expected value\n");
     }
 
-    xyz_ = std::make_unique<const mapped::array_region<XYZ>>(file_, shapes_->getEndSeek(), datasize);
-
     // Initialize shapeRanges array:
-    size_t offset = 0;
     for (unsigned int i = 0; i < header->numShapes; ++i) {
         if (shapes[i].size) {
-            auto index = offset / cacheformat::XYZ_SIZE;
-            auto num_xyz = shapes[i].size / cacheformat::XYZ_SIZE;
-            auto start = xyz_->get() + index;
-            auto end = xyz_->get() + index + num_xyz;
-
-            shapeRanges.emplace_back(start, end, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2));
+            auto start = shapes[i].offset;
+            auto end = start + shapes[i].size;
+            shapeRanges.emplace_back(file_, start, end, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2));
         } else {
             // table entry has no data.
             // shapes[i].offset may have bogus value.
-            shapeRanges.emplace_back(nullptr, nullptr, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2));
+            shapeRanges.emplace_back(file_, -1, -1, header->n, XYZ(shapes[i].dim0, shapes[i].dim1, shapes[i].dim2));
         }
-
-        offset += shapes[i].size;
     }
 
     // Add dummy entry at back:
-    shapeRanges.emplace_back(nullptr, nullptr, header->n, XYZ(0, 0, 0));
+    shapeRanges.emplace_back(file_, -1, -1, header->n, XYZ(0, 0, 0));
 
     fileLoaded_ = true;
 
     return 0;
 }
 
+// Fills a fresh Cube via readAt(m_seek, n * sizeof(XYZ), ...) and returns it by value.
+Cube CubeReadIterator::read() const {
+    Cube cube(n);
+    m_file->readAt(m_seek, n * sizeof(XYZ), cube.data());
+    return cube;
+}
+
 IShapeRange &CacheReader::getCubesByShape(uint32_t i) {
     if (i >= header->numShapes) {
         return shapeRanges.back();
@@ -102,7 +101,6 @@ void CacheReader::unload() {
     // unload file from memory
     if (fileLoaded_) {
         shapeRanges.clear();
-        xyz_.reset();
         shapes_.reset();
         header_.reset();
         file_.reset();