diff --git a/notebooks/02_benchmark_parsing.ipynb b/notebooks/02_benchmark_parsing.ipynb index 5fd15b6..039b983 100644 --- a/notebooks/02_benchmark_parsing.ipynb +++ b/notebooks/02_benchmark_parsing.ipynb @@ -6,7 +6,7 @@ "source": [ "# Benchmark: np.memmap vs struct.unpack\n", "\n", - "fastEIT parses Dräger `.bin` files using `np.memmap`, the OS maps the file\n", + "fastEIT parses Dr\u00e4ger `.bin` files using `np.memmap`, the OS maps the file\n", "directly into virtual memory as a numpy structured array, with no Python loop\n", "and no intermediate copy.\n", "\n", @@ -18,7 +18,7 @@ "every frame. Only the parsing strategy differs.\n", "\n", "**File sizes tested:** 10 MB - 50 MB - 100 MB - 250 MB \n", - "**Frame format:** Dräger BASE (4358 bytes/frame, 1024 float32 pixels per frame)" + "**Frame format:** Dr\u00e4ger BASE (4358 bytes/frame, 1024 float32 pixels per frame)" ] }, { @@ -36,13 +36,8 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "\n", - "# Import the actual dtype and sentinel utilities from fastEIT\n", - "from fasteit.parsers.draeger.bin.draeger_dtypes import FRAME_BASE_DTYPE\n", - "from fasteit.parsers.draeger.bin.bin_utils import (\n", - " replace_no_data_sentinels,\n", - " _BIT_SENTINELS,\n", - " _FLOAT_SENTINELS,\n", - ")" + "# Import the actual dtype from fastEIT\n", + "from fasteit.parsers.draeger.bin.draeger_dtypes import FRAME_BASE_DTYPE" ] }, { @@ -97,16 +92,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "Generated 10 MB target → 10.00 MB, 2406 frames\n", - "Generated 50 MB target → 50.00 MB, 12030 frames\n", - "Generated 100 MB target → 100.00 MB, 24060 frames\n", - "Generated 250 MB target → 250.00 MB, 60152 frames\n" + "Generated 10 MB target \u2192 10.00 MB, 2406 frames\n", + "Generated 50 MB target \u2192 50.00 MB, 12030 frames\n", + "Generated 100 MB target \u2192 100.00 MB, 24060 frames\n", + "Generated 250 MB target \u2192 250.00 MB, 60152 frames\n" ] } ], "source": [ "def make_synthetic_bin(path: Path, 
n_frames: int) -> None:\n", - " \"\"\"Write n_frames of random data in Dräger BASE dtype format.\"\"\"\n", + " \"\"\"Write n_frames of random data in Dr\u00e4ger BASE dtype format.\"\"\"\n", " rng = np.random.default_rng(seed=42) # fixed seed = reproducible\n", " frames = np.zeros(n_frames, dtype=FRAME_BASE_DTYPE)\n", " frames[\"ts\"] = rng.uniform(0.5, 0.9, n_frames)\n", @@ -121,7 +116,7 @@ " p = tmpdir / f\"synthetic_{mb}mb.bin\"\n", " make_synthetic_bin(p, n)\n", " bin_files[mb] = p\n", - " print(f\"Generated {mb:>4} MB target → {p.stat().st_size / 1024**2:.2f} MB, {n} frames\")" + " print(f\"Generated {mb:>4} MB target \u2192 {p.stat().st_size / 1024**2:.2f} MB, {n} frames\")" ] }, { @@ -150,10 +145,10 @@ "source": [ "def parse_memmap(path: Path) -> np.ndarray:\n", " n_frames = path.stat().st_size // FRAME_SIZE\n", - " # Map file as structured array — zero-copy, OS handles paging\n", + " # Map file as structured array \u2014 zero-copy, OS handles paging\n", + " # Sentinel values are preserved: replacement is deferred to preprocessing layer\n", " mapped = np.memmap(path, dtype=FRAME_BASE_DTYPE, mode=\"r\", shape=(n_frames,))\n", - " pixels = replace_no_data_sentinels(mapped[\"pixels\"], _FLOAT_SENTINELS, _BIT_SENTINELS)\n", - " return pixels\n", + " return mapped[\"pixels\"]\n", "\n", "\n", "_STRUCT_FMT = \"" ] @@ -296,7 +291,7 @@ "ram_st = [r[\"ram_struct\"] for r in results]\n", "\n", "fig, axes = plt.subplots(1, 2, figsize=(12, 5))\n", - "fig.suptitle(\"fastEIT parsing benchmark — np.memmap vs struct.unpack loop\", fontsize=13)\n", + "fig.suptitle(\"fastEIT parsing benchmark \u2014 np.memmap vs struct.unpack loop\", fontsize=13)\n", "\n", "# Plot 1: parse time (log scale)\n", "ax = axes[0]\n", @@ -326,25 +321,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## Interpretation\n", - "\n", - "memmap is faster and uses less RAM across all file sizes tested.\n", - "\n", - "At **small files** (10–50 MB), the OS page cache likely holds the file 
after\n", - "generation, so memmap reads are served from memory without disk I/O.\n", - "struct.unpack is bottlenecked by Python overhead: one function call and one\n", - "tuple allocation per frame, regardless of file size.\n", - "\n", - "At **large files** (100–250 MB), both approaches become **I/O-bound**, reading\n", - "from disk dominates the total time. struct's Python overhead stays constant in\n", - "absolute terms but shrinks as a fraction of the total, so the speedup ratio\n", - "drops (from ~21× at 50 MB to ~9× at 250 MB). Both are limited by the same disk bandwidth floor.\n", - "\n", - "struct peak RAM grows with file size because it loads the entire file into a\n", - "`bytes` buffer before parsing. memmap peak RAM grows only with the pixel output\n", - "array. Roughly 60% of struct's peak at equivalent file size." - ] + "source": "## Interpretation\n\nmemmap is faster and uses less RAM across all file sizes tested.\n\n**Why memmap is fast:** `np.memmap` sets up a virtual address space mapping \u2014 the OS loads file pages on demand. `parse_memmap()` returns a lazy view of the pixel field: no data is actually read from disk until accessed; the mapping is read-only.\n\n**Why struct.unpack is slower:** `path.read_bytes()` loads the entire file into a Python `bytes` buffer in RAM before any parsing begins. Every frame then requires one `struct.unpack_from` call and one tuple allocation.\n\nstruct peak RAM grows linearly with file size: one full `bytes` buffer + one pixel array.\nmemmap peak RAM is near zero: only the virtual mapping is set up, no pixel array is allocated in RAM.\n\nThis makes it possible to fully separate the parsing layer from the preprocessing layer. The computational cost stays entirely in the preprocessing layer, avoiding extra overhead when parsing more than one file. You pay the computational cost only for the file you intend to work on, and only when you start working on it." 
} ], "metadata": { @@ -360,4 +337,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/src/fasteit/parsers/draeger/bin/bin_parser.py b/src/fasteit/parsers/draeger/bin/bin_parser.py index 9f9ba7b..baa49d7 100644 --- a/src/fasteit/parsers/draeger/bin/bin_parser.py +++ b/src/fasteit/parsers/draeger/bin/bin_parser.py @@ -12,11 +12,8 @@ from fasteit.parsers.errors import AmbiguousFormatError, UnsupportedFrameSizeError from .bin_utils import ( - _BIT_SENTINELS, - _FLOAT_SENTINELS, estimate_sampling_frequency_hz, normalize_frame_slice, - replace_no_data_sentinels, ) # Default sampling frequency used when fs cannot be estimated from timestamps. @@ -43,10 +40,6 @@ class DragerBinParser(BaseParser): https://github.com/EIT-ALIVE/eitprocessing """ - def __init__(self) -> None: - self._float_sentinels = _FLOAT_SENTINELS - self._bit_sentinels = _BIT_SENTINELS - def validate(self, path: Path) -> bool: """Return True if file exists, is non-empty, and has a known frame size.""" path = Path(path) @@ -119,35 +112,17 @@ def parse( else: warnings_list = [] - # ── 6. Copy memmap to writable array ────────────────────────────────── - frames = mapped_frames.copy() - - # ── 7. Sanitize pixels: vectorized over all frames at once ──────────── - frames["pixels"] = replace_no_data_sentinels( - mapped_frames["pixels"], - self._float_sentinels, - self._bit_sentinels, - ) - - # ── 8. Sanitize Medibus data if present (EXT format only) ───────────── - if spec.medibus_fields is not None: - frames["medibus_data"] = replace_no_data_sentinels( - frames["medibus_data"], - self._float_sentinels, - self._bit_sentinels, - ) - - # ── 9. Build aux_signals dict: {signal_name → array shape (N,)} ────── + # ── 6. 
Build aux_signals dict: {signal_name → array shape (N,)} ────── aux_signals = None if spec.medibus_fields is not None: aux_signals = { - field_name: frames["medibus_data"][:, field_idx] + field_name: mapped_frames["medibus_data"][:, field_idx] for field_idx, field_name in enumerate(spec.medibus_fields) } # ── 10. Assemble and return result ──────────────────────────────────── result = ReconstructedFrameData( - frames=frames, + frames=mapped_frames, aux_signals=aux_signals, fs=fs, filename=str(path), diff --git a/tests/test_bin_parser.py b/tests/test_bin_parser.py index 1d7d956..dfb8a88 100644 --- a/tests/test_bin_parser.py +++ b/tests/test_bin_parser.py @@ -125,7 +125,12 @@ def test_parse_slice_max_frames(tmp_path): # ── float sentinel replacement ──────────────────────────────────────────────── -def test_parse_float_sentinel_replaced_with_nan(tmp_path): +def test_parse_float_sentinel_preserved_in_raw_data(tmp_path): + """Parser returns raw memmap — sentinel values are NOT replaced. + + Sentinel replacement is deferred to the preprocessing layer so that + the memmap is never copied to RAM during parsing (lazy loading). + """ frames = np.zeros(3, dtype=FRAME_BASE_DTYPE) dt_day = 1.0 / (50.0 * 86400.0) frames["ts"] = np.arange(3) * dt_day @@ -134,12 +139,10 @@ def test_parse_float_sentinel_replaced_with_nan(tmp_path): frames.tofile(path) data = DragerBinParser().parse(path) - assert np.isnan(data.pixels[1, 0, 0]) + assert data.pixels[1, 0, 0] == -1000.0 # ── Round-trip value correctness (Tasks 1.5.3 / 1.5.4 / 1.5.5 / 1.5.10) ───── -# Uses the shared bin_3frames fixture from conftest.py. -# Frame i has all pixels = float(i+1), timestamps at 50 Hz fraction-of-day. _DT_DAY = 1.0 / (50.0 * 86400.0)