From d7d7361b452cd430a701fb691f60f9fdbafdb159 Mon Sep 17 00:00:00 2001 From: Jordao Bragantini Date: Mon, 15 Jun 2026 13:20:41 -0700 Subject: [PATCH 1/2] Support Zarr and OME-Zarr inputs in the track CLI The track CLI loader only handled single files and folders of frames via the optional bioio extra. Broaden hoct._io.load_array to read three input families with always-available dependencies: - Single TIFF, read with tifffile (first channel, length-1 axes collapsed). - Simple Zarr arrays, with length-1 axes collapsed. - OME-Zarr (t, c, z, y, x) multiscale groups: highest-resolution level, first channel kept, length-1 axes collapsed (c=1/z=1 -> (T, Y, X)). bioio is now only a fallback for other single-file formats. The track command's layout check uses is_frame_folder so a .zarr store (a directory) is treated as a single input rather than a frame folder. tifffile and zarr are promoted to explicit dependencies since they are now imported directly. Adds unit tests for the new loader paths and an end-to-end track test on OME-Zarr inputs. --- pyproject.toml | 2 + src/hoct/_io.py | 176 ++++++++++++++++++++++-------- src/hoct/_tests/test_cli_track.py | 50 ++++++++- src/hoct/_tests/test_io.py | 170 ++++++++++++++++++++++++++++- src/hoct/cli.py | 20 ++-- 5 files changed, 357 insertions(+), 61 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d2bcee3..d760265 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,8 @@ dependencies = [ "pooch>=1.7.0", "scipy>=1.11.0", "scikit-image>=0.21.0", + "tifffile>=2023.1.0", + "zarr>=3.0", "h5py>=3.9.0", "toolz>=0.11.0", "dask[array]>=2022.3.0", diff --git a/src/hoct/_io.py b/src/hoct/_io.py index 7cd37f0..0d03633 100644 --- a/src/hoct/_io.py +++ b/src/hoct/_io.py @@ -1,79 +1,161 @@ """Image loading helpers for the high-level CLI. -Provides a single :func:`load_array` helper that accepts either a file or a -folder of frames and returns a ``(T, Y, X)`` or ``(T, Z, Y, X)`` numpy array +:func:`load_array` accepts a single file, a Zarr store, or a folder of +single-frame files, and returns a ``(T, Y, X)`` or ``(T, Z, Y, X)`` numpy array suitable for :func:`hoct.predict`. -Image reading uses ``bioio`` (optional dependency, ``hoct[bioio]``). +Supported inputs +---------------- +* **Single TIFF** — a multi-page ``(T, [Z,] Y, X)`` stack, read with ``tifffile``. +* **Zarr array** ("simple zarr") — any layout; length-1 axes are collapsed. +* **OME-Zarr** — a ``(t, c, z, y, x)`` multiscale group; the highest-resolution + level is read, the first channel is kept, and length-1 axes are collapsed. +* **Folder of frames** — one single-timepoint file per timepoint, sorted + alphabetically and stacked along a new leading T axis. + +TIFF and Zarr are read with always-available dependencies. Any other +single-file format falls back to ``bioio`` (optional extra ``hoct[bioio]``). """ from pathlib import Path import numpy as np +_TIFF_SUFFIXES = {".tif", ".tiff"} +_ZARR_MARKERS = ("zarr.json", ".zarray", ".zgroup") +# Channel ("c") and RGB-sample ("s") axes are reduced to their first index. +_CHANNEL_AXES = ("c", "s") -def _require_bioio(): - """Import ``bioio.BioImage`` lazily and surface a clear install hint on failure.""" - try: - from bioio import BioImage # type: ignore[import-not-found] - except ImportError as exc: # pragma: no cover - exercised only when extra missing - raise ImportError( - "Reading raw images requires the 'bioio' extra. Install it with: pip install 'hoct[bioio]'" - ) from exc - return BioImage +def _is_zarr(path: Path) -> bool: + """Return True if ``path`` is a Zarr store (``.zarr`` suffix or zarr metadata).""" + if path.suffix == ".zarr": + return True + if path.is_dir(): + return any((path / marker).exists() for marker in _ZARR_MARKERS) + return False -def _is_image_file(path: Path) -> bool: - """Skip dotfiles, directories, and obvious non-image siblings.""" - if not path.is_file(): - return False - if path.name.startswith("."): - return False - return True +def is_frame_folder(path: Path) -> bool: + """Return True if ``path`` is a folder of single-frame files (not a Zarr store).""" + path = Path(path) + return path.is_dir() and not _is_zarr(path) -def _load_timeseries(path: Path) -> np.ndarray: - """Load a single file as a (T, [Z,] Y, X) array via bioio. - Reads the first channel (``C=0``) and drops the channel axis. If the data - has a singleton Z dimension, it is squeezed to yield a 2D+t volume. - """ - BioImage = _require_bioio() - img = BioImage(str(path)) - data = np.asarray(img.get_image_data("TZYX", C=0)) # (T, Z, Y, X) - if data.shape[1] == 1: - data = data[:, 0] # (T, Y, X) +def _collapse_singleton_axes(data: np.ndarray) -> np.ndarray: + """Drop length-1 axes, always keeping the trailing two (Y, X).""" + drop = tuple(axis for axis in range(data.ndim - 2) if data.shape[axis] == 1) + return np.squeeze(data, axis=drop) if drop else data + + +def _select_first_channel(data: np.ndarray, axes: str) -> np.ndarray: + """Keep only the first index of any channel/sample axis named in ``axes``.""" + names = [a.lower() for a in axes] + for channel in _CHANNEL_AXES: + if channel in names: + index = names.index(channel) + data = data.take(0, axis=index) + names.pop(index) return data -def _load_frame(path: Path) -> np.ndarray: - """Load a single file as one frame and return a ([Z,] Y, X) array. +def _reduce_to_movie(data: np.ndarray, axes: str) -> np.ndarray: + """Reduce a labelled array to ``(T, [Z,] Y, X)``. - Used for folder-of-frames inputs where each file represents one timepoint. - Reads ``C=0, T=0`` and squeezes a singleton Z to produce 2D frames. + Keeps the first channel of any channel axis, then collapses length-1 axes. + ``axes`` is a per-dimension code such as ``"TCZYX"`` or ``"TYX"``. """ - BioImage = _require_bioio() + data = _select_first_channel(data, axes) + return _collapse_singleton_axes(data) + + +def _load_tiff(path: Path) -> np.ndarray: + """Read a single TIFF stack and reduce it to ``(T, [Z,] Y, X)``.""" + import tifffile + + with tifffile.TiffFile(str(path)) as tif: + series = tif.series[0] + data = np.asarray(series.asarray()) + axes = series.axes + return _reduce_to_movie(data, axes) + + +def _ome_multiscales(attrs: dict) -> list | None: + """Return the OME-NGFF ``multiscales`` list, supporting v0.4 and v0.5 layouts.""" + if "multiscales" in attrs: + return attrs["multiscales"] + ome = attrs.get("ome") + if isinstance(ome, dict) and "multiscales" in ome: + return ome["multiscales"] + return None + + +def _load_ome_zarr(group, multiscales: list) -> np.ndarray: + """Read the highest-resolution level of an OME-Zarr group as ``(T, [Z,] Y, X)``.""" + metadata = multiscales[0] + axes = "".join(axis["name"] if isinstance(axis, dict) else axis for axis in metadata["axes"]) + # OME datasets are ordered from highest to lowest resolution. + dataset_path = metadata["datasets"][0]["path"] + data = np.asarray(group[dataset_path][:]) + return _reduce_to_movie(data, axes) + + +def _load_zarr(path: Path) -> np.ndarray: + """Read a Zarr store: an OME-Zarr group or a plain ("simple") array.""" + import zarr + + node = zarr.open(str(path), mode="r") + if isinstance(node, zarr.Group): + multiscales = _ome_multiscales(dict(node.attrs)) + if multiscales is None: + raise ValueError(f"Zarr group at {path} has no OME 'multiscales' metadata.") + return _load_ome_zarr(node, multiscales) + return _collapse_singleton_axes(np.asarray(node[:])) + + +def _load_with_bioio(path: Path) -> np.ndarray: + """Fallback reader for single files in non-TIFF, non-Zarr formats.""" + try: + from bioio import BioImage # type: ignore[import-not-found] + except ImportError as exc: # pragma: no cover - exercised only when extra missing + raise ImportError( + f"Reading '{path.suffix}' files requires the 'bioio' extra. Install it with: pip install 'hoct[bioio]'" + ) from exc img = BioImage(str(path)) - data = np.asarray(img.get_image_data("ZYX", C=0, T=0)) # (Z, Y, X) - if data.shape[0] == 1: - data = data[0] # (Y, X) - return data + data = np.asarray(img.get_image_data("TZYX", C=0)) # (T, Z, Y, X) + return _collapse_singleton_axes(data) + + +def _is_image_file(path: Path) -> bool: + """Skip dotfiles and directories when listing a frame folder.""" + return path.is_file() and not path.name.startswith(".") + + +def _read_file(path: Path) -> np.ndarray: + """Read a single file (Zarr, TIFF, or via bioio) and collapse length-1 axes.""" + if _is_zarr(path): + return _load_zarr(path) + if path.suffix.lower() in _TIFF_SUFFIXES: + return _load_tiff(path) + return _load_with_bioio(path) def load_array(path: Path) -> np.ndarray: - """Load image data from a file or folder. + """Load image data from a file, a Zarr store, or a folder of frames. Conventions ----------- - * **File**: the entire time series lives in one file. Returns - ``(T, Y, X)`` for 2D+t or ``(T, Z, Y, X)`` for 3D+t. - * **Folder**: each file is one timepoint, sorted alphabetically. Frames are - stacked along a new T axis. Returns the same shapes as above. + * **Single file / Zarr store**: holds the entire time series. Returns + ``(T, Y, X)`` for 2D+t or ``(T, Z, Y, X)`` for 3D+t. Length-1 axes (e.g. a + singleton channel or Z) are collapsed. + * **Folder**: each file is one timepoint, sorted alphabetically and stacked + along a new T axis. Returns the same shapes as above. Parameters ---------- path - Path to an image file or to a folder of single-frame image files. + Path to an image file, a ``.zarr`` store, or a folder of single-frame + image files. Returns ------- @@ -84,14 +166,14 @@ def load_array(path: Path) -> np.ndarray: if not path.exists(): raise FileNotFoundError(path) - if path.is_dir(): + if is_frame_folder(path): files = sorted(p for p in path.iterdir() if _is_image_file(p)) if not files: raise ValueError(f"No image files found in folder: {path}") - frames = [_load_frame(p) for p in files] + frames = [_read_file(p) for p in files] shapes = {f.shape for f in frames} if len(shapes) > 1: raise ValueError(f"Frames in {path} have inconsistent shapes: {shapes}") return np.stack(frames, axis=0) - return _load_timeseries(path) + return _read_file(path) diff --git a/src/hoct/_tests/test_cli_track.py b/src/hoct/_tests/test_cli_track.py index ade4808..f5b20e6 100644 --- a/src/hoct/_tests/test_cli_track.py +++ b/src/hoct/_tests/test_cli_track.py @@ -6,10 +6,9 @@ import pytest import tifffile import tracksdata as td +import zarr from typer.testing import CliRunner -pytest.importorskip("bioio") # CLI requires the bioio extra - from hoct._tests.conftest import MODEL_PATH from hoct.cli import app @@ -45,6 +44,21 @@ def _write_folder(folder: Path, stack: np.ndarray, frame_axes: str) -> None: _write_imagej_tiff(folder / f"frame_{t:03d}.tif", frame, frame_axes) +def _write_ome_zarr(path: Path, stack: np.ndarray) -> None: + """Write a (T, Y, X) stack as an OME-Zarr group with (t, c, z, y, x) axes.""" + data = stack[:, np.newaxis, np.newaxis] # (T, 1, 1, Y, X) + group = zarr.open_group(str(path), mode="w") + level = group.create_array("0", shape=data.shape, dtype=data.dtype) + level[:] = data + group.attrs["multiscales"] = [ + { + "version": "0.4", + "axes": [{"name": name} for name in "tczyx"], + "datasets": [{"path": "0"}], + } + ] + + @pytest.fixture def model_path() -> Path: if not MODEL_PATH.exists(): @@ -63,6 +77,17 @@ def movie_2d_files(tmp_path): return img_path, seg_path +@pytest.fixture +def movie_2d_ome_zarr(tmp_path): + """OME-Zarr (t, c=1, z=1, y, x) image and segmentation stores.""" + images, labels = _make_2d_movie() + img_path = tmp_path / "image.ome.zarr" + seg_path = tmp_path / "segm.ome.zarr" + _write_ome_zarr(img_path, images) + _write_ome_zarr(seg_path, labels) + return img_path, seg_path + + @pytest.fixture def movie_2d_folders(tmp_path): """Folder-of-frames 2D+t image and segmentation.""" @@ -113,6 +138,27 @@ def test_track_single_file_runs_end_to_end(tmp_path, model_path, movie_2d_files) _assert_solution_geff(output) +def test_track_ome_zarr_inputs_match_single_file(tmp_path, model_path, movie_2d_files, movie_2d_ome_zarr): + """OME-Zarr (t, c, z, y, x) input produces the same solution as the equivalent TIFF.""" + file_img, file_seg = movie_2d_files + zarr_img, zarr_seg = movie_2d_ome_zarr + + out_file = tmp_path / "from_file.geff" + out_zarr = tmp_path / "from_zarr.geff" + + for img, seg, out in [(file_img, file_seg, out_file), (zarr_img, zarr_seg, out_zarr)]: + result = runner.invoke( + app, + ["track", str(img), str(seg), "-m", str(model_path), "-o", str(out), "--device", "cpu"], + ) + assert result.exit_code == 0, f"track failed for {img}:\n{result.stdout}" + + g_file = _assert_solution_geff(out_file) + g_zarr = _assert_solution_geff(out_zarr) + assert g_file.num_nodes() == g_zarr.num_nodes() + assert g_file.num_edges() == g_zarr.num_edges() + + def test_track_folder_inputs_match_single_file(tmp_path, model_path, movie_2d_files, movie_2d_folders): """Folder-of-frames input produces the same solution as the equivalent single file.""" file_img, file_seg = movie_2d_files diff --git a/src/hoct/_tests/test_io.py b/src/hoct/_tests/test_io.py index eb3fda0..04a8ae2 100644 --- a/src/hoct/_tests/test_io.py +++ b/src/hoct/_tests/test_io.py @@ -1,19 +1,47 @@ -"""Tests for hoct._io image loading helpers.""" +"""Tests for hoct._io image loading helpers. + +TIFF and Zarr inputs are read with always-available dependencies, so these +tests do not require the optional ``bioio`` extra. +""" import numpy as np import pytest import tifffile +import zarr -pytest.importorskip("bioio") - -from hoct._io import load_array +from hoct._io import is_frame_folder, load_array def _write_imagej_tiff(path, array, axes): - """Write ``array`` as an ImageJ-tagged TIFF so bioio resolves the dim order.""" + """Write ``array`` as an ImageJ-tagged TIFF carrying the dim order.""" tifffile.imwrite(path, array, imagej=True, metadata={"axes": axes}) +def _write_zarr_array(path, array): + """Write ``array`` as a plain ("simple") Zarr array.""" + z = zarr.create_array(store=str(path), shape=array.shape, dtype=array.dtype) + z[:] = array + + +def _write_ome_zarr(path, array, axes): + """Write ``array`` as a minimal single-scale OME-Zarr group.""" + group = zarr.open_group(str(path), mode="w") + level = group.create_array("0", shape=array.shape, dtype=array.dtype) + level[:] = array + group.attrs["multiscales"] = [ + { + "version": "0.4", + "axes": [{"name": name} for name in axes], + "datasets": [{"path": "0"}], + } + ] + + +# --------------------------------------------------------------------------- +# Single TIFF +# --------------------------------------------------------------------------- + + def test_load_single_2d_timeseries(tmp_path): """A single TYX TIFF round-trips to a (T, Y, X) array.""" arr = np.random.randint(0, 100, (4, 16, 16), dtype=np.uint16) @@ -38,6 +66,116 @@ def test_load_single_3d_timeseries(tmp_path): np.testing.assert_array_equal(result, arr) +def test_load_tiff_collapses_singleton_z(tmp_path): + """A TZYX TIFF with Z=1 collapses to (T, Y, X).""" + arr = np.random.randint(0, 100, (4, 1, 8, 8), dtype=np.uint16) + path = tmp_path / "movie.tif" + _write_imagej_tiff(path, arr, "TZYX") + + result = load_array(path) + + assert result.shape == (4, 8, 8) + np.testing.assert_array_equal(result, arr[:, 0]) + + +def test_load_tiff_keeps_first_channel(tmp_path): + """A TCYX TIFF keeps the first channel and drops the channel axis.""" + arr = np.random.randint(0, 100, (4, 2, 8, 8), dtype=np.uint16) + path = tmp_path / "movie.tif" + _write_imagej_tiff(path, arr, "TCYX") + + result = load_array(path) + + assert result.shape == (4, 8, 8) + np.testing.assert_array_equal(result, arr[:, 0]) + + +# --------------------------------------------------------------------------- +# Simple Zarr +# --------------------------------------------------------------------------- + + +def test_load_simple_zarr_2d(tmp_path): + """A plain (T, Y, X) Zarr array round-trips unchanged.""" + arr = np.arange(4 * 8 * 8, dtype=np.uint16).reshape(4, 8, 8) + path = tmp_path / "movie.zarr" + _write_zarr_array(path, arr) + + result = load_array(path) + + assert result.shape == (4, 8, 8) + np.testing.assert_array_equal(result, arr) + + +def test_load_simple_zarr_collapses_singleton_axes(tmp_path): + """A plain (T, 1, Y, X) Zarr array collapses to (T, Y, X).""" + arr = np.arange(4 * 1 * 8 * 8, dtype=np.uint16).reshape(4, 1, 8, 8) + path = tmp_path / "movie.zarr" + _write_zarr_array(path, arr) + + result = load_array(path) + + assert result.shape == (4, 8, 8) + np.testing.assert_array_equal(result, arr[:, 0]) + + +# --------------------------------------------------------------------------- +# OME-Zarr (t, c, z, y, x) +# --------------------------------------------------------------------------- + + +def test_load_ome_zarr_collapses_to_2d(tmp_path): + """A (t, c=1, z=1, y, x) OME-Zarr collapses to (T, Y, X).""" + arr = np.arange(4 * 1 * 1 * 8 * 8, dtype=np.uint16).reshape(4, 1, 1, 8, 8) + path = tmp_path / "movie.ome.zarr" + _write_ome_zarr(path, arr, "tczyx") + + result = load_array(path) + + assert result.shape == (4, 8, 8) + np.testing.assert_array_equal(result, arr[:, 0, 0]) + + +def test_load_ome_zarr_keeps_real_z(tmp_path): + """A (t, c=1, z>1, y, x) OME-Zarr collapses to (T, Z, Y, X).""" + arr = np.arange(3 * 1 * 5 * 8 * 8, dtype=np.uint16).reshape(3, 1, 5, 8, 8) + path = tmp_path / "volume.ome.zarr" + _write_ome_zarr(path, arr, "tczyx") + + result = load_array(path) + + assert result.shape == (3, 5, 8, 8) + np.testing.assert_array_equal(result, arr[:, 0]) + + +def test_load_ome_zarr_keeps_first_channel(tmp_path): + """A multichannel (t, c>1, z=1, y, x) OME-Zarr keeps the first channel.""" + arr = np.arange(3 * 2 * 1 * 8 * 8, dtype=np.uint16).reshape(3, 2, 1, 8, 8) + path = tmp_path / "movie.ome.zarr" + _write_ome_zarr(path, arr, "tczyx") + + result = load_array(path) + + assert result.shape == (3, 8, 8) + np.testing.assert_array_equal(result, arr[:, 0, 0]) + + +def test_ome_zarr_group_without_multiscales_raises(tmp_path): + """A Zarr group lacking OME 'multiscales' metadata is rejected clearly.""" + path = tmp_path / "plain_group.zarr" + group = zarr.open_group(str(path), mode="w") + level = group.create_array("0", shape=(4, 8, 8), dtype=np.uint16) + level[:] = np.zeros((4, 8, 8), dtype=np.uint16) + + with pytest.raises(ValueError, match="multiscales"): + load_array(path) + + +# --------------------------------------------------------------------------- +# Folder of frames +# --------------------------------------------------------------------------- + + def test_load_folder_of_2d_frames(tmp_path): """Folder of YX frames stacks alphabetically into (T, Y, X).""" folder = tmp_path / "frames" @@ -115,3 +253,25 @@ def test_inconsistent_frame_shapes_raise(tmp_path): def test_missing_path_raises(tmp_path): with pytest.raises(FileNotFoundError): load_array(tmp_path / "does_not_exist.tif") + + +# --------------------------------------------------------------------------- +# Layout detection +# --------------------------------------------------------------------------- + + +def test_is_frame_folder_distinguishes_zarr_from_frame_folder(tmp_path): + """A Zarr store is a single input; a plain folder is a frame folder.""" + zarr_path = tmp_path / "movie.zarr" + _write_zarr_array(zarr_path, np.zeros((4, 8, 8), dtype=np.uint16)) + + folder = tmp_path / "frames" + folder.mkdir() + _write_imagej_tiff(folder / "a.tif", np.zeros((4, 4), dtype=np.uint8), "YX") + + tiff_path = tmp_path / "movie.tif" + _write_imagej_tiff(tiff_path, np.zeros((4, 8, 8), dtype=np.uint16), "TYX") + + assert is_frame_folder(folder) is True + assert is_frame_folder(zarr_path) is False + assert is_frame_folder(tiff_path) is False diff --git a/src/hoct/cli.py b/src/hoct/cli.py index e449843..1ccce96 100644 --- a/src/hoct/cli.py +++ b/src/hoct/cli.py @@ -16,7 +16,7 @@ from hoct import __version__ from hoct._api import predict as predict_from_graph -from hoct._io import load_array +from hoct._io import is_frame_folder, load_array from hoct._models import DEFAULT_MODEL, resolve_model from hoct.data import FrameDataset from hoct.features import REGIONPROPS, create_graph @@ -277,7 +277,9 @@ def predict( @app.command() def track( - image_path: Path = typer.Argument(..., help="Path to image (file or folder of frames)", exists=True), + image_path: Path = typer.Argument( + ..., help="Path to image (TIFF, Zarr/OME-Zarr, or folder of frames)", exists=True + ), segm_path: Path = typer.Argument(..., help="Path to segmentation labels (same format as image)", exists=True), model_path: Path | None = typer.Option( None, @@ -331,9 +333,10 @@ def track( Reads the images and segmentation, builds the candidate graph, runs the model, solves tracking, and writes a GEFF directory. - Both inputs accept either a single file (whole time series) or a folder - of single-frame files (sorted alphabetically). They must use the same - layout — file with file, or folder with folder. + Both inputs accept a single file (TIFF), a Zarr/OME-Zarr store (whole time + series), or a folder of single-frame files (sorted alphabetically). They + must use the same layout — single input with single input, or frame folder + with frame folder. Example: hoct track images.tif segmentation.tif -m model.pt -o tracks.geff @@ -344,8 +347,11 @@ def track( console.print(f"[red]Output directory {output} already exists. Use --overwrite to overwrite.[/red]") raise typer.Exit(code=1) - if image_path.is_dir() != segm_path.is_dir(): - console.print("[red]Image and segmentation paths must use the same layout (both files or both folders).[/red]") + if is_frame_folder(image_path) != is_frame_folder(segm_path): + console.print( + "[red]Image and segmentation paths must use the same layout " + "(both single inputs or both frame folders).[/red]" + ) raise typer.Exit(code=1) if output_format is OutputFormat.CTC and full_graph: From 211d7a5dca57df1aafdec371433a90b6bc17b521 Mon Sep 17 00:00:00 2001 From: Jordao Bragantini Date: Mon, 15 Jun 2026 16:26:17 -0700 Subject: [PATCH 2/2] Load Zarr stores and TIFF folders lazily as dask arrays Avoid materializing whole datasets in memory: - Zarr arrays and OME-Zarr levels are read with dask.array.from_zarr; the channel selection and length-1 axis collapse run on the lazy array (using indexing and .squeeze, since dask arrays have no .take). - Folders of single-suffix TIFFs are stacked lazily with dask.array.image.imread (one frame per chunk), with per-frame shape validation done from TIFF headers (no pixel reads). Non-TIFF / mixed-suffix folders keep the eager fallback. create_graph already accepts dask arrays, so these flow end to end. Adds laziness assertions to the loader tests. --- src/hoct/_io.py | 106 ++++++++++++++++++++++++++++++------- src/hoct/_tests/test_io.py | 12 +++-- 2 files changed, 96 insertions(+), 22 deletions(-) diff --git a/src/hoct/_io.py b/src/hoct/_io.py index 0d03633..5113e30 100644 --- a/src/hoct/_io.py +++ b/src/hoct/_io.py @@ -1,7 +1,7 @@ """Image loading helpers for the high-level CLI. :func:`load_array` accepts a single file, a Zarr store, or a folder of -single-frame files, and returns a ``(T, Y, X)`` or ``(T, Z, Y, X)`` numpy array +single-frame files, and returns a ``(T, Y, X)`` or ``(T, Z, Y, X)`` array suitable for :func:`hoct.predict`. Supported inputs @@ -13,6 +13,10 @@ * **Folder of frames** — one single-timepoint file per timepoint, sorted alphabetically and stacked along a new leading T axis. +Zarr stores and folders of TIFFs are returned as **lazy** ``dask`` arrays, so +large datasets are never fully materialized in memory; downstream code reads +them frame by frame. Other inputs are returned as eager numpy arrays. + TIFF and Zarr are read with always-available dependencies. Any other single-file format falls back to ``bioio`` (optional extra ``hoct[bioio]``). """ @@ -20,6 +24,7 @@ from pathlib import Path import numpy as np +from numpy.typing import ArrayLike _TIFF_SUFFIXES = {".tif", ".tiff"} _ZARR_MARKERS = ("zarr.json", ".zarray", ".zgroup") @@ -42,24 +47,31 @@ def is_frame_folder(path: Path) -> bool: return path.is_dir() and not _is_zarr(path) -def _collapse_singleton_axes(data: np.ndarray) -> np.ndarray: - """Drop length-1 axes, always keeping the trailing two (Y, X).""" +def _collapse_singleton_axes(data: ArrayLike) -> ArrayLike: + """Drop length-1 axes, always keeping the trailing two (Y, X). + + Uses ``.squeeze`` so the operation stays lazy on dask arrays. + """ drop = tuple(axis for axis in range(data.ndim - 2) if data.shape[axis] == 1) - return np.squeeze(data, axis=drop) if drop else data + return data.squeeze(axis=drop) if drop else data + +def _select_first_channel(data: ArrayLike, axes: str) -> ArrayLike: + """Keep only the first index of any channel/sample axis named in ``axes``. -def _select_first_channel(data: np.ndarray, axes: str) -> np.ndarray: - """Keep only the first index of any channel/sample axis named in ``axes``.""" + Uses basic indexing (not ``.take``, which dask arrays lack) so the result + stays lazy on dask arrays. + """ names = [a.lower() for a in axes] for channel in _CHANNEL_AXES: if channel in names: index = names.index(channel) - data = data.take(0, axis=index) + data = data[(slice(None),) * index + (0,)] names.pop(index) return data -def _reduce_to_movie(data: np.ndarray, axes: str) -> np.ndarray: +def _reduce_to_movie(data: ArrayLike, axes: str) -> ArrayLike: """Reduce a labelled array to ``(T, [Z,] Y, X)``. Keeps the first channel of any channel axis, then collapses length-1 axes. @@ -69,6 +81,23 @@ def _reduce_to_movie(data: np.ndarray, axes: str) -> np.ndarray: return _collapse_singleton_axes(data) +def _reduced_shape(shape: tuple[int, ...], axes: str) -> tuple[int, ...]: + """Frame shape after :func:`_reduce_to_movie` (channel dropped, length-1 axes removed). + + Mirrors :func:`_reduce_to_movie` on the shape alone, so a folder of frames + can be validated from TIFF headers without reading any pixels. + """ + names = [a.lower() for a in axes] + dims = list(shape) + for channel in _CHANNEL_AXES: + if channel in names: + index = names.index(channel) + names.pop(index) + dims.pop(index) + keep_from = len(dims) - 2 + return tuple(dim for i, dim in enumerate(dims) if i >= keep_from or dim != 1) + + def _load_tiff(path: Path) -> np.ndarray: """Read a single TIFF stack and reduce it to ``(T, [Z,] Y, X)``.""" import tifffile @@ -80,6 +109,15 @@ def _load_tiff(path: Path) -> np.ndarray: return _reduce_to_movie(data, axes) +def _tiff_reduced_shape(path: Path) -> tuple[int, ...]: + """Return a TIFF's reduced frame shape, reading headers only (no pixel data).""" + import tifffile + + with tifffile.TiffFile(str(path)) as tif: + series = tif.series[0] + return _reduced_shape(tuple(series.shape), series.axes) + + def _ome_multiscales(attrs: dict) -> list | None: """Return the OME-NGFF ``multiscales`` list, supporting v0.4 and v0.5 layouts.""" if "multiscales" in attrs: @@ -90,18 +128,21 @@ def _ome_multiscales(attrs: dict) -> list | None: return None -def _load_ome_zarr(group, multiscales: list) -> np.ndarray: - """Read the highest-resolution level of an OME-Zarr group as ``(T, [Z,] Y, X)``.""" +def _load_ome_zarr(group, multiscales: list) -> ArrayLike: + """Lazily read the highest-resolution level of an OME-Zarr group as ``(T, [Z,] Y, X)``.""" + import dask.array as da + metadata = multiscales[0] axes = "".join(axis["name"] if isinstance(axis, dict) else axis for axis in metadata["axes"]) # OME datasets are ordered from highest to lowest resolution. dataset_path = metadata["datasets"][0]["path"] - data = np.asarray(group[dataset_path][:]) + data = da.from_zarr(group[dataset_path]) return _reduce_to_movie(data, axes) -def _load_zarr(path: Path) -> np.ndarray: - """Read a Zarr store: an OME-Zarr group or a plain ("simple") array.""" +def _load_zarr(path: Path) -> ArrayLike: + """Lazily read a Zarr store: an OME-Zarr group or a plain ("simple") array.""" + import dask.array as da import zarr node = zarr.open(str(path), mode="r") @@ -110,7 +151,7 @@ def _load_zarr(path: Path) -> np.ndarray: if multiscales is None: raise ValueError(f"Zarr group at {path} has no OME 'multiscales' metadata.") return _load_ome_zarr(node, multiscales) - return _collapse_singleton_axes(np.asarray(node[:])) + return _collapse_singleton_axes(da.from_zarr(node)) def _load_with_bioio(path: Path) -> np.ndarray: @@ -131,7 +172,7 @@ def _is_image_file(path: Path) -> bool: return path.is_file() and not path.name.startswith(".") -def _read_file(path: Path) -> np.ndarray: +def _read_file(path: Path) -> ArrayLike: """Read a single file (Zarr, TIFF, or via bioio) and collapse length-1 axes.""" if _is_zarr(path): return _load_zarr(path) @@ -140,7 +181,25 @@ def _read_file(path: Path) -> np.ndarray: return _load_with_bioio(path) -def load_array(path: Path) -> np.ndarray: +def _load_tiff_folder(path: Path, files: list[Path]) -> ArrayLike: + """Lazily stack a folder of single-frame TIFFs along a new leading T axis. + + Frame shapes are validated up front from TIFF headers (no pixel reads); the + pixels themselves are loaded lazily, one frame per dask chunk. + """ + from dask.array.image import imread as dask_imread + + shapes = {_tiff_reduced_shape(f) for f in files} + if len(shapes) > 1: + raise ValueError(f"Frames in {path} have inconsistent shapes: {shapes}") + + # Single shared suffix lets dask glob exactly the validated files, in the + # same sorted order. ``_load_tiff`` applies the per-frame reduction. + suffix = files[0].suffix.lower() + return dask_imread(str(path / f"*{suffix}"), imread=_load_tiff) + + +def load_array(path: Path) -> ArrayLike: """Load image data from a file, a Zarr store, or a folder of frames. Conventions @@ -151,6 +210,9 @@ def load_array(path: Path) -> np.ndarray: * **Folder**: each file is one timepoint, sorted alphabetically and stacked along a new T axis. Returns the same shapes as above. + Zarr stores and folders of TIFFs are returned as lazy ``dask`` arrays; other + inputs are returned as eager numpy arrays. + Parameters ---------- path @@ -159,8 +221,8 @@ def load_array(path: Path) -> np.ndarray: Returns ------- - np.ndarray - ``(T, Y, X)`` or ``(T, Z, Y, X)`` array. + ArrayLike + ``(T, Y, X)`` or ``(T, Z, Y, X)`` numpy or dask array. """ path = Path(path) if not path.exists(): @@ -170,8 +232,14 @@ def load_array(path: Path) -> np.ndarray: files = sorted(p for p in path.iterdir() if _is_image_file(p)) if not files: raise ValueError(f"No image files found in folder: {path}") + + suffixes = {f.suffix.lower() for f in files} + if suffixes <= _TIFF_SUFFIXES and len(suffixes) == 1: + return _load_tiff_folder(path, files) + + # Eager fallback for non-TIFF (or mixed-suffix) frame folders. frames = [_read_file(p) for p in files] - shapes = {f.shape for f in frames} + shapes = {np.shape(f) for f in frames} if len(shapes) > 1: raise ValueError(f"Frames in {path} have inconsistent shapes: {shapes}") return np.stack(frames, axis=0) diff --git a/src/hoct/_tests/test_io.py b/src/hoct/_tests/test_io.py index 04a8ae2..2d5d378 100644 --- a/src/hoct/_tests/test_io.py +++ b/src/hoct/_tests/test_io.py @@ -4,6 +4,7 @@ tests do not require the optional ``bioio`` extra. """ +import dask.array as da import numpy as np import pytest import tifffile @@ -96,13 +97,14 @@ def test_load_tiff_keeps_first_channel(tmp_path): def test_load_simple_zarr_2d(tmp_path): - """A plain (T, Y, X) Zarr array round-trips unchanged.""" + """A plain (T, Y, X) Zarr array round-trips unchanged as a lazy dask array.""" arr = np.arange(4 * 8 * 8, dtype=np.uint16).reshape(4, 8, 8) path = tmp_path / "movie.zarr" _write_zarr_array(path, arr) result = load_array(path) + assert isinstance(result, da.Array), "Zarr should load lazily via dask" assert result.shape == (4, 8, 8) np.testing.assert_array_equal(result, arr) @@ -125,13 +127,14 @@ def test_load_simple_zarr_collapses_singleton_axes(tmp_path): def test_load_ome_zarr_collapses_to_2d(tmp_path): - """A (t, c=1, z=1, y, x) OME-Zarr collapses to (T, Y, X).""" + """A (t, c=1, z=1, y, x) OME-Zarr collapses to (T, Y, X), loaded lazily.""" arr = np.arange(4 * 1 * 1 * 8 * 8, dtype=np.uint16).reshape(4, 1, 1, 8, 8) path = tmp_path / "movie.ome.zarr" _write_ome_zarr(path, arr, "tczyx") result = load_array(path) + assert isinstance(result, da.Array), "OME-Zarr should load lazily via dask" assert result.shape == (4, 8, 8) np.testing.assert_array_equal(result, arr[:, 0, 0]) @@ -177,7 +180,7 @@ def test_ome_zarr_group_without_multiscales_raises(tmp_path): def test_load_folder_of_2d_frames(tmp_path): - """Folder of YX frames stacks alphabetically into (T, Y, X).""" + """Folder of YX frames stacks alphabetically into a lazy (T, Y, X) array.""" folder = tmp_path / "frames" folder.mkdir() frames = [np.full((8, 8), t, dtype=np.uint8) for t in range(5)] @@ -186,6 +189,9 @@ def test_load_folder_of_2d_frames(tmp_path): result = load_array(folder) + assert isinstance(result, da.Array), "a folder of TIFFs should load lazily via dask" + # One dask chunk per frame, so frames load on demand rather than all at once. + assert result.chunks[0] == (1,) * 5 assert result.shape == (5, 8, 8) for t in range(5): np.testing.assert_array_equal(result[t], frames[t])