From e3de0508f9a1d097633067125fc6dbcbab1f7546 Mon Sep 17 00:00:00 2001 From: Alexandr Kalinin Date: Tue, 2 Jun 2026 15:28:28 -0700 Subject: [PATCH 1/3] perf(dynacell/eval): memoize embedding loads across cross-condition pairs _probe_pair re-read each NPZ from disk on every call, so the shared mock reference side was loaded once per pair: run_for_group read each of the 8 mock files (4 features x 2 sources) twice, 16 reads in total, and run re-read each condition once per pair. Thread an optional per-call dict cache keyed on the NPZ path through _probe_pair -> _load_embeddings; run / run_for_group create one and pass it so each file is read once per group. Measured on a synthetic 3-condition group: run_for_group drops from 32 to 24 np.load reads (~26 MB of redundant mock NPZ I/O per group avoided). The cache is local to each call and released on return, so memory stays bounded to one group's embeddings rather than accumulating across groups (which a module-level lru_cache would do). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../evaluation/cross_condition_probe.py | 34 +++++++++++++++---- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/applications/dynacell/src/dynacell/evaluation/cross_condition_probe.py b/applications/dynacell/src/dynacell/evaluation/cross_condition_probe.py index 2ea5d4e3d..d63232c37 100644 --- a/applications/dynacell/src/dynacell/evaluation/cross_condition_probe.py +++ b/applications/dynacell/src/dynacell/evaluation/cross_condition_probe.py @@ -65,14 +65,27 @@ def _detect_condition(eval_dir: Path) -> str: raise ValueError(f"cannot infer condition from eval_dir name {name!r}: expected trailing _{{mock,denv,zikv}}") -def _load_embeddings(eval_dir: Path, source: str, feature: str) -> tuple[np.ndarray, np.ndarray]: +def _load_embeddings( + eval_dir: Path, + source: str, + feature: str, + cache: dict[Path, tuple[np.ndarray, np.ndarray]] | None = None, +) -> tuple[np.ndarray, np.ndarray]: """Return ``(embeddings, fov_ids)`` from one 
``*_single_cell_embeddings.npz``. - ``np.load`` raises ``FileNotFoundError`` when the NPZ is missing. + ``np.load`` raises ``FileNotFoundError`` when the NPZ is missing. When + *cache* is given, the result is memoized on the NPZ path so the + shared reference (``mock``) side is read from disk once per group instead + of once per pair (see :func:`run` / :func:`run_for_group`). """ npz_path = eval_dir / "embeddings" / f"{source}_{feature}_single_cell_embeddings.npz" + if cache is not None and npz_path in cache: + return cache[npz_path] with np.load(npz_path) as data: - return np.asarray(data["embeddings"]), np.asarray(data["fov"]) + result = (np.asarray(data["embeddings"]), np.asarray(data["fov"])) + if cache is not None: + cache[npz_path] = result + return result def _probe_pair( @@ -82,6 +95,7 @@ def _probe_pair( source: str, n_splits: int, rng_seed: int, + cache: dict[Path, tuple[np.ndarray, np.ndarray]] | None = None, ) -> dict: """Run one ``fov_stratified_auroc`` call for the given (pair, feature, source). @@ -106,8 +120,8 @@ def _probe_pair( row["skipped_reason"] = "missing eval dir for one side of pair" return row try: - x0, fov0 = _load_embeddings(eval_dirs_by_condition[c0], source, feature) - x1, fov1 = _load_embeddings(eval_dirs_by_condition[c1], source, feature) + x0, fov0 = _load_embeddings(eval_dirs_by_condition[c0], source, feature, cache) + x1, fov1 = _load_embeddings(eval_dirs_by_condition[c1], source, feature, cache) except FileNotFoundError as e: row["skipped_reason"] = f"missing embeddings file: {e}" return row @@ -196,12 +210,15 @@ def run_for_group( if "mock" not in by_condition: return [] + # Shared across pairs so the mock reference embeddings are read once, not + # re-read for every infected condition. Local to this call -> released on return. 
+ cache: dict[Path, tuple[np.ndarray, np.ndarray]] = {} written: list[Path] = [] for ref, cond in _DEFAULT_PAIRS: # ref == "mock" for every default pair if cond not in by_condition: continue rows = [ - _probe_pair(by_condition, (ref, cond), feature, source, n_splits, rng_seed) + _probe_pair(by_condition, (ref, cond), feature, source, n_splits, rng_seed, cache) for feature in _FEATURE_TYPES for source in _SOURCES ] @@ -239,11 +256,14 @@ def run( raise ValueError(f"duplicate condition {cond!r}: {eval_dirs_by_condition[cond]} and {d}") eval_dirs_by_condition[cond] = d + # Shared across pairs so a condition's embeddings (e.g. the mock reference + # reused by every pair) are read once. Local to this call -> released on return. + cache: dict[Path, tuple[np.ndarray, np.ndarray]] = {} rows = [] for feature in _FEATURE_TYPES: for pair in pairs: for source in _SOURCES: - rows.append(_probe_pair(eval_dirs_by_condition, pair, feature, source, n_splits, rng_seed)) + rows.append(_probe_pair(eval_dirs_by_condition, pair, feature, source, n_splits, rng_seed, cache)) _write_rows(out_path, rows) return out_path From df4619c7839e7f15f2170bee974b66284bc059dd Mon Sep 17 00:00:00 2001 From: Alexandr Kalinin Date: Tue, 2 Jun 2026 15:32:19 -0700 Subject: [PATCH 2/3] fix(dynacell/eval): raise on duplicate condition in run_for_group run_for_group built its condition->dir map with a last-wins assignment, so two dirs mapping to the same condition (e.g. two *_mock dirs) silently overwrote each other and produced a probe against an arbitrary dir with no signal. run() already raises ValueError on duplicates; mirror that here so an ambiguous group surfaces as an error (the grouped pipeline catches it and safely skips the probe). Unrecognized-token dirs are still skipped, as documented. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/dynacell/evaluation/cross_condition_probe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/applications/dynacell/src/dynacell/evaluation/cross_condition_probe.py b/applications/dynacell/src/dynacell/evaluation/cross_condition_probe.py index d63232c37..070c97905 100644 --- a/applications/dynacell/src/dynacell/evaluation/cross_condition_probe.py +++ b/applications/dynacell/src/dynacell/evaluation/cross_condition_probe.py @@ -196,7 +196,9 @@ def run_for_group( eval_dirs : list[Path] Per-condition eval dirs of one (model, pool, organelle) group. The condition is inferred from each dir's trailing ``_{mock,denv,zikv}``; - dirs without a recognized token are ignored. + dirs without a recognized token are ignored. Two dirs mapping to the + same condition raise ``ValueError`` (an ambiguous group) rather than + silently picking one. n_splits, rng_seed : int Forwarded to :func:`fov_stratified_auroc`. """ @@ -206,6 +208,8 @@ def run_for_group( cond = _detect_condition(d) except ValueError: continue + if cond in by_condition: + raise ValueError(f"duplicate condition {cond!r}: {by_condition[cond]} and {d}") by_condition[cond] = d if "mock" not in by_condition: return [] From 9444980ac920305834f9d22abd73e8c6d429d3ba Mon Sep 17 00:00:00 2001 From: Alexandr Kalinin Date: Tue, 2 Jun 2026 15:39:05 -0700 Subject: [PATCH 3/3] fix(dynacell/eval): lazily import cubic in instance_metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit instance_metrics imported cubic.metrics.average_precision at module top level, and pipeline_cache imports DEFAULT_IOU_THRESHOLDS from it — so the whole pipeline import chain pulled the GPU-only cubic stack. 
The minimal `test-dynacell-configs` CI job (uv sync --group test, no eval extra) then failed to import dynacell.evaluation.pipeline with `ModuleNotFoundError: No module named 'cubic'`, taking down every grouped / parallel / parallel_cpu eval test at collection. Move the cubic import into instance_average_precision (its only use), so importing the module for the threshold constant no longer needs cubic. The AP computation still hard-requires cubic and fails loudly there. Mirrors the lazy torch_fidelity imports in feature_metrics. Verified: dynacell.evaluation.pipeline now imports with cubic unavailable. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../dynacell/src/dynacell/evaluation/instance_metrics.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/applications/dynacell/src/dynacell/evaluation/instance_metrics.py b/applications/dynacell/src/dynacell/evaluation/instance_metrics.py index 78d817a56..a202bc3b7 100644 --- a/applications/dynacell/src/dynacell/evaluation/instance_metrics.py +++ b/applications/dynacell/src/dynacell/evaluation/instance_metrics.py @@ -8,7 +8,6 @@ """ import numpy as np -from cubic.metrics import average_precision DEFAULT_IOU_THRESHOLDS = (0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95) """IoU thresholds for the AP sweep (Cellpose / StarDist standard 0.50..0.95).""" @@ -70,6 +69,12 @@ def instance_average_precision( ap_vals = [0.0] * len(thresholds) tp, fp, fn = 0.0, float(n_pred), float(n_gt) else: + # Imported lazily (not at module top) so importing this module for + # DEFAULT_IOU_THRESHOLDS / _relabel_sequential — e.g. pipeline_cache pulling + # the threshold constant — does not require the GPU-only cubic stack. The + # actual AP computation still hard-requires cubic and fails loudly here. 
+ from cubic.metrics import average_precision + ap, tp_arr, fp_arr, fn_arr = average_precision(gt, pred, thresholds) ap_vals = [float(a) for a in np.atleast_1d(ap)] idx = thresholds.index(_PRIMARY_THRESHOLD) if _PRIMARY_THRESHOLD in thresholds else 0