From e3de0508f9a1d097633067125fc6dbcbab1f7546 Mon Sep 17 00:00:00 2001
From: Alexandr Kalinin <alxndrkalinin@users.noreply.github.com>
Date: Tue, 2 Jun 2026 15:28:28 -0700
Subject: [PATCH 1/3] perf(dynacell/eval): memoize embedding loads across
 cross-condition pairs

_probe_pair re-read each NPZ from disk on every call, so the shared mock
reference side was loaded once per pair: run_for_group read each of the 8
mock files (4 features x 2 sources) twice (16 reads in total), and run
re-read each condition once per pair it appears in. Thread an optional
per-call dict cache keyed on the NPZ path through _probe_pair ->
_load_embeddings; run / run_for_group create one and pass it so each file
is read once per group.

Measured on a synthetic 3-condition group: run_for_group drops from 32 to
24 np.load reads (~26 MB of redundant mock NPZ I/O per group avoided). The
cache is local to each call and released on return, so memory stays bounded
to one group's embeddings rather than accumulating across groups (which a
module-level lru_cache would do).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../evaluation/cross_condition_probe.py       | 34 +++++++++++++++----
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/applications/dynacell/src/dynacell/evaluation/cross_condition_probe.py b/applications/dynacell/src/dynacell/evaluation/cross_condition_probe.py
index 2ea5d4e3d..d63232c37 100644
--- a/applications/dynacell/src/dynacell/evaluation/cross_condition_probe.py
+++ b/applications/dynacell/src/dynacell/evaluation/cross_condition_probe.py
@@ -65,14 +65,27 @@ def _detect_condition(eval_dir: Path) -> str:
     raise ValueError(f"cannot infer condition from eval_dir name {name!r}: expected trailing _{{mock,denv,zikv}}")
 
 
-def _load_embeddings(eval_dir: Path, source: str, feature: str) -> tuple[np.ndarray, np.ndarray]:
+def _load_embeddings(
+    eval_dir: Path,
+    source: str,
+    feature: str,
+    cache: dict[Path, tuple[np.ndarray, np.ndarray]] | None = None,
+) -> tuple[np.ndarray, np.ndarray]:
     """Return ``(embeddings, fov_ids)`` from one ``*_single_cell_embeddings.npz``.
 
-    ``np.load`` raises ``FileNotFoundError`` when the NPZ is missing.
+    ``np.load`` raises ``FileNotFoundError`` when the NPZ is missing. When
+    *cache* is given, the result is memoized on the constructed NPZ path so the
+    shared reference (``mock``) side is read from disk once per group instead
+    of once per pair (see :func:`run` / :func:`run_for_group`).
     """
     npz_path = eval_dir / "embeddings" / f"{source}_{feature}_single_cell_embeddings.npz"
+    if cache is not None and npz_path in cache:
+        return cache[npz_path]
     with np.load(npz_path) as data:
-        return np.asarray(data["embeddings"]), np.asarray(data["fov"])
+        result = (np.asarray(data["embeddings"]), np.asarray(data["fov"]))
+    if cache is not None:
+        cache[npz_path] = result
+    return result
 
 
 def _probe_pair(
@@ -82,6 +95,7 @@ def _probe_pair(
     source: str,
     n_splits: int,
     rng_seed: int,
+    cache: dict[Path, tuple[np.ndarray, np.ndarray]] | None = None,
 ) -> dict:
     """Run one ``fov_stratified_auroc`` call for the given (pair, feature, source).
 
@@ -106,8 +120,8 @@ def _probe_pair(
         row["skipped_reason"] = "missing eval dir for one side of pair"
         return row
     try:
-        x0, fov0 = _load_embeddings(eval_dirs_by_condition[c0], source, feature)
-        x1, fov1 = _load_embeddings(eval_dirs_by_condition[c1], source, feature)
+        x0, fov0 = _load_embeddings(eval_dirs_by_condition[c0], source, feature, cache)
+        x1, fov1 = _load_embeddings(eval_dirs_by_condition[c1], source, feature, cache)
     except FileNotFoundError as e:
         row["skipped_reason"] = f"missing embeddings file: {e}"
         return row
@@ -196,12 +210,15 @@ def run_for_group(
     if "mock" not in by_condition:
         return []
 
+    # Shared across pairs so the mock reference embeddings are read once, not
+    # re-read for every infected condition. Local to this call -> released on return.
+    cache: dict[Path, tuple[np.ndarray, np.ndarray]] = {}
     written: list[Path] = []
     for ref, cond in _DEFAULT_PAIRS:  # ref == "mock" for every default pair
         if cond not in by_condition:
             continue
         rows = [
-            _probe_pair(by_condition, (ref, cond), feature, source, n_splits, rng_seed)
+            _probe_pair(by_condition, (ref, cond), feature, source, n_splits, rng_seed, cache)
             for feature in _FEATURE_TYPES
             for source in _SOURCES
         ]
@@ -239,11 +256,14 @@ def run(
             raise ValueError(f"duplicate condition {cond!r}: {eval_dirs_by_condition[cond]} and {d}")
         eval_dirs_by_condition[cond] = d
 
+    # Shared across pairs so a condition's embeddings (e.g. the mock reference
+    # reused by every pair) are read once. Local to this call -> released on return.
+    cache: dict[Path, tuple[np.ndarray, np.ndarray]] = {}
     rows = []
     for feature in _FEATURE_TYPES:
         for pair in pairs:
             for source in _SOURCES:
-                rows.append(_probe_pair(eval_dirs_by_condition, pair, feature, source, n_splits, rng_seed))
+                rows.append(_probe_pair(eval_dirs_by_condition, pair, feature, source, n_splits, rng_seed, cache))
 
     _write_rows(out_path, rows)
     return out_path

From df4619c7839e7f15f2170bee974b66284bc059dd Mon Sep 17 00:00:00 2001
From: Alexandr Kalinin <alxndrkalinin@users.noreply.github.com>
Date: Tue, 2 Jun 2026 15:32:19 -0700
Subject: [PATCH 2/3] fix(dynacell/eval): raise on duplicate condition in
 run_for_group

run_for_group built its condition->dir map with a last-wins assignment,
so two dirs mapping to the same condition (e.g. two *_mock dirs) silently
overwrote each other and the probe ran against whichever dir was listed
last, with no error or warning. run() already raises ValueError on
duplicates; mirror that here so an ambiguous group surfaces as an error
(the grouped pipeline catches it and safely skips the probe).
Unrecognized-token dirs are still skipped, as documented.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../src/dynacell/evaluation/cross_condition_probe.py        | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/applications/dynacell/src/dynacell/evaluation/cross_condition_probe.py b/applications/dynacell/src/dynacell/evaluation/cross_condition_probe.py
index d63232c37..070c97905 100644
--- a/applications/dynacell/src/dynacell/evaluation/cross_condition_probe.py
+++ b/applications/dynacell/src/dynacell/evaluation/cross_condition_probe.py
@@ -196,7 +196,9 @@ def run_for_group(
     eval_dirs : list[Path]
         Per-condition eval dirs of one (model, pool, organelle) group. The
         condition is inferred from each dir's trailing ``_{mock,denv,zikv}``;
-        dirs without a recognized token are ignored.
+        dirs without a recognized token are ignored. Two dirs mapping to the
+        same condition raise ``ValueError`` (an ambiguous group) rather than
+        silently picking one.
     n_splits, rng_seed : int
         Forwarded to :func:`fov_stratified_auroc`.
     """
@@ -206,6 +208,8 @@ def run_for_group(
             cond = _detect_condition(d)
         except ValueError:
             continue
+        if cond in by_condition:
+            raise ValueError(f"duplicate condition {cond!r}: {by_condition[cond]} and {d}")
         by_condition[cond] = d
     if "mock" not in by_condition:
         return []

From 9444980ac920305834f9d22abd73e8c6d429d3ba Mon Sep 17 00:00:00 2001
From: Alexandr Kalinin <alxndrkalinin@users.noreply.github.com>
Date: Tue, 2 Jun 2026 15:39:05 -0700
Subject: [PATCH 3/3] fix(dynacell/eval): lazily import cubic in
 instance_metrics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

instance_metrics imported cubic.metrics.average_precision at module top
level, and pipeline_cache imports DEFAULT_IOU_THRESHOLDS from it — so the
whole pipeline import chain pulled the GPU-only cubic stack. The minimal
`test-dynacell-configs` CI job (uv sync --group test, no eval extra) then
failed to import dynacell.evaluation.pipeline with
`ModuleNotFoundError: No module named 'cubic'`, taking down every
grouped / parallel / parallel_cpu eval test at collection.

Move the cubic import into instance_average_precision (its only use), so
importing the module for the threshold constant no longer needs cubic.
The AP computation still hard-requires cubic and fails loudly there.
Mirrors the lazy torch_fidelity imports in feature_metrics. Verified:
dynacell.evaluation.pipeline now imports with cubic unavailable.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../dynacell/src/dynacell/evaluation/instance_metrics.py   | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/applications/dynacell/src/dynacell/evaluation/instance_metrics.py b/applications/dynacell/src/dynacell/evaluation/instance_metrics.py
index 78d817a56..a202bc3b7 100644
--- a/applications/dynacell/src/dynacell/evaluation/instance_metrics.py
+++ b/applications/dynacell/src/dynacell/evaluation/instance_metrics.py
@@ -8,7 +8,6 @@
 """
 
 import numpy as np
-from cubic.metrics import average_precision
 
 DEFAULT_IOU_THRESHOLDS = (0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95)
 """IoU thresholds for the AP sweep (Cellpose / StarDist standard 0.50..0.95)."""
@@ -70,6 +69,12 @@ def instance_average_precision(
         ap_vals = [0.0] * len(thresholds)
         tp, fp, fn = 0.0, float(n_pred), float(n_gt)
     else:
+        # Imported lazily (not at module top) so importing this module for
+        # DEFAULT_IOU_THRESHOLDS / _relabel_sequential — e.g. pipeline_cache pulling
+        # the threshold constant — does not require the GPU-only cubic stack. The
+        # actual AP computation still hard-requires cubic and fails loudly here.
+        from cubic.metrics import average_precision
+
         ap, tp_arr, fp_arr, fn_arr = average_precision(gt, pred, thresholds)
         ap_vals = [float(a) for a in np.atleast_1d(ap)]
         idx = thresholds.index(_PRIMARY_THRESHOLD) if _PRIMARY_THRESHOLD in thresholds else 0