diff --git a/CLAUDE.md b/CLAUDE.md index e06edfc..e63604c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -71,13 +71,15 @@ protspace prepare -i -m -o [options] ### protspace stats Usage -Compute per-projection quality statistics for an existing project directory (also available inline via `prepare --stats`). Cluster-validity → `statistics.parquet` (bundle 5th part) + per-protein `cluster_*`/`silhouette_*` annotation columns + auto legend styles; faithfulness → each projection's `info_json.quality`. +Compute per-projection quality statistics for an existing project directory (also available inline via `prepare --stats`). Cluster-validity → `statistics.parquet` (bundle 5th part) + per-protein `cluster_elbow_*` / `cluster_silhouette_*` membership columns (each value a `cluster N` label with the per-point silhouette attached as `|score`) + auto legend styles; faithfulness (local kNN + global metrics, tagged `scope`) → each projection's `info_json.quality`. `--cluster-selection elbow|silhouette|both` picks the K-selection method(s). ```bash # Standalone (embeddings needed for faithfulness) protspace stats -i emb.h5 -p project_dir -o statistics.parquet # Enrich annotations in place + emit cluster legend styles for `bundle --settings` protspace stats -i emb.h5 -p project_dir -o statistics.parquet -a annotations.parquet --settings-out styles.json +# Elbow + silhouette-optimal clusterings side by side +protspace stats -i emb.h5 -p project_dir -o statistics.parquet -a annotations.parquet --cluster-selection both # Fold a stats parquet + settings into a bundle protspace bundle -p project_dir -a annotations.parquet -s statistics.parquet --settings styles.json -o out.parquetbundle ``` @@ -210,7 +212,7 @@ HDF5 file (float16 embeddings) ## Output Format `.parquetbundle` = concatenated Apache Parquet tables separated by `---PARQUET_DELIMITER---`: -1. `protein_annotations` — identifier + annotation columns (incl. per-protein `cluster_*`/`silhouette_*` when `--stats`) +1. `protein_annotations` — identifier + annotation columns (incl. per-protein `cluster_elbow_*` / `cluster_silhouette_*` membership, with per-point silhouette attached as `value|score`, when `--stats`) 2. `projections_metadata` — projection names, dimensions, parameters (faithfulness rides in `info_json.quality` when `--stats`) 3. `projections_data` — reduced coordinates per protein per projection 4. `settings` (optional) — annotation styles, pinned values, display config @@ -238,9 +240,9 @@ uv run pytest tests/ --cov=src/protspace # With coverage | `test_settings_converter.py` | 31 | Settings table ↔ visualization state conversion | | `test_uniprot_annotation_retriever.py` | 24 | UniProt API mocking, inactive entry resolution | | `test_pipeline_utils.py` | 70 | ReductionPipeline, EmbeddingSet, method parsing, multi-input merging, inline param overrides | -| `test_stats.py` | 37 | Projection statistics: elbow, cluster-validity, faithfulness (dual continuity), subsample determinism/order-invariance, silhouette consistency | -| `test_stats_cli.py` | 11 | `protspace stats` CLI + `prepare` stats wiring, `--settings-out` guard | -| `test_stats_carriage.py` | 9 | Routing rows to bundle parts (metadata quality, annotation columns, cluster legend) | +| `test_stats.py` | 43 | Projection statistics: elbow, cluster-validity, faithfulness (dual continuity + global metrics), cluster-selection (elbow/silhouette/both), subsample determinism/order-invariance, silhouette consistency | +| `test_stats_cli.py` | 12 | `protspace stats` CLI + `prepare` stats wiring, `--settings-out` guard, `--cluster-selection` validation | +| `test_stats_carriage.py` | 10 | Routing rows to bundle parts (metadata quality, annotation columns, cluster legend) | | `test_stats_bundle.py` | 7 | Optional 5th (statistics) bundle part round-trip | | `test_biocentral_embedder.py` | 23 | Biocentral API client, embedding flow | | `test_fasta.py` | 17 | FASTA parsing, edge cases, CSV annotation loading | diff --git a/README.md b/README.md index 01f5f31..badf440 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ ProtSpace is a visualization tool for exploring **protein embeddings** or **simi - **Multiple projections**: PCA, UMAP, t-SNE, MDS, PaCMAP, LocalMAP - **Automatic annotations**: UniProt, InterPro, and Taxonomy -- **Quality metrics** _(opt-in)_: cluster-validity + faithfulness per projection (`--stats`) +- **Quality metrics** _(opt-in)_: per-projection cluster-validity + faithfulness (local & global) via `--stats` - **Structure viewer**: Integrated protein structure visualization - **Export**: PNG, PDF, SVG, HTML diff --git a/docs/cli.md b/docs/cli.md index c2b2d48..e35a08a 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -131,6 +131,7 @@ This produces three projections: `ProtT5 — PCA 2`, `ProtT5 — UMAP 2 (n=15)`, | `-o, --output` | Output directory. | `.` | | `--bundled / --no-bundled` | Bundle into single `.parquetbundle`. | bundled | | `--stats / --no-stats` | Compute projection quality statistics (cluster-validity + faithfulness). See [Projection Statistics](#projection-statistics---stats). | off | +| `--cluster-selection` | With `--stats`, how to choose the cluster count K: `elbow`, `silhouette`, or `both`. | `elbow` | | `--keep-tmp` | Cache intermediates for resumability. | on | | `--no-log` | Skip writing `run.log`. | off | | `--dump-cache` | Print cached annotations and exit. | off | @@ -185,9 +186,13 @@ Compute per-projection quality statistics for an existing project directory and protspace stats -i embeddings/prot_t5.h5 -p projections/ -o statistics.parquet # Also enrich an annotations parquet in place with per-protein cluster-membership -# + silhouette columns, and write the auto cluster-legend styles for `bundle` +# columns, and write the auto cluster-legend styles for `bundle` protspace stats -i embeddings/prot_t5.h5 -p projections/ -o statistics.parquet \ -a annotations.parquet --settings-out cluster_styles.json + +# Emit both the elbow and the silhouette-optimal clustering +protspace stats -i embeddings/prot_t5.h5 -p projections/ -o statistics.parquet \ + -a annotations.parquet --cluster-selection both ``` | Flag | Description | Default | @@ -195,7 +200,8 @@ protspace stats -i embeddings/prot_t5.h5 -p projections/ -o statistics.parquet \ | `-i, --input` | HDF5 embedding file(s) (for faithfulness). Repeat for multi-embedding; `-i file.h5:name` to override the name. | — | | `-p, --projections` | Project directory with `projections_metadata.parquet` + `projections_data.parquet`. | — | | `-o, --output` | Output `statistics.parquet` path. | — | -| `-a, --annotations` | Annotations parquet to enrich in place with per-protein `cluster_*` / `silhouette_*` columns. | — | +| `-a, --annotations` | Annotations parquet to enrich in place with per-protein `cluster_*` membership columns (per-point silhouette attached as `value|score`). | — | +| `--cluster-selection` | Cluster count K selection: `elbow`, `silhouette`, or `both`. | `elbow` | | `--settings-out` | Write auto cluster-legend styles here (JSON) for `bundle --settings`. Requires `-a`. | — | | `--metric` | High-dim distance metric for faithfulness when the projection metadata omits one (e.g. PCA/MDS). | `euclidean` | | `--seed` | Random seed. | `42` | @@ -204,11 +210,15 @@ protspace stats -i embeddings/prot_t5.h5 -p projections/ -o statistics.parquet \ `prepare --stats` (opt-in) and the standalone `protspace stats` command compute two families of per-projection quality metrics and bake them into the output: -- **Cluster validity** — KMeans with an elbow-chosen K labels the projection, scored by **silhouette**, **Davies–Bouldin**, and **Calinski–Harabasz**. Written to the tidy `statistics.parquet` (the bundle's 5th part). Per-protein **cluster-membership** (`cluster_`) and **silhouette** (`silhouette_`) columns are also added to the annotations, and the membership columns get an auto Kelly-palette legend (the bundle's 4th settings part). -- **Faithfulness** — how well the projection preserves the source embedding's neighbourhoods: **kNN-overlap**, **trustworthiness**, and **continuity**. These per-projection scalars ride in each projection's `info_json.quality`. +- **Cluster validity** — KMeans labels the projection, scored by **silhouette**, **Davies–Bouldin**, and **Calinski–Harabasz**, written to the tidy `statistics.parquet` (the bundle's 5th part). The cluster count K is chosen by the inertia **elbow** and/or by **max silhouette** — `--cluster-selection elbow|silhouette|both`. Each selection also becomes a per-protein membership column — `cluster_elbow_` and/or `cluster_silhouette_` — with the point's **silhouette attached to its value** as `cluster N|` (the same `value|score` convention as UniProt evidence codes / InterPro bit scores; suppressed by `--no-scores`). Membership columns get an auto Kelly-palette legend (the bundle's 4th settings part); in `statistics.parquet` the two selections are distinguished by `label_kind` (`kmeans_elbow` / `kmeans_silhouette`). +- **Faithfulness** — how well the projection preserves the source embedding's structure; each row is tagged `scope`: + - **local** (kNN-neighbourhood): **kNN-overlap**, **trustworthiness**, **continuity**. + - **global** (whole-layout): **random_triplet** (relative-ordering accuracy over random triplets, ∈[0,1]) and **spearman_distance** (rank correlation of all pairwise distances, ∈[−1,1]). + + These per-projection scalars ride in each projection's `info_json.quality`. Notes: -- Off by default — the compute (a KMeans elbow sweep) and the extra bundle columns/styles are opt-in. +- Off by default — the compute (a KMeans sweep + faithfulness) and the extra bundle columns/styles are opt-in. - Uses the projection's own high-dim metric (e.g. `cosine`) for faithfulness; falls back to `--metric` / `euclidean` when the reducer doesn't record one. - Best-effort: a failure for one statistic or projection is logged and skipped, never failing the run. At large scale the heavier metrics are subsampled (silhouette/faithfulness) or fit on a bounded subsample (KMeans elbow) with a deterministic seed. diff --git a/notebooks/ProtSpace_Preparation.ipynb b/notebooks/ProtSpace_Preparation.ipynb index 1853235..af715e5 100644 --- a/notebooks/ProtSpace_Preparation.ipynb +++ b/notebooks/ProtSpace_Preparation.ipynb @@ -313,7 +313,10 @@ "source": [ "## 📊 Quality statistics (optional)\n", "\n", - "Gauge how well each projection preserves your data. The CLI can bake two metric families into the bundle — **cluster-validity** (silhouette, Davies–Bouldin, Calinski–Harabasz) and **faithfulness** (kNN-overlap, trustworthiness, continuity):\n", + "Gauge how well each projection preserves your data. The CLI bakes two metric families into the bundle:\n", + "\n", + "- **cluster-validity** — silhouette, Davies–Bouldin, Calinski–Harabasz on a KMeans clustering; choose the cluster count K by `elbow`, `silhouette`, or `both` (`--cluster-selection`).\n", + "- **faithfulness** — *local* neighbourhood preservation (kNN-overlap, trustworthiness, continuity) and *global* layout preservation (random_triplet, spearman_distance).\n", "\n", "```bash\n", "# inline during prepare (opt-in)\n", @@ -323,7 +326,7 @@ "protspace stats -i embeddings.h5 -p output/tmp -o statistics.parquet\n", "```\n", "\n", - "These also add auto-colored per-protein `cluster_` / `silhouette_` columns you can explore directly in the viewer. See [the CLI docs](https://github.com/tsenoner/protspace/blob/main/docs/cli.md#projection-statistics---stats)." + "This also adds an auto-colored per-protein `cluster_elbow_` membership column — with each point's silhouette confidence attached to its value — that you can explore directly in the viewer. See [the CLI docs](https://github.com/tsenoner/protspace/blob/main/docs/cli.md#projection-statistics---stats)." ] }, { diff --git a/src/protspace/cli/common_options.py b/src/protspace/cli/common_options.py index f691203..7c62378 100644 --- a/src/protspace/cli/common_options.py +++ b/src/protspace/cli/common_options.py @@ -16,6 +16,14 @@ class Metric(str, Enum): manhattan = "manhattan" +class ClusterSelection(str, Enum): + """How `--stats` chooses the cluster count K.""" + + elbow = "elbow" # inertia elbow (default) + silhouette = "silhouette" # max-silhouette K + both = "both" # emit both clusterings + + # --------------------------------------------------------------------------- # Shared option types # --------------------------------------------------------------------------- diff --git a/src/protspace/cli/prepare.py b/src/protspace/cli/prepare.py index 09f2131..78c9c35 100644 --- a/src/protspace/cli/prepare.py +++ b/src/protspace/cli/prepare.py @@ -18,6 +18,7 @@ from protspace.cli.app import app, setup_logging from protspace.cli.common_options import ( + ClusterSelection, Metric, Opt_BatchSize, Opt_Eps, @@ -120,8 +121,18 @@ typer.Option( "--stats/--no-stats", help="Compute projection quality statistics (cluster-validity + " - "faithfulness); adds cluster_*/silhouette_* columns + legend styles to the " - "bundle. Opt-in (off by default): can be slow on large runs.", + "faithfulness); adds cluster_* membership columns (with per-point " + "silhouette confidence) + legend styles to the bundle. Opt-in (off by " + "default): can be slow on large runs.", + rich_help_panel="Output", + ), +] +Opt_ClusterSelection = Annotated[ + ClusterSelection, + typer.Option( + "--cluster-selection", + help="With --stats, how to choose the cluster count K: 'elbow' (default), " + "'silhouette' (max-silhouette K), or 'both' (emit both clusterings).", rich_help_panel="Output", ), ] @@ -301,6 +312,7 @@ def prepare( annotations: Opt_Annotations = None, scores: Opt_Scores = True, stats: Opt_Stats = False, + cluster_selection: Opt_ClusterSelection = ClusterSelection.elbow, refetch: Opt_Refetch = None, # Output output: Opt_Output = Path("."), @@ -517,6 +529,7 @@ def prepare( keep_tmp=keep_tmp, no_scores=not scores, stats=stats, + cluster_selection=cluster_selection.value, refetch_stages=refetch_stages, annotations=annotation_list, intermediate_dir=cache_dir, diff --git a/src/protspace/cli/stats.py b/src/protspace/cli/stats.py index 202b662..fce1e3f 100644 --- a/src/protspace/cli/stats.py +++ b/src/protspace/cli/stats.py @@ -14,6 +14,7 @@ import typer from protspace.cli.app import app, setup_logging +from protspace.cli.common_options import ClusterSelection logger = logging.getLogger(__name__) @@ -148,11 +149,11 @@ def _merge_quality_into_metadata(meta_path: Path, quality_by_name: dict) -> None def _merge_annotations_with_columns(ann_path: Path, report) -> int: """Merge the report's per-protein ``AnnotationColumn``s into ``ann_path``. - Rewrites the annotations parquet in place with the computed ``cluster_*`` / - ``silhouette_*`` columns joined by identifier. Added columns are stringified - (membership → category labels, silhouette → numeric strings, absent → empty) - so they match the prepare path's all-string annotations and the frontend's - content-based type inference. Returns the number of columns added. + Rewrites the annotations parquet in place with the computed ``cluster_*`` + membership columns joined by identifier (each value a ``cluster N`` label with + the per-point silhouette attached as ``|score``). Added columns are stringified + (absent → empty) so they match the prepare path's all-string annotations and the + frontend's content-based type inference. Returns the number of columns added. """ import pyarrow as pa import pyarrow.parquet as pq @@ -199,7 +200,8 @@ def stats( "-a", "--annotations", help="Annotations parquet to enrich in place with per-protein " - "cluster-membership + silhouette columns. Omit to skip per-protein outputs.", + "cluster-membership columns (per-point silhouette attached as |score). " + "Omit to skip per-protein outputs.", ), ] = None, settings_out: Annotated[ @@ -218,6 +220,14 @@ def stats( help="High-dim distance metric for faithfulness when the projection metadata omits one (e.g. PCA/MDS).", ), ] = "euclidean", + cluster_selection: Annotated[ + ClusterSelection, + typer.Option( + "--cluster-selection", + help="How to choose the cluster count K: 'elbow' (default), 'silhouette' " + "(max-silhouette K), or 'both' (emit both clusterings).", + ), + ] = ClusterSelection.elbow, verbose: Annotated[ int, typer.Option("-v", "--verbose", count=True, help="Increase verbosity.") ] = 0, @@ -251,10 +261,12 @@ def stats( ) reductions = _load_reductions(projections, default_metric=metric) - # Per-protein outputs (cluster membership + per-point silhouette) are only - # computed when there's an annotations file to land them in — silhouette_samples + # Per-protein output (cluster membership with attached per-point silhouette) is + # only computed when there's an annotations file to land it in — silhouette_samples # is O(n^2), so we don't pay for it with nowhere to write. - params = {} if annotations is not None else {"cluster_annotations": False} + params = {"cluster_selection": cluster_selection.value} + if annotations is None: + params["cluster_annotations"] = False report = compute_statistics( embedding_sets, reductions, diff --git a/src/protspace/data/processors/pipeline.py b/src/protspace/data/processors/pipeline.py index 96a340a..c0d7ffa 100644 --- a/src/protspace/data/processors/pipeline.py +++ b/src/protspace/data/processors/pipeline.py @@ -74,6 +74,7 @@ class PipelineConfig: keep_tmp: bool = False no_scores: bool = False stats: bool = False + cluster_selection: str = "elbow" # elbow | silhouette | both (for --stats) refetch_stages: frozenset[str] = field(default_factory=frozenset) annotations: list[str] | None = None intermediate_dir: Path | None = None @@ -731,6 +732,12 @@ def _compute_statistics( embedding_sets, all_reductions, rng_seed=self.config.reducer_params.random_state, + params={ + "cluster_selection": self.config.cluster_selection, + # Silhouette-as-confidence on cluster values is a score, so it + # honours --no-scores like UniProt/InterPro annotation scores. + "include_scores": not self.config.no_scores, + }, # Faithfulness high-dim metric: reducers like PCA/MDS/PaCMAP omit # 'metric' from their params, so fall back to the run's metric # rather than silently assuming euclidean. diff --git a/src/protspace/stats/carriage.py b/src/protspace/stats/carriage.py index 8b599ae..ad0ea7c 100644 --- a/src/protspace/stats/carriage.py +++ b/src/protspace/stats/carriage.py @@ -75,9 +75,10 @@ def merge_annotation_columns( Each ``AnnotationColumn`` is joined onto ``frame`` by identifier (proteins absent from a column get no value, not a fabricated one). Mutates ``frame`` in - place and returns the names of the columns added — membership stays a - non-numeric string, per-point silhouette a float, so the downstream - ``.astype(str)`` writer yields categorical / continuous inference respectively. + place and returns the names of the columns added — membership values are + non-numeric ``cluster N`` strings (optionally carrying an attached + ``|silhouette`` confidence, like ECO / InterPro bit scores), so the downstream + ``.astype(str)`` writer keeps them categorical. """ if id_col not in getattr(frame, "columns", []): return [] @@ -102,9 +103,9 @@ def build_cluster_legend_settings(report: StatsReport) -> dict: Returns ``{column_name: LegendPersistedSettings}`` (the bundle's settings part format) with a full envelope per ``categorical`` ``AnnotationColumn`` — every field the frontend's ``sanitizeLegendSettingsEntry`` requires, categories keyed - by the exact label strings with a Kelly-palette ``color`` + ``zOrder`` + ``shape`` - — so clusters are colored when selected without a manual styling step. Numeric - columns (per-point silhouette) are left to the default continuous ramp. + by the bare ``cluster N`` label (any attached ``|silhouette`` confidence is + stripped) with a Kelly-palette ``color`` + ``zOrder`` + ``shape`` — so clusters + are colored when selected without a manual styling step. """ from protspace.data.io.settings_converter import KELLYS_COLORS @@ -112,7 +113,13 @@ def build_cluster_legend_settings(report: StatsReport) -> dict: for col in report.annotation_columns: if col.kind != "categorical": continue - labels = sorted(set(col.values.values()), key=_cluster_label_sort_key) + # Membership values may carry an attached ``|silhouette`` confidence + # (value|score) — strip it to recover the bare "cluster N" category, matching + # how the frontend splits score-bearing annotation values. + labels = sorted( + {str(v).split("|", 1)[0] for v in col.values.values()}, + key=_cluster_label_sort_key, + ) categories = { label: { "zOrder": i, diff --git a/src/protspace/stats/cluster/kmeans_elbow.py b/src/protspace/stats/cluster/kmeans_elbow.py index 54f0374..ceff109 100644 --- a/src/protspace/stats/cluster/kmeans_elbow.py +++ b/src/protspace/stats/cluster/kmeans_elbow.py @@ -25,6 +25,10 @@ class ElbowResult: k_range: list[int] inertia: list[float] knee_confidence: str # "high" | "low" + # Alternative K maximising the (sampled) silhouette over the sweep, and its + # full-coverage labels — populated only when ``silhouette_selection`` is set. + silhouette_k: int | None = None + silhouette_labels: np.ndarray | None = None def chord_deviation(y: np.ndarray) -> np.ndarray: @@ -54,6 +58,8 @@ def kmeans_elbow( n_init: int = 10, knee_min_deviation: float = 0.05, max_fit_sample: int = 50_000, + silhouette_selection: bool = False, + silhouette_sample: int = 5000, ) -> ElbowResult | None: """Sweep KMeans over K and pick the elbow via max chord deviation. @@ -64,6 +70,11 @@ def kmeans_elbow( then labels for *all* n points are recovered with a single ``predict`` pass. At or below the threshold the full-batch ``KMeans`` is used, so small/medium inputs are unchanged. + + When ``silhouette_selection`` is set, the K maximising the (sampled) silhouette + over the sweep is also returned (``silhouette_k`` / ``silhouette_labels``) as an + alternative to the elbow K. This costs one silhouette pass per K, so it is + computed only on request; ``silhouette_sample`` bounds that cost. """ from sklearn.cluster import KMeans, MiniBatchKMeans @@ -103,20 +114,49 @@ def _labels_full(k: int) -> np.ndarray: km = models_by_k[k] return km.predict(X) if subsample else km.labels_ + def _silhouette_k(): + # K maximising the (sampled) silhouette over the sweep, scored on x_fit + # with each K's fitted labels. Returns (k, full-coverage labels) or (None, None). + from sklearn.metrics import silhouette_score + + kw = {} + if x_fit.shape[0] > silhouette_sample: + kw = {"sample_size": silhouette_sample, "random_state": rng_seed} + best_k, best_s = None, -np.inf + for kk in k_range: + try: + s = float(silhouette_score(x_fit, models_by_k[kk].labels_, **kw)) + except Exception: # noqa: BLE001 - a degenerate K is skipped + continue + if s > best_s: + best_s, best_k = s, kk + if best_k is None: + return None, None + return best_k, _labels_full(best_k) + + sil_k, sil_labels = _silhouette_k() if silhouette_selection else (None, None) + if len(k_range) < 3: # Too short to find a chord knee; take the smallest K, flag low confidence. k = k_range[0] - return ElbowResult(k, _labels_full(k), k_range, inertia, "low") - - dev = chord_deviation(np.asarray(inertia, dtype=float)) - k_idx = int(np.argmax(dev)) - k = k_range[k_idx] - # With only 3 swept points the chord knee is structurally pinned to the middle - # K; require a wider sweep before claiming high confidence. - knee_confidence = ( - "high" - if len(k_range) >= 4 and float(dev.max()) >= knee_min_deviation - else "low" + knee_confidence = "low" + else: + dev = chord_deviation(np.asarray(inertia, dtype=float)) + k = k_range[int(np.argmax(dev))] + # With only 3 swept points the chord knee is structurally pinned to the middle + # K; require a wider sweep before claiming high confidence. + knee_confidence = ( + "high" + if len(k_range) >= 4 and float(dev.max()) >= knee_min_deviation + else "low" + ) + + return ElbowResult( + k, + _labels_full(k), + k_range, + inertia, + knee_confidence, + silhouette_k=sil_k, + silhouette_labels=sil_labels, ) - - return ElbowResult(k, _labels_full(k), k_range, inertia, knee_confidence) diff --git a/src/protspace/stats/driver.py b/src/protspace/stats/driver.py index a2cfa0e..252b1a7 100644 --- a/src/protspace/stats/driver.py +++ b/src/protspace/stats/driver.py @@ -91,7 +91,10 @@ def compute_statistics( (embedding name), ``ids`` (coords row identifiers), and ``info`` (reducer params, used for the high-dim ``metric``). rng_seed: deterministic seed. - params: tunables (``k``, ``k_max``, ``sample_threshold``, ``hard_ceiling``). + params: tunables — ``k``, ``k_max``, ``sample_threshold``, ``hard_ceiling``, + ``max_fit_sample``, ``n_triplets_per_point``; ``cluster_selection`` + (``elbow`` | ``silhouette`` | ``both``); ``cluster_annotations`` and + ``include_scores`` (per-protein membership column + attached silhouette). Returns: A ``StatsReport`` (may be partial/empty; never raises on a statistic error). diff --git a/src/protspace/stats/metrics/faithfulness.py b/src/protspace/stats/metrics/faithfulness.py index 166222e..953979d 100644 --- a/src/protspace/stats/metrics/faithfulness.py +++ b/src/protspace/stats/metrics/faithfulness.py @@ -1,4 +1,12 @@ -"""Projection-faithfulness statistics: kNN-overlap, trustworthiness, continuity. +"""Projection-faithfulness statistics vs. the source embedding. + +Two families (tagged by ``scope`` in each row's ``extra``): + local — kNN-neighbourhood preservation: ``knn_overlap``, ``trustworthiness``, + ``continuity`` (do nearby points stay nearby?). + global — whole-layout preservation: ``random_triplet`` (relative-ordering + accuracy over random triplets) and ``spearman_distance`` (rank + correlation of all pairwise distances) — the mid/long-range structure + the local metrics miss. These compare a projection to its *source embedding*. The high-dimensional distance metric (the reducer's own metric, euclidean by default unless the @@ -33,6 +41,7 @@ DEFAULT_K = 15 DEFAULT_SAMPLE_THRESHOLD = 5000 DEFAULT_HARD_CEILING = 20000 +DEFAULT_N_TRIPLETS_PER_POINT = 5 def _subsample_seed(rng_seed: int, ids: list[str]) -> int: @@ -102,6 +111,61 @@ def _continuity(embedding, coords, k: int, metric: str) -> float: return float(c) +def _random_triplet_accuracy( + embedding, coords, k_per_point: int, metric: str, rng +) -> float: + """Global structure: fraction of random triplets (i, j, l) whose distance + ordering agrees between the embedding and the projection. + + For each anchor i and two random others j, l: does "is j or l closer to i?" + match in high-dim (``metric``) and low-dim (euclidean)? 0.5 ≈ chance, 1.0 = + every relative ordering preserved. Samples ``k_per_point`` triplets per point + (O(n·k_per_point)), so it probes mid/long-range layout, unlike the kNN metrics. + """ + from sklearn.metrics.pairwise import paired_distances + + n = embedding.shape[0] + anchors = np.repeat(np.arange(n), k_per_point) + t = anchors.shape[0] + # Two DISTINCT others per anchor, neither equal to the anchor. Offsets in + # [1, n-1] added mod n never land on the anchor; drawing m's offset from + # [1, n-2] then skipping over j's offset guarantees j != m. This excludes + # self-pairs (distance 0), which would trivially agree and inflate the score. + off_j = rng.integers(1, n, t) + off_m = rng.integers(1, n - 1, t) + off_m += off_m >= off_j + j = (anchors + off_j) % n + m = (anchors + off_m) % n + emb_a, coords_a = embedding[anchors], coords[anchors] + d_hi_j = paired_distances(emb_a, embedding[j], metric=metric) + d_hi_m = paired_distances(emb_a, embedding[m], metric=metric) + d_lo_j = paired_distances(coords_a, coords[j], metric="euclidean") + d_lo_m = paired_distances(coords_a, coords[m], metric="euclidean") + agree = (d_hi_j < d_hi_m) == (d_lo_j < d_lo_m) + return float(np.mean(agree)) + + +def _spearman_distance(embedding, coords, metric: str) -> float: + """Global structure: Spearman (rank) correlation between all pairwise + embedding distances (``metric``) and projection distances (euclidean). + + Uses every unique pair (upper triangle), so it measures whether the *overall* + distance layout — not just local neighbourhoods — is preserved. Range [-1, 1], + higher = better. Computed with numpy ranks + Pearson (no scipy dependency). + """ + from sklearn.metrics import pairwise_distances + + n = embedding.shape[0] + iu = np.triu_indices(n, k=1) + hi = pairwise_distances(embedding, metric=metric)[iu] + lo = pairwise_distances(coords, metric="euclidean")[iu] + # Spearman = Pearson on ranks; argsort-of-argsort gives ordinal 0-based ranks + # (ties broken by index — a negligible deviation from midrank Spearman here). + rank_hi = np.argsort(np.argsort(hi)) + rank_lo = np.argsort(np.argsort(lo)) + return float(np.corrcoef(rank_hi, rank_lo)[0, 1]) + + class FaithfulnessStatistic: """kNN-overlap + trustworthiness + continuity of a projection vs its embedding.""" @@ -125,13 +189,7 @@ def compute(self, ctx: StatContext) -> list[StatRow]: if n < 3: return [] - k = int(ctx.params.get("k", DEFAULT_K)) - sample_threshold = int( - ctx.params.get("sample_threshold", DEFAULT_SAMPLE_THRESHOLD) - ) hard_ceiling = int(ctx.params.get("hard_ceiling", DEFAULT_HARD_CEILING)) - hi_metric = ctx.high_dim_metric or "euclidean" - base = { "space_kind": ctx.space_kind, "space_name": ctx.space_name, @@ -143,6 +201,8 @@ def compute(self, ctx: StatContext) -> list[StatRow]: "destination": "projection_metadata", } + # Bail before the canonical sort/copy below: past the ceiling every metric + # is skipped anyway, so sorting and copying emb/coords would be pure waste. if n > hard_ceiling: return [ StatRow( @@ -158,14 +218,28 @@ def compute(self, ctx: StatContext) -> list[StatRow]: ) ] + # Canonicalise row order by id up front so EVERY metric depends only on the + # id-SET, not the input row order. The kNN/Spearman metrics are already + # order-invariant, but the position-based triplet sampling below is not — so + # sorting here (matching the id-derived subsample seed) makes random_triplet + # reproducible across differently-ordered inputs too. + canonical = np.argsort(np.asarray(ids), kind="stable") + emb = emb[canonical] + coords = coords[canonical] + ids = [ids[int(i)] for i in canonical] + + k = int(ctx.params.get("k", DEFAULT_K)) + sample_threshold = int( + ctx.params.get("sample_threshold", DEFAULT_SAMPLE_THRESHOLD) + ) + hi_metric = ctx.high_dim_metric or "euclidean" + sampled = False if n > sample_threshold: rng = np.random.default_rng(_subsample_seed(ctx.rng_seed, ids)) - # Select in canonical id order so WHICH proteins are sampled depends - # only on the id-set (matching the order-invariant seed), not on the - # input row order. order[r] = original row of the r-th smallest id. - order = np.argsort(np.asarray(ids), kind="stable") - idx = np.sort(order[rng.permutation(n)[:sample_threshold]]) + # Rows are already in canonical id order, so a positional draw is itself + # id-canonical and thus row-order invariant. + idx = np.sort(rng.permutation(n)[:sample_threshold]) emb = emb[idx] coords = coords[idx] n = len(idx) @@ -182,34 +256,59 @@ def compute(self, ctx: StatContext) -> list[StatRow]: "sample_size": int(n), "embedding": ctx.embedding_name, } - # Each metric differs only in its value computation and the high-dim metric - # recorded in ``extra``. Trustworthiness and continuity are duals that both - # rank the embedding by ``hi_metric`` (continuity via ``_continuity`` since - # sklearn can only metric-rank its first arg). Each is best-effort — a - # failure drops only that row. + n_per_point = int( + ctx.params.get("n_triplets_per_point", DEFAULT_N_TRIPLETS_PER_POINT) + ) + triplet_rng = np.random.default_rng(ctx.rng_seed) + # Two families, each entry (name, value_fn, extra). ``scope="local"`` are the + # kNN-neighbourhood metrics (trustworthiness/continuity are metric-consistent + # duals via ``_continuity``); ``scope="global"`` probe the whole-layout + # structure the local metrics miss. Each is best-effort — a failure drops + # only that row. + local = {"metric": hi_metric, "scope": "local"} metrics = ( - ("knn_overlap", lambda: _knn_overlap(emb, coords, k, hi_metric), hi_metric), + ( + "knn_overlap", + lambda: _knn_overlap(emb, coords, k, hi_metric), + local, + ), ( "trustworthiness", lambda: float( trustworthiness(emb, coords, n_neighbors=k, metric=hi_metric) ), - hi_metric, + local, ), ( "continuity", lambda: _continuity(emb, coords, k, hi_metric), - hi_metric, + local, + ), + ( + "random_triplet", + lambda: _random_triplet_accuracy( + emb, coords, n_per_point, hi_metric, triplet_rng + ), + { + "metric": hi_metric, + "scope": "global", + "n_triplets": int(n_per_point * n), + }, + ), + ( + "spearman_distance", + lambda: _spearman_distance(emb, coords, hi_metric), + {"metric": hi_metric, "scope": "global"}, ), ) rows: list[StatRow] = [] - for metric_name, value_fn, extra_metric in metrics: + for metric_name, value_fn, extra_extra in metrics: try: rows.append( StatRow( metric=metric_name, value=value_fn(), - extra={**common, "metric": extra_metric}, + extra={**common, **extra_extra}, **base, ) ) diff --git a/src/protspace/stats/metrics/validity.py b/src/protspace/stats/metrics/validity.py index 8dd239e..e2a12ac 100644 --- a/src/protspace/stats/metrics/validity.py +++ b/src/protspace/stats/metrics/validity.py @@ -1,14 +1,25 @@ """Cluster-validity statistics on projection coordinates. -KMeans (with an elbow-chosen K) labels the projection; silhouette, Davies-Bouldin -and Calinski-Harabasz score that labelling. The chosen K is emitted as a +KMeans labels the projection; silhouette, Davies-Bouldin and Calinski-Harabasz +score that labelling. The K can be chosen by the inertia **elbow** and/or by +**max silhouette** (``ctx.params["cluster_selection"]`` = ``elbow`` | ``silhouette`` +| ``both``); each selection is emitted with its own ``label_kind`` +(``kmeans_elbow`` / ``kmeans_silhouette``). The chosen K is emitted as a ``metric_kind="meta"`` row so consumers can exclude it from validity aggregates. +Each labelling also becomes a per-protein ``cluster_*`` membership column whose +per-point silhouette rides along as an attached ``value|score`` confidence — the +same convention as UniProt evidence codes / InterPro bit scores — so no separate +silhouette column is needed. Suppressed when ``ctx.params["include_scores"]`` is +False (``--no-scores``). + scikit-learn imports are function-local to keep CLI startup fast. """ from __future__ import annotations +from typing import NamedTuple + import numpy as np from protspace.stats.base import AnnotationColumn, StatContext, StatRow @@ -37,38 +48,91 @@ def _silhouette(X, labels, *, rng_seed: int, sample_threshold: int): return float(silhouette_score(X, labels)), {"sampled": False, "sample_size": int(n)} +class _Labeling(NamedTuple): + """One K-selection's clustering: how it was chosen + its column and labels.""" + + label_kind: str # "kmeans_elbow" | "kmeans_silhouette" (statistics.parquet tag) + col_name: str # "cluster_elbow_" | "cluster_silhouette_" + selection_name: str # "elbow" | "silhouette" + requested_k: int + labels: np.ndarray + + class ClusterValidityStatistic: - """Elbow K + silhouette / Davies-Bouldin / Calinski-Harabasz on the coords.""" + """Elbow / silhouette K + silhouette / Davies-Bouldin / Calinski-Harabasz.""" family = "cluster_validity" requires_embedding = False def compute(self, ctx: StatContext) -> list: - from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score - X = np.asarray(ctx.coords, dtype=float) n = X.shape[0] - rng_seed = ctx.rng_seed - sample_threshold = int( - ctx.params.get("sample_threshold", DEFAULT_SAMPLE_THRESHOLD) - ) - k_max = ctx.params.get("k_max") + params = ctx.params + sample_threshold = int(params.get("sample_threshold", DEFAULT_SAMPLE_THRESHOLD)) + selection = str(params.get("cluster_selection", "elbow")).lower() + # The CLI validates this via a Typer enum; guard the raw stats API too so an + # unrecognised value falls back to the default rather than silently emitting + # no labelling at all (best-effort: never drop a projection's whole output). + if selection not in ("elbow", "silhouette", "both"): + selection = "elbow" res = kmeans_elbow( X, - rng_seed=rng_seed, - k_max=k_max, - max_fit_sample=int( - ctx.params.get("max_fit_sample", DEFAULT_MAX_FIT_SAMPLE) - ), + rng_seed=ctx.rng_seed, + k_max=params.get("k_max"), + max_fit_sample=int(params.get("max_fit_sample", DEFAULT_MAX_FIT_SAMPLE)), + silhouette_selection=selection in ("silhouette", "both"), + silhouette_sample=sample_threshold, ) if res is None: # n < 3 return [] - labels = res.labels - k = res.k + + # Which labelling(s) to emit. Each K-selection method is named explicitly + # (cluster_elbow_ / cluster_silhouette_) so the column name — the + # only signal that survives to the frontend — carries the provenance. + labelings: list[_Labeling] = [] + if selection in ("elbow", "both"): + labelings.append( + _Labeling( + "kmeans_elbow", + f"cluster_elbow_{ctx.space_name}", + "elbow", + res.k, + res.labels, + ) + ) + if selection in ("silhouette", "both") and res.silhouette_labels is not None: + labelings.append( + _Labeling( + "kmeans_silhouette", + f"cluster_silhouette_{ctx.space_name}", + "silhouette", + int(res.silhouette_k), + res.silhouette_labels, + ) + ) + + out: list = [] + for labeling in labelings: + out.extend(self._emit_labeling(ctx, X, n, res, labeling)) + return out + + def _emit_labeling(self, ctx, X, n, res, labeling: _Labeling) -> list: + """Rows + membership column for one labelling (elbow or silhouette-K).""" + from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score + + rng_seed = ctx.rng_seed + params = ctx.params + sample_threshold = int(params.get("sample_threshold", DEFAULT_SAMPLE_THRESHOLD)) + label_kind = labeling.label_kind + col_name = labeling.col_name + selection_name = labeling.selection_name + labels = labeling.labels + k = int(labeling.requested_k) + # Report the ACHIEVED number of distinct clusters (KMeans can collapse on - # coincident points), keeping the elbow's requested K in extra. The cluster - # sizes also feed the Davies-Bouldin / Calinski-Harabasz singleton guard. + # coincident points), keeping the requested K in extra. The cluster sizes + # also feed the Davies-Bouldin / Calinski-Harabasz singleton guard. unique_labels, label_counts = np.unique(labels, return_counts=True) achieved = int(len(unique_labels)) has_singleton = bool((label_counts < 2).any()) @@ -77,20 +141,19 @@ def compute(self, ctx: StatContext) -> list: "space_kind": ctx.space_kind, "space_name": ctx.space_name, "stat_family": self.family, - "label_kind": "kmeans_elbow", + "label_kind": label_kind, } - # Decide up front whether the exact per-point silhouette column will be - # emitted. When it is, its per-point values are the single source of truth: - # the aggregate `silhouette` row is their mean (== the exact unsampled - # silhouette_score by definition), so the headline value equals - # mean(silhouette_{proj}) and no redundant sampled score is computed. + # Decide up front whether the exact per-point silhouette will be computed. + # When it is, its mean is the exact aggregate silhouette (== unsampled + # silhouette_score) AND its per-point values ride along on the membership + # column, so nothing is computed twice. silhouette_ok = 2 <= k <= n - 1 hard_ceiling = int( - ctx.params.get("silhouette_hard_ceiling", DEFAULT_SILHOUETTE_HARD_CEILING) + params.get("silhouette_hard_ceiling", DEFAULT_SILHOUETTE_HARD_CEILING) ) want_per_point = ( - ctx.params.get("cluster_annotations", True) + params.get("cluster_annotations", True) and achieved >= 2 and len(ctx.ids) == n ) @@ -103,19 +166,22 @@ def compute(self, ctx: StatContext) -> list: except Exception: # noqa: BLE001 - per-point silhouette is best-effort per_point_samples = None - # Holds scalar StatRows plus, below, any per-protein AnnotationColumns. + meta_extra = { + "requested_k": k, + "selection": selection_name, + "k_range": [res.k_range[0], res.k_range[-1]], + "inertia": res.inertia, + "seed": rng_seed, + } + if selection_name == "elbow": + meta_extra["knee_confidence"] = res.knee_confidence + rows: list = [ StatRow( metric="n_clusters", metric_kind="meta", value=float(achieved), - extra={ - "requested_k": k, - "k_range": [res.k_range[0], res.k_range[-1]], - "inertia": res.inertia, - "knee_confidence": res.knee_confidence, - "seed": rng_seed, - }, + extra=meta_extra, **base, ) ] @@ -125,12 +191,10 @@ def compute(self, ctx: StatContext) -> list: try: if per_point_samples is not None: # Exact aggregate over all n, consistent with the per-point - # silhouette_{proj} column below. + # values attached to the membership column below. sil = float(per_point_samples.mean()) sx = {"sampled": False, "sample_size": int(n)} else: - # No exact column (n > hard_ceiling, disabled, or id mismatch): - # fall back to the sampled/exact silhouette_score. sil, sx = _silhouette( X, labels, rng_seed=rng_seed, sample_threshold=sample_threshold ) @@ -165,41 +229,40 @@ def compute(self, ctx: StatContext) -> list: except Exception: # noqa: BLE001 pass - # Per-protein outputs (route-projection-statistics Phase 2): the elbow-K - # labelling becomes a categorical membership column and per-point silhouette - # a numeric column, both joined by identifier. Gated identically to the - # per_point_samples decision above. + # Per-protein membership: a categorical `cluster N` label, with the per-point + # silhouette attached as a `value|score` confidence (like ECO / InterPro bit + # scores) so a single column carries both membership and its confidence. if want_per_point: - ann_extra = { - "projection": ctx.space_name, - "k": int(k), - "seed": rng_seed, - "computed": True, - } - # Membership as NON-numeric label strings so the frontend's content-based - # type inference reads the column as categorical, not a numeric ramp. + include_scores = bool(params.get("include_scores", True)) + sil_by_id = {} + if per_point_samples is not None: + sil_by_id = { + pid: float(s) + for pid, s in zip(ctx.ids, per_point_samples, strict=False) + } + + def _membership(pid, lbl): + label = f"cluster {int(lbl)}" + if include_scores and pid in sil_by_id: + return f"{label}|{sil_by_id[pid]:.4f}" + return label + rows.append( AnnotationColumn( - name=f"cluster_{ctx.space_name}", + name=col_name, kind="categorical", values={ - pid: f"cluster {int(lbl)}" + pid: _membership(pid, lbl) for pid, lbl in zip(ctx.ids, labels, strict=False) }, - extra=ann_extra, + extra={ + "projection": ctx.space_name, + "selection": selection_name, + "k": k, + "seed": rng_seed, + "computed": True, + "has_silhouette_score": bool(sil_by_id) and include_scores, + }, ) ) - if per_point_samples is not None: - rows.append( - AnnotationColumn( - name=f"silhouette_{ctx.space_name}", - kind="numeric", - values={ - pid: float(s) - for pid, s in zip(ctx.ids, per_point_samples, strict=False) - }, - extra=ann_extra, - ) - ) - return rows diff --git a/tests/test_stats.py b/tests/test_stats.py index ac7db1c..284578a 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -554,7 +554,7 @@ def test_report_collects_annotation_columns_separately_from_rows(): assert [c.name for c in report.annotation_columns] == ["cluster_PCA_2"] -def test_cluster_validity_emits_per_protein_annotation_columns(): +def test_cluster_validity_emits_membership_with_attached_silhouette(): from protspace.stats.base import AnnotationColumn X, _ = _blobs(n=200, centers=4, dim=2, seed=31) @@ -563,21 +563,17 @@ def test_cluster_validity_emits_per_protein_annotation_columns(): StatContext("projection", "PCA_2", coords=X, ids=ids) ) cols = {o.name: o for o in outs if isinstance(o, AnnotationColumn)} - assert {"cluster_PCA_2", "silhouette_PCA_2"} <= set(cols) - - mem = cols["cluster_PCA_2"] + # Single membership column now; the per-point silhouette rides on its value as + # `cluster N|` (like ECO / InterPro bit scores), not a 2nd column. + assert set(cols) == {"cluster_elbow_PCA_2"} + mem = cols["cluster_elbow_PCA_2"] assert mem.destination == "annotation" and mem.kind == "categorical" assert set(mem.values) == set(ids) # one value per protein, joined by id - # non-numeric label strings so content-based inference reads categorical - assert all( - isinstance(v, str) and v.startswith("cluster ") for v in mem.values.values() - ) - assert mem.extra["k"] >= 2 and mem.extra.get("computed") is True - - sil = cols["silhouette_PCA_2"] - assert sil.kind == "numeric" - assert set(sil.values) == set(ids) - assert all(isinstance(v, float) for v in sil.values.values()) + assert mem.extra["has_silhouette_score"] is True + for v in mem.values.values(): + label, _, score = v.partition("|") + assert label.startswith("cluster ") # categorical part + assert -1.0 <= float(score) <= 1.0 # attached per-point silhouette def test_per_point_silhouette_skipped_beyond_hard_ceiling(): @@ -594,9 +590,12 @@ def test_per_point_silhouette_skipped_beyond_hard_ceiling(): params={"silhouette_hard_ceiling": 50}, # n=200 > 50 ) ) - names = {o.name for o in outs if isinstance(o, AnnotationColumn)} - assert "cluster_PCA_2" in names # membership is cheap, still emitted - assert "silhouette_PCA_2" not in names # O(n^2) silhouette skipped + cols = {o.name: o for o in outs if isinstance(o, AnnotationColumn)} + assert set(cols) == {"cluster_elbow_PCA_2"} # membership is cheap, still emitted + mem = cols["cluster_elbow_PCA_2"] + # O(n^2) per-point silhouette skipped → no attached score, plain labels. + assert mem.extra["has_silhouette_score"] is False + assert all("|" not in v for v in mem.values.values()) def test_cluster_annotations_can_be_disabled(): @@ -693,8 +692,8 @@ def test_elbow_result_has_no_silhouette_optimal_k(): def test_aggregate_silhouette_equals_per_point_mean(): - """When the per-point silhouette column is emitted, the aggregate silhouette is - exactly its mean (consistent, not a separate sampled estimate).""" + """The aggregate silhouette is exactly the mean of the per-point silhouettes + attached to the membership column values (consistent, not a sampled estimate).""" from protspace.stats.base import AnnotationColumn X, _ = _blobs(n=300, centers=4, dim=2, seed=45) @@ -704,23 +703,26 @@ def test_aggregate_silhouette_equals_per_point_mean(): ) agg = next(o for o in outs if isinstance(o, StatRow) and o.metric == "silhouette") col = next( - o for o in outs if isinstance(o, AnnotationColumn) and o.name == "silhouette_P" + o + for o in outs + if isinstance(o, AnnotationColumn) and o.name == "cluster_elbow_P" ) + per_point = [float(v.split("|", 1)[1]) for v in col.values.values()] assert agg.extra["sampled"] is False - assert agg.value == pytest.approx( - float(np.mean(list(col.values.values()))), abs=1e-9 - ) + assert agg.value == pytest.approx(float(np.mean(per_point)), abs=1e-4) -def test_faithfulness_subsample_is_row_order_invariant(): - """With subsampling active, the same id-set in a different row order selects the - same proteins, so faithfulness is identical (row-order invariant). This fails on - the old positional selection.""" +@pytest.mark.parametrize("sample_threshold", [60, 1000]) +def test_faithfulness_is_row_order_invariant(sample_threshold): + """All faithfulness metrics — including the position-sampled random_triplet — + depend only on the id-set, not the input row order, in BOTH the subsampled + (threshold=60 < n) and non-subsampled (threshold=1000 > n) regimes. The old + code broke this for random_triplet in the non-subsampled path.""" X, _ = _blobs(n=120, centers=4, dim=6, seed=46) coords = PCA(n_components=2, random_state=0).fit_transform(X) headers = [f"p{i}" for i in range(120)] emb = _EmbSet("e", X, headers) - params = {"sample_threshold": 60} # n=120 > 60 → subsample path + params = {"sample_threshold": sample_threshold} base = compute_statistics( [emb], @@ -744,6 +746,135 @@ def test_faithfulness_subsample_is_row_order_invariant(): ) b = {r.metric: r.value for r in base.rows} p = {r.metric: r.value for r in permuted.rows} - assert b["trustworthiness"] == pytest.approx(p["trustworthiness"], abs=1e-9) - assert b["knn_overlap"] == pytest.approx(p["knn_overlap"], abs=1e-9) - assert b["continuity"] == pytest.approx(p["continuity"], abs=1e-9) + for metric in ( + "trustworthiness", + "knn_overlap", + "continuity", + "random_triplet", + "spearman_distance", + ): + assert b[metric] == pytest.approx(p[metric], abs=1e-9), metric + + +# --------------------------------------------------------------------------- # +# 8. cluster-selection (elbow / silhouette / both) + global faithfulness +# --------------------------------------------------------------------------- # + + +def test_cluster_selection_silhouette_emits_silhouette_labeling(): + from protspace.stats.base import AnnotationColumn + + X, _ = _blobs(n=200, centers=4, dim=2, seed=51) + ids = [f"p{i}" for i in range(200)] + outs = ClusterValidityStatistic().compute( + StatContext( + "projection", + "PCA_2", + coords=X, + ids=ids, + params={"cluster_selection": "silhouette"}, + ) + ) + cols = {o.name for o in outs if isinstance(o, AnnotationColumn)} + assert cols == {"cluster_silhouette_PCA_2"} + rows = [o for o in outs if isinstance(o, StatRow)] + assert {r.label_kind for r in rows} == {"kmeans_silhouette"} + meta = next(r for r in rows if r.metric == "n_clusters") + assert meta.extra["selection"] == "silhouette" + + +def test_cluster_selection_both_emits_two_labelings(): + from protspace.stats.base import AnnotationColumn + + X, _ = _blobs(n=200, centers=4, dim=2, seed=52) + ids = [f"p{i}" for i in range(200)] + outs = ClusterValidityStatistic().compute( + StatContext( + "projection", + "PCA_2", + coords=X, + ids=ids, + params={"cluster_selection": "both"}, + ) + ) + cols = {o.name for o in outs if isinstance(o, AnnotationColumn)} + assert cols == {"cluster_elbow_PCA_2", "cluster_silhouette_PCA_2"} + kinds = {r.label_kind for r in outs if isinstance(r, StatRow)} + assert kinds == {"kmeans_elbow", "kmeans_silhouette"} + + +def test_cluster_membership_omits_score_when_scores_disabled(): + from protspace.stats.base import AnnotationColumn + + X, _ = _blobs(n=200, centers=4, dim=2, seed=53) + ids = [f"p{i}" for i in range(200)] + outs = ClusterValidityStatistic().compute( + StatContext( + "projection", "P", coords=X, ids=ids, params={"include_scores": False} + ) + ) + mem = next(o for o in outs if isinstance(o, AnnotationColumn)) + assert all("|" not in v for v in mem.values.values()) + assert mem.extra["has_silhouette_score"] is False + + +def test_kmeans_elbow_silhouette_selection_returns_alt_k(): + X, _ = _blobs(n=200, centers=4, dim=2, seed=56) + res = kmeans_elbow(X, rng_seed=42, silhouette_selection=True) + assert res.silhouette_k is not None and res.silhouette_labels is not None + assert len(res.silhouette_labels) == 200 + assert res.silhouette_k in res.k_range + # default (no selection) leaves the alternative-K fields unpopulated + res2 = kmeans_elbow(X, rng_seed=42) + assert res2.silhouette_k is None and res2.silhouette_labels is None + + +def test_faithfulness_emits_global_metrics_tagged_by_scope(): + X, _ = _blobs(n=150, centers=4, dim=8, seed=54) + coords = PCA(n_components=2, random_state=0).fit_transform(X) + ids = [str(i) for i in range(150)] + rows = { + r.metric: r + for r in FaithfulnessStatistic().compute( + StatContext( + "projection", + "P", + coords=coords, + ids=ids, + embedding=X, + embedding_name="e", + ) + ) + } + assert {"random_triplet", "spearman_distance"} <= set(rows) + assert 0.0 <= rows["random_triplet"].value <= 1.0 + assert -1.0 <= rows["spearman_distance"].value <= 1.0 + assert rows["random_triplet"].extra["scope"] == "global" + assert rows["spearman_distance"].extra["scope"] == "global" + assert rows["knn_overlap"].extra["scope"] == "local" + + +def test_global_metrics_higher_for_faithful_projection(): + X, _ = _blobs(n=150, centers=5, dim=8, seed=55) + faithful = PCA(n_components=2, random_state=0).fit_transform(X) + rand = np.random.default_rng(0).normal(size=(150, 2)) + ids = [str(i) for i in range(150)] + + def q(coords): + return { + r.metric: r.value + for r in FaithfulnessStatistic().compute( + StatContext( + "projection", + "P", + coords=coords, + ids=ids, + embedding=X, + embedding_name="e", + ) + ) + } + + good, bad = q(faithful), q(rand) + assert good["spearman_distance"] > bad["spearman_distance"] + assert good["random_triplet"] > bad["random_triplet"] diff --git a/tests/test_stats_carriage.py b/tests/test_stats_carriage.py index 7d3aa33..ad40a64 100644 --- a/tests/test_stats_carriage.py +++ b/tests/test_stats_carriage.py @@ -236,13 +236,13 @@ def test_annotation_columns_are_typed_in_protein_annotations_table(): table = BaseProcessor({}, {})._create_protein_annotations_table(metadata) cols = table.column_names - assert "cluster_P" in cols and "silhouette_P" in cols + # Single membership column; per-point silhouette is attached to its value. + assert "cluster_elbow_P" in cols and "silhouette_P" not in cols d = table.to_pydict() - # membership: non-numeric category labels → categorical inference - assert all(v.startswith("cluster ") for v in d["cluster_P"]) - # per-point silhouette: clean numeric strings → continuous inference - for v in d["silhouette_P"]: - float(v) # must not raise + for v in d["cluster_elbow_P"]: + label, _, score = v.partition("|") + assert label.startswith("cluster ") # categorical part + float(score) # attached per-point silhouette parses as a number def test_router_multi_embedding_routes_each_projection_to_its_own_scores(): @@ -270,3 +270,27 @@ def test_router_multi_embedding_routes_each_projection_to_its_own_scores(): assert qa == driver_vals["A — PCA 2"] assert qb == driver_vals["B — PCA 2"] assert qa != qb # each projection scored against its own embedding + + +def test_legend_settings_strip_attached_silhouette_score(): + """Membership values carry a `|silhouette` confidence; the auto legend must key + categories by the bare `cluster N` label (stripping the attached score).""" + from protspace.stats.base import AnnotationColumn, StatsReport + from protspace.stats.carriage import build_cluster_legend_settings + + report = StatsReport() + report.add( + [ + AnnotationColumn( + name="cluster_P", + kind="categorical", + values={ + "a": "cluster 0|0.5123", + "b": "cluster 1|0.3011", + "c": "cluster 0|0.6200", + }, + ) + ] + ) + settings = build_cluster_legend_settings(report) + assert set(settings["cluster_P"]["categories"]) == {"cluster 0", "cluster 1"} diff --git a/tests/test_stats_cli.py b/tests/test_stats_cli.py index 814ff0c..4db44fd 100644 --- a/tests/test_stats_cli.py +++ b/tests/test_stats_cli.py @@ -214,13 +214,14 @@ def test_stats_command_enriches_annotations_with_computed_columns(tmp_path): df = pq.read_table(str(ann_path)).to_pandas() cluster_cols = [c for c in df.columns if c.startswith("cluster_")] - sil_cols = [c for c in df.columns if c.startswith("silhouette_")] - assert cluster_cols and sil_cols + assert cluster_cols + assert not [c for c in df.columns if c.startswith("silhouette_")] # no 2nd column assert "organism" in df.columns # pre-existing annotation preserved assert "identifier" in df.columns - # membership categorical (non-numeric strings); silhouette numeric strings - assert str(df[cluster_cols[0]].iloc[0]).startswith("cluster ") - float(df[sil_cols[0]].iloc[0]) # must not raise + # membership value = "cluster N|" (category + attached confidence) + label, _, score = str(df[cluster_cols[0]].iloc[0]).partition("|") + assert label.startswith("cluster ") + float(score) # attached per-point silhouette parses as a number def test_stats_without_annotations_does_not_compute_per_protein(tmp_path): @@ -296,7 +297,7 @@ def test_stats_a_then_bundle_carries_computed_columns_into_bundle(tmp_path): ) # protein_annotations is 1st part cols = ann_table.column_names assert any(c.startswith("cluster_") for c in cols) - assert any(c.startswith("silhouette_") for c in cols) + assert not any(c.startswith("silhouette_") for c in cols) # folded into cluster_ assert "protein_id" in cols # identifier renamed by bundle @@ -454,3 +455,28 @@ def __init__(self, name, data, headers): # ...and _compute_statistics routed faithfulness into the reduction's info.quality quality = reductions[0]["info"]["quality"] assert {"knn_overlap", "trustworthiness", "continuity"} <= set(quality) + + +def test_stats_rejects_bad_cluster_selection(tmp_path): + """`--cluster-selection` is validated (fail-fast) rather than silently ignored.""" + from typer.testing import CliRunner + + from protspace.cli.app import app + + h5_path, proj, _ = _project_dir(tmp_path) + out = tmp_path / "statistics.parquet" + result = CliRunner().invoke( + app, + [ + "stats", + "-i", + f"{h5_path}:E", + "-p", + str(proj), + "-o", + str(out), + "--cluster-selection", + "bogus", + ], + ) + assert result.exit_code != 0