diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2212780..835f6b5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -43,14 +43,16 @@ jobs: run: | python -c " import melite - assert hasattr(melite, '__version__'), '__version__ missing' - assert hasattr(melite, 'Config'), 'Config missing' - assert hasattr(melite, 'load_dataset'), 'load_dataset missing' - assert hasattr(melite, 'ResultManager'), 'ResultManager missing' - assert hasattr(melite, 'plot_cv_distributions'), 'plot_cv_distributions missing' - assert hasattr(melite, 'predict'), 'predict missing' + expected = ['Config', 'load_datasets', 'plot_cv_distributions', 'predict', '__version__'] + assert melite.__all__ == expected, melite.__all__ + for name in expected: + assert hasattr(melite, name), f'{name} missing' + assert 'load_dataset' not in melite.__all__, 'load_dataset must not be top-level public API' + assert 'ResultManager' not in melite.__all__, 'ResultManager must not be top-level public API' assert not hasattr(melite, 'Pipeline'), 'Pipeline must not be public' assert not hasattr(melite, '_load_toml'), '_load_toml must not be public' + from melite.result_manager import ResultManager + assert ResultManager is not None, 'ResultManager internal import missing' print(melite.__version__, 'OK') " @@ -73,13 +75,15 @@ jobs: ../.smoke_venv/bin/melite --version ../.smoke_venv/bin/python -c " import melite - assert hasattr(melite, '__version__'), '__version__ missing' - assert hasattr(melite, 'Config'), 'Config missing' - assert hasattr(melite, 'load_dataset'), 'load_dataset missing' - assert hasattr(melite, 'ResultManager'), 'ResultManager missing' - assert hasattr(melite, 'plot_cv_distributions'), 'plot_cv_distributions missing' - assert hasattr(melite, 'predict'), 'predict missing' + expected = ['Config', 'load_datasets', 'plot_cv_distributions', 'predict', '__version__'] + assert melite.__all__ == expected, melite.__all__ + for name in expected: + assert 
hasattr(melite, name), f'{name} missing' + assert 'load_dataset' not in melite.__all__, 'load_dataset must not be top-level public API' + assert 'ResultManager' not in melite.__all__, 'ResultManager must not be top-level public API' assert not hasattr(melite, 'Pipeline'), 'Pipeline must not be public' + from melite.result_manager import ResultManager + assert ResultManager is not None, 'ResultManager internal import missing' print(melite.__version__, 'wheel OK') " @@ -96,12 +100,14 @@ jobs: ../.smoke_sdist_venv/bin/melite --version ../.smoke_sdist_venv/bin/python -c " import melite - assert hasattr(melite, '__version__'), '__version__ missing' - assert hasattr(melite, 'Config'), 'Config missing' - assert hasattr(melite, 'load_dataset'), 'load_dataset missing' - assert hasattr(melite, 'ResultManager'), 'ResultManager missing' - assert hasattr(melite, 'plot_cv_distributions'), 'plot_cv_distributions missing' - assert hasattr(melite, 'predict'), 'predict missing' + expected = ['Config', 'load_datasets', 'plot_cv_distributions', 'predict', '__version__'] + assert melite.__all__ == expected, melite.__all__ + for name in expected: + assert hasattr(melite, name), f'{name} missing' + assert 'load_dataset' not in melite.__all__, 'load_dataset must not be top-level public API' + assert 'ResultManager' not in melite.__all__, 'ResultManager must not be top-level public API' + from melite.result_manager import ResultManager + assert ResultManager is not None, 'ResultManager internal import missing' print(melite.__version__, 'sdist OK') " diff --git a/CHANGELOG.md b/CHANGELOG.md index 60c734e..67388d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,31 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 --- +## [0.2.0] - 2026-05-26 + +### Added +- Added canonical `[datasets.<dataset_id>]` TOML registry entries for + user-defined numeric tabular datasets. +- Added strict generalized dataset loading through `load_datasets(config)`. 
+- Added dataset-aware benchmark result rows with `dataset`, `family`, + `method`, `variant`, `level`, and `description` fields. +- Added dataset-aware final export naming, such as + `Model_SVC_morgan_r2_2048.pkl` and `SVC_morgan_r2_2048.png`. + +### Changed +- `melite run` now consumes `cfg.DATASETS` as the canonical execution path. +- PCA and UMAP inputs are treated as ordinary dataset registry entries. +- Legacy `[benchmark].reduction_types` and `levels` are normalized into + dataset entries when `[datasets]` is absent. +- `melite export` prefers the new `dataset` column and falls back to legacy + `reduction_type` + `level` rows for older CSV files. + +### Fixed +- Registered datasets now fail clearly on missing files, missing `X`, + non-2D or non-numeric `X`, X/y length mismatch, and embedded-y mismatch. + +--- + ## [0.1.11] - 2026-05-26 ### Changed diff --git a/CITATION.cff b/CITATION.cff index 5a45869..9c9a2b0 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -2,7 +2,7 @@ cff-version: 1.2.0 message: "If you use this software, please cite it as below." type: software title: "MELITE: Multi-model Evaluation and Learning for Inference-ready Tabular Experiments" -version: "0.1.11" +version: "0.2.0" date-released: "2026-05-26" authors: - family-names: "Contreras-Torres" diff --git a/README.md b/README.md index 13293ae..0851733 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Project: MELITE PyPI distribution: melite Import package: melite CLI: melite -Version: 0.1.11 +Version: 0.2.0 License: LGPL-3.0-or-later Status: alpha / pre-stable ``` @@ -84,7 +84,7 @@ import numpy as np from melite import predict X_new = np.load("examples/sample_PCA70.npz")["X"] -result = predict("examples/output/Model_SVC_PCA70.pkl", X_new) +result = predict("examples/output/Model_SVC_sample_pca70.pkl", X_new) print(result["predictions"]) print(result["probabilities"]) ``` @@ -98,31 +98,41 @@ print(result["probabilities"]) | Select the best row by F1-macro. 
| Generate PCA or UMAP reductions from raw data. | | Export a final retrained `.pkl` model. | Act as a general AutoML framework. | | Run artifact-based inference through `predict()`. | Promise a stable 1.0 API yet. | -| Handle any numeric tabular matrix. | Use a generalized dataset layer yet; PCA/UMAP naming is historical. | +| Handle any numeric tabular matrix. | Generate or validate domain-specific descriptors. | -!!! note "Current dataset orchestration" - The current dataset orchestration still reflects MELITE's PCA/UMAP origin - and uses concepts such as reduction type and level. Future versions will - generalize dataset definitions so arbitrary prepared tabular matrices can - be registered directly. - -Future configuration may look conceptually like this; it is not current -behavior: +Datasets are registered as concrete tabular matrix candidates under +`[datasets.<dataset_id>]`. The `dataset_id` is user-defined and is used in +`results.csv`, figures, and exported model filenames. ```toml -[datasets.morgan] -path = "data/morgan.npz" +[datasets.morgan_r2_2048] +path = "data/morgan_r2_2048.npz" label_path = "raw/labels.npy" +family = "fingerprints" +method = "Morgan" +variant = "r2_2048" -[datasets.descriptors] -path = "data/descriptors.npz" +[datasets.rdkit_descriptors] +path = "data/rdkit_descriptors.npz" label_path = "raw/labels.npy" +family = "descriptors" +method = "RDKit" [datasets.pca85] path = "data/PCA85.npz" label_path = "raw/labels.npy" +family = "dimensionality" +method = "PCA" +level = 85 ``` +Each registered dataset must define `path` and `label_path`. Optional metadata +fields are `family`, `method`, `variant`, `level`, and `description`; they are +reported for traceability and do not drive special-case model execution. +Registered datasets are loaded strictly: missing files, missing `X`, non-2D or +non-numeric `X`, length mismatches, and embedded `y` mismatches fail the run. 
+Legacy `[benchmark].reduction_types` and `levels` configs are still accepted +and are normalized into equivalent dataset entries such as `PCA70` and `UMAP90`. ## CLI @@ -148,8 +158,7 @@ melite export --row 0 --force ```python from melite import Config -from melite import load_dataset -from melite import ResultManager +from melite import load_datasets from melite import plot_cv_distributions from melite import predict from melite import __version__ @@ -162,14 +171,14 @@ contract and may change before 0.2.0. ```text raw/labels.npy <- target vector y, shape (n_samples,) -data/PCA70.npz <- required key: X, optional key: y +data/morgan_r2_2048.npz <- required key: X, optional key: y +data/rdkit_descriptors.npz data/PCA85.npz -data/UMAP70.npz -data/UMAP85.npz +data/UMAP90.npz ``` Each `.npz` file must contain an `X` array. If an embedded `y` array is present, -MELITE validates it against `raw/labels.npy`. +MELITE validates it against the configured `label_path`. ## Outputs @@ -177,9 +186,9 @@ MELITE validates it against `raw/labels.npy`. output/ |-- results.txt |-- results.csv -|-- Model_<model_name>_<reduction_type><level>.pkl +|-- Model_<model_name>_<dataset_id>.pkl `-- figures/ - `-- <model_name>_<reduction_type><level>.png + `-- <model_name>_<dataset_id>.png ``` Local inputs and generated artifacts such as `raw/`, `data/`, `output/`, @@ -187,7 +196,7 @@ Local inputs and generated artifacts such as `raw/`, `data/`, `output/`, ## Validation -The current `dev/v0.1.11` branch targets: +The current `dev/v0.2.0` branch targets: ```bash python -m pytest tests/ -v --basetemp=.review_pytest_tmp -o cache_dir=.review_pytest_cache diff --git a/docs/api.md b/docs/api.md index a69c6f4..050a425 100644 --- a/docs/api.md +++ b/docs/api.md @@ -1,13 +1,12 @@ # API Reference -MELITE exposes an intended public API through six symbols. The project is +MELITE exposes an intended public API through five symbols. The project is pre-stable, so this API may change before 0.2.0. Internal modules are importable directly but are not part of the public contract. 
```python from melite import Config -from melite import load_dataset -from melite import ResultManager +from melite import load_datasets from melite import plot_cv_distributions from melite import predict from melite import __version__ @@ -21,15 +20,9 @@ from melite import __version__ --- -## load_dataset +## load_datasets -::: melite.load_dataset.load_dataset - ---- - -## ResultManager - -::: melite.result_manager.ResultManager +::: melite.load_dataset.load_datasets --- diff --git a/docs/configuration.md b/docs/configuration.md index 1dc9611..9a6de6b 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -9,8 +9,12 @@ override only the settings that need to change. [paths] output = "my_output/" -[benchmark] -levels = [70, 85, 95] +[datasets.morgan_r2_2048] +path = "data/morgan_r2_2048.npz" +label_path = "raw/labels.npy" +family = "fingerprints" +method = "Morgan" +variant = "r2_2048" [models] active = ["svc", "rf"] @@ -29,14 +33,15 @@ MELITE consumes pre-computed feature matrices and labels: ```text raw/labels.npy <- target vector y, shape (n_samples,) -data/PCA70.npz <- required key: X, optional key: y +data/morgan_r2_2048.npz <- required key: X, optional key: y +data/maccs.npz +data/rdkit_descriptors.npz data/PCA85.npz -data/UMAP70.npz -data/UMAP85.npz +data/UMAP90.npz ``` Each `.npz` file must contain an `X` array. If an embedded `y` array is present, -MELITE validates it against `raw/labels.npy` to avoid silent feature-label +MELITE validates it against the configured `label_path` to avoid silent feature-label mismatches. MELITE is tabular at the modeling level. The learning algorithms only consume @@ -44,26 +49,54 @@ numeric `X` and `y` arrays, so the feature matrix may come from PCA, UMAP, fingerprints, descriptors, clinical variables, experimental measurements, industrial features, or manually selected numeric features. -The current dataset orchestration still reflects MELITE's PCA/UMAP origin and -uses concepts such as reduction type and level. 
Future versions will generalize -dataset definitions so arbitrary prepared tabular matrices can be registered -directly. Future configuration may look conceptually like this; it is not -current behavior: +Each concrete matrix candidate is registered under `[datasets.<dataset_id>]`. +Required fields are `path` and `label_path`. Optional metadata fields are +`family`, `method`, `variant`, `level`, and `description`; they are preserved +in reports for traceability and do not control special-case execution logic. ```toml -[datasets.morgan] -path = "data/morgan.npz" +[datasets.morgan_r2_2048] +path = "data/morgan_r2_2048.npz" +label_path = "raw/labels.npy" +family = "fingerprints" +method = "Morgan" +variant = "r2_2048" + +[datasets.maccs] +path = "data/maccs.npz" label_path = "raw/labels.npy" +family = "fingerprints" +method = "MACCS" -[datasets.descriptors] -path = "data/descriptors.npz" +[datasets.rdkit_descriptors] +path = "data/rdkit_descriptors.npz" label_path = "raw/labels.npy" +family = "descriptors" +method = "RDKit" [datasets.pca85] path = "data/PCA85.npz" label_path = "raw/labels.npy" +family = "dimensionality" +method = "PCA" +level = 85 + +[datasets.umap90] +path = "data/UMAP90.npz" +label_path = "raw/labels.npy" +family = "dimensionality" +method = "UMAP" +level = 90 ``` +Registered datasets are loaded strictly. A missing dataset file, missing +`label_path`, missing `X`, non-2D `X`, non-numeric `X`, length mismatch, or +embedded `y` mismatch raises an error instead of silently skipping the entry. + +Legacy `[benchmark].reduction_types` and `levels` remain supported for +compatibility. When `[datasets]` is absent, MELITE synthesizes entries such as +`PCA70` and `UMAP90` with dimensionality metadata. 
+ ## Outputs By default, MELITE writes results under `output/`: @@ -72,9 +105,9 @@ By default, MELITE writes results under `output/`: output/ |-- results.txt |-- results.csv -|-- Model_<model_name>_<reduction_type><level>.pkl +|-- Model_<model_name>_<dataset_id>.pkl `-- figures/ - `-- <model_name>_<reduction_type><level>.png + `-- <model_name>_<dataset_id>.png ``` | Output | Purpose | diff --git a/docs/index.md b/docs/index.md index 5657608..2e74641 100644 --- a/docs/index.md +++ b/docs/index.md @@ -113,31 +113,36 @@ industrial features, or manually selected numeric features. | Select the best row by F1-macro. | Handle raw molecular data directly. | | Export a final retrained `.pkl` model. | Require internet access at runtime. | | Run artifact-based inference through `predict()`. | Train deep learning models. | -| Handle any numeric tabular matrix. | Use a generalized dataset layer yet; PCA/UMAP naming is historical. | +| Handle any numeric tabular matrix. | Generate descriptors or reductions from raw data. | -!!! note "Current dataset orchestration" - The current dataset orchestration still reflects MELITE's PCA/UMAP origin - and uses concepts such as reduction type and level. Future versions will - generalize dataset definitions so arbitrary prepared tabular matrices can - be registered directly. - -Future configuration may look conceptually like this; it is not current -behavior: +MELITE uses a dataset registry under `[datasets.<dataset_id>]`. Each +`dataset_id` names one concrete numeric `X` matrix candidate. 
```toml -[datasets.morgan] -path = "data/morgan.npz" +[datasets.morgan_r2_2048] +path = "data/morgan_r2_2048.npz" label_path = "raw/labels.npy" +family = "fingerprints" +method = "Morgan" -[datasets.descriptors] -path = "data/descriptors.npz" +[datasets.rdkit_descriptors] +path = "data/rdkit_descriptors.npz" label_path = "raw/labels.npy" +family = "descriptors" +method = "RDKit" [datasets.pca85] path = "data/PCA85.npz" label_path = "raw/labels.npy" +family = "dimensionality" +method = "PCA" +level = 85 ``` +Required fields are `path` and `label_path`; optional metadata fields are +`family`, `method`, `variant`, `level`, and `description`. Legacy +`[benchmark].reduction_types` and `levels` configs are still normalized into +dataset entries when `[datasets]` is absent. ## Quick Example @@ -152,7 +157,7 @@ import numpy as np from melite import predict X_new = np.load("examples/sample_PCA70.npz")["X"] -result = predict("examples/output/Model_SVC_PCA70.pkl", X_new) +result = predict("examples/output/Model_SVC_sample_pca70.pkl", X_new) print(result["predictions"]) ``` @@ -173,9 +178,7 @@ If you use MELITE in your research, please cite it using the metadata in [CITATION.cff](https://github.com/NanoBiostructuresRG/melite/blob/main/CITATION.cff). ```text -Contreras-Torres, F. F., & Murrieta, A. C. (2026). MELITE: Multi-model -Evaluation and Learning for Inference-ready Tabular Experiments (0.1.11). Tecnologico de -Monterrey. https://github.com/NanoBiostructuresRG/melite +Contreras-Torres, F. F., & Murrieta, A. C. (2026). MELITE: Multi-model Evaluation and Learning for Inference-ready Tabular Experiments. Zenodo. 
https://doi.org/10.5281/zenodo.20382752 ``` ## License diff --git a/docs/quickstart.md b/docs/quickstart.md index d32aca6..fa75749 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -35,7 +35,7 @@ import numpy as np from melite import predict X_new = np.load("examples/sample_PCA70.npz")["X"] -result = predict("examples/output/Model_SVC_PCA70.pkl", X_new) +result = predict("examples/output/Model_SVC_sample_pca70.pkl", X_new) print(result["predictions"]) # shape (n_samples,) print(result["probabilities"]) # shape (n_samples, n_classes) diff --git a/docs/release.md b/docs/release.md index e3e7253..d32d7f4 100644 --- a/docs/release.md +++ b/docs/release.md @@ -1,17 +1,27 @@ # Release Notes -MELITE `0.1.11` prepares the project documentation and package metadata for -the first PyPI publication as `melite`. +MELITE `0.2.0` introduces the generalized tabular dataset registry and keeps +legacy PCA/UMAP configuration compatibility. + +## 0.2.0 Highlights + +- Registers concrete tabular matrices under `[datasets.<dataset_id>]`. +- Requires `path` and `label_path`; preserves optional metadata fields + `family`, `method`, `variant`, `level`, and `description`. +- Runs benchmarks through strict `cfg.DATASETS` loading. +- Exports dataset-based artifacts such as `Model_SVC_morgan_r2_2048.pkl`. +- Falls back to legacy `reduction_type` + `level` export rows for older CSVs. ## 0.1.11 Highlights +MELITE `0.1.11` prepared the project documentation and package metadata for +the first PyPI publication as `melite`. + - Uses final release metadata version `0.1.11`. - Clarifies that MELITE is tabular at the modeling level and consumes numeric `X` and `y` arrays. -- Documents that current dataset orchestration remains PCA/UMAP-oriented for - historical reasons. -- Records generalized `[datasets.*]` definitions as a future direction, not - current behavior. +- Documented generalized `[datasets.*]` definitions as a future direction at + that time. 
- Does not change functional training, selection, export, prediction, or CLI behavior. diff --git a/examples/example_config.toml b/examples/example_config.toml index 1d6d44d..29ceabf 100644 --- a/examples/example_config.toml +++ b/examples/example_config.toml @@ -11,10 +11,16 @@ dataset = "examples/" output = "examples/output/" [benchmark] -reduction_types = ["PCA"] -levels = [70] random_state = 42 +[datasets.sample_pca70] +path = "examples/sample_PCA70.npz" +label_path = "examples/sample_labels.npy" +family = "dimensionality" +method = "PCA" +level = 70 +description = "Bundled synthetic PCA-style example dataset." + [cv] n_splits = 3 n_repeats = 1 diff --git a/melite/__init__.py b/melite/__init__.py index f6936aa..37f86c9 100644 --- a/melite/__init__.py +++ b/melite/__init__.py @@ -6,8 +6,7 @@ The following symbols are part of the stable public API: from melite import Config - from melite import load_dataset - from melite import ResultManager + from melite import load_datasets from melite import plot_cv_distributions from melite import predict from melite import __version__ @@ -16,16 +15,14 @@ import logging from .config import Config -from .load_dataset import load_dataset -from .result_manager import ResultManager +from .load_dataset import load_datasets from .plot_metrics import plot_cv_distributions from .predict import predict from .version import __version__ __all__ = [ "Config", - "load_dataset", - "ResultManager", + "load_datasets", "plot_cv_distributions", "predict", "__version__", diff --git a/melite/config.py b/melite/config.py index 4008327..4e59618 100644 --- a/melite/config.py +++ b/melite/config.py @@ -72,6 +72,9 @@ class Config: Reduction methods to benchmark (e.g. ``["PCA", "UMAP"]``). REDUCTION_LEVELS : list of int Variance retention levels to benchmark (e.g. ``[70, 75, 80, 85, 90, 95]``). + DATASETS : dict + Normalized dataset registry keyed by user-defined dataset id. Each + entry contains ``path``, ``label_path``, and ``metadata`` keys. 
ACTIVE_MODELS : list of str Model keys to include in the benchmark (e.g. ``["svc", "rf", "xgb"]``). CV_CONFIG : dict @@ -124,6 +127,7 @@ def __init__( self.REDUCTION_TYPES = cfg["benchmark"]["reduction_types"] self.REDUCTION_LEVELS = cfg["benchmark"]["levels"] self.ACTIVE_MODELS = cfg["models"]["active"] + self.DATASETS = self._build_dataset_registry(cfg) # Cross-validation cv_section = cfg["cv_smoke"] if smoke else cfg["cv"] @@ -141,6 +145,53 @@ def __init__( # Hyperparameter grids # ------------------------------------------------------------------ # + def _build_dataset_registry(self, cfg: dict) -> dict: + datasets = cfg.get("datasets") + if datasets: + return self._normalize_user_datasets(datasets) + return self._synthesize_legacy_datasets() + + @staticmethod + def _normalize_user_datasets(datasets: dict) -> dict: + optional_metadata = {"family", "method", "variant", "level", "description"} + normalized = {} + for dataset_id, entry in datasets.items(): + missing = [key for key in ("path", "label_path") if key not in entry] + if missing: + missing_keys = ", ".join(missing) + raise ValueError( + f"Dataset '{dataset_id}' is missing required field(s): {missing_keys}" + ) + metadata = { + key: value + for key, value in entry.items() + if key in optional_metadata + } + normalized[dataset_id] = { + "path": entry["path"], + "label_path": entry["label_path"], + "metadata": metadata, + } + return normalized + + def _synthesize_legacy_datasets(self) -> dict: + datasets = {} + for reduction_type in self.REDUCTION_TYPES: + for level in self.REDUCTION_LEVELS: + dataset_id = f"{reduction_type}{level}" + datasets[dataset_id] = { + "path": os.path.join( + self.PATHS["DATASET"], f"{dataset_id}.npz" + ), + "label_path": os.path.join(self.PATHS["INPUT"], "labels.npy"), + "metadata": { + "family": "dimensionality", + "method": reduction_type, + "level": level, + }, + } + return datasets + def _build_param_grid(self) -> list: if self.SMOKE: return [ diff --git 
a/melite/export_best_model.py b/melite/export_best_model.py index 666bfb8..fd5126a 100644 --- a/melite/export_best_model.py +++ b/melite/export_best_model.py @@ -11,6 +11,7 @@ import ast import logging +import re import sys from pathlib import Path from typing import Any, Tuple @@ -19,6 +20,7 @@ import numpy as np import pandas as pd from .config import Config +from .load_dataset import _load_one_dataset from .plot_metrics import plot_cv_distributions from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold @@ -36,11 +38,19 @@ } METRIC_COLUMNS = [ - "reduction_type", "level", "model_name", - "f1_macro", "accuracy", "auc_roc", + "dataset", "family", "method", "variant", "level", "description", + "reduction_type", "model_name", "f1_macro", "accuracy", "auc_roc", ] +def _has_value(value: Any) -> bool: + return value is not None and not pd.isna(value) and str(value).strip() != "" + + +def _safe_filename_part(value: Any) -> str: + return re.sub(r"[^A-Za-z0-9_.-]+", "_", str(value).strip()).strip("_") + + class DatasetLoader: """Load feature matrices and labels for model retraining. @@ -57,6 +67,27 @@ def __init__(self, cfg: Config): self._data_root = Path(cfg.PATHS["DATASET"]) self._labels: np.ndarray | None = None + def load_row(self, row: pd.Series) -> Tuple[np.ndarray, np.ndarray]: + """Load the dataset referenced by a result row. + + New v0.2.0 result rows are resolved by their ``dataset`` id in + ``cfg.DATASETS``. Older result rows without ``dataset`` fall back to + the legacy ``reduction_type`` + ``level`` lookup. + """ + if "dataset" in row and _has_value(row.get("dataset")): + dataset_id = str(row.get("dataset")) + try: + spec = self._cfg.DATASETS[dataset_id] + except KeyError as exc: + raise KeyError( + f"Dataset '{dataset_id}' from results.csv is not registered " + "in cfg.DATASETS." 
+ ) from exc + dataset = _load_one_dataset(dataset_id, spec) + return dataset["X"], dataset["y"] + + return self.load(row.reduction_type, int(row.level)) + def load(self, reduction: str, level: int) -> Tuple[np.ndarray, np.ndarray]: """Load feature matrix and label vector for a given configuration. @@ -100,7 +131,7 @@ def _try_individual_file(self, reduction: str, level: int) -> np.ndarray | None: return None arr = np.load(fp) self._ensure_labels() - return arr[arr.files[0]] + return arr["X"] if "X" in arr.files else arr[arr.files[0]] def _try_aggregated_file(self, reduction: str, level: int) -> np.ndarray | None: fp = self._data_root / f"{reduction}s.npz" @@ -181,7 +212,7 @@ def _cv_and_plot(self, model, X, y, row, save_dir: Path) -> None: plot_cv_distributions( scores["test_f1"], scores["test_acc"], scores.get("test_auc"), model_name=row.model_name, params=row.parameters, - save_to=save_dir / f"{row.model_name}_{row.reduction_type}{row.level}.png", + save_to=save_dir / f"{row.model_name}_{self._row_dataset_label(row)}.png", ) def _check_smoke_guard(self, row: pd.Series) -> None: @@ -232,18 +263,18 @@ def run(self) -> None: self._show_metrics() row = self._get_selected_row() self._check_smoke_guard(row) - X, y = self._loader.load(row.reduction_type, int(row.level)) + X, y = self._loader.load_row(row) model = self._build_model(row.model_name, row.parameters) figures_dir = Path(self._cfg.PATHS["OUTPUT"]) / "figures" self._cv_and_plot(model, X, y, row, figures_dir) logger.info( - "Training %s on %s%s using all available data...", - row.model_name, row.reduction_type, row.level, + "Training %s on %s using all available data...", + row.model_name, self._row_dataset_label(row), ) print( - f"\nTraining {row.model_name} on {row.reduction_type}{row.level} " + f"\nTraining {row.model_name} on {self._row_dataset_label(row)} " "using all available data..." 
) model.fit(X, y) @@ -282,9 +313,15 @@ def _build_model(name: str, serialised_params: str) -> Any: except KeyError as exc: raise ValueError(f"Unsupported model type: {name}") from exc + @staticmethod + def _row_dataset_label(row: pd.Series) -> str: + if "dataset" in row and _has_value(row.get("dataset")): + return _safe_filename_part(row.get("dataset")) + return _safe_filename_part(f"{row.reduction_type}{int(row.level)}") + def _save_model(self, model: Any, row: pd.Series) -> Path: self._output_dir.mkdir(exist_ok=True) - filename = f"Model_{row.model_name}_{row.reduction_type}{row.level}.pkl" + filename = f"Model_{row.model_name}_{self._row_dataset_label(row)}.pkl" path = self._output_dir / filename joblib.dump(model, path) return path diff --git a/melite/load_dataset.py b/melite/load_dataset.py index 7d220f3..03d29e7 100644 --- a/melite/load_dataset.py +++ b/melite/load_dataset.py @@ -1,9 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later """Dataset loading and label consistency validation for MELITE. -This module provides a single public function, :func:`load_dataset`, that -reads pre-computed ``.npz`` feature matrices from ``data/`` and the -authoritative label vector from ``raw/labels.npy``. +This module provides :func:`load_datasets` for the generalized dataset +registry. If a ``.npz`` file contains an embedded ``y`` array, it is compared element-wise against ``raw/labels.npy``. A :exc:`ValueError` is raised if @@ -11,17 +10,96 @@ from propagating into model training. 
""" -import os import logging import numpy as np from pathlib import Path -__all__ = ["load_dataset"] +__all__ = ["load_datasets"] logger = logging.getLogger(__name__) -def load_dataset(config, reduction_type: str, levels: list) -> dict: +def _count_differences(left: np.ndarray, right: np.ndarray) -> int | str: + return int(np.sum(left != right)) if left.shape == right.shape else "N/A" + + +def _load_one_dataset(dataset_id: str, spec: dict) -> dict: + data_path = Path(spec["path"]) + label_path = Path(spec["label_path"]) + metadata = dict(spec.get("metadata", {})) + + if not data_path.exists(): + raise FileNotFoundError( + f"Dataset '{dataset_id}' file not found: {data_path}" + ) + if not label_path.exists(): + raise FileNotFoundError( + f"Dataset '{dataset_id}' label_path not found: {label_path}" + ) + + y = np.load(label_path) + data = np.load(data_path) + logger.info("Keys in %s: %s", data_path, data.files) + + if "X" not in data.files: + raise ValueError( + f"Required key 'X' not found in {data_path}.\n" + f" Available keys: {list(data.files)}" + ) + + X = data["X"] + if X.ndim != 2: + raise ValueError( + f"Dataset '{dataset_id}' X must be 2D; got shape {X.shape}." + ) + if not np.issubdtype(X.dtype, np.number): + raise ValueError( + f"Dataset '{dataset_id}' X must be numeric; got dtype {X.dtype}." + ) + if len(y) != X.shape[0]: + raise ValueError( + f"Dataset '{dataset_id}' X/y length mismatch: " + f"X has {X.shape[0]} rows, y has {len(y)} labels." + ) + + if "y" in data.files: + y_from_file = data["y"] + if not np.array_equal(y_from_file, y): + n_diff = _count_differences(y_from_file, y) + raise ValueError( + f"Label mismatch in {data_path}:\n" + f" embedded y (shape={y_from_file.shape}) does not match\n" + f" {label_path} (shape={y.shape}).\n" + f" Differing elements: {n_diff}/" + f"{y.shape[0] if y_from_file.shape == y.shape else '?'}." 
+ ) + + return {"X": X, "y": y, "metadata": metadata} + + +def load_datasets(config) -> dict: + """Load all datasets from ``config.DATASETS``. + + Returns + ------- + dict + Mapping of dataset id to dictionaries with ``X``, ``y``, and + ``metadata`` keys. Dataset ids are user-defined identifiers and are + not interpreted as method names. + """ + loaded = {} + for dataset_id, spec in config.DATASETS.items(): + loaded[dataset_id] = _load_one_dataset(dataset_id, spec) + logger.info( + "Loaded %s: X shape=%s, y shape=%s", + dataset_id, + loaded[dataset_id]["X"].shape, + loaded[dataset_id]["y"].shape, + ) + return loaded + + +def _load_dataset_legacy(config, reduction_type: str, levels: list) -> dict: """Load reduced feature matrices and labels for benchmarking. Reads ``raw/labels.npy`` as the authoritative label vector, then loads @@ -60,75 +138,51 @@ def load_dataset(config, reduction_type: str, levels: list) -> dict: Examples -------- - >>> from melite import Config, load_dataset + >>> from melite import Config >>> cfg = Config() >>> cfg.setup() - >>> dataset = load_dataset(cfg, "PCA", [70, 85]) + >>> from melite.load_dataset import _load_dataset_legacy + >>> dataset = _load_dataset_legacy(cfg, "PCA", [70, 85]) >>> X, y = dataset["PCA70"] >>> X.shape (182, 37) """ - try: - labels_path = os.path.join(config.PATHS["INPUT"], "labels.npy") - y = np.load(labels_path) - logger.info("Labels loaded: %s (shape=%s)", labels_path, y.shape) - except Exception as exc: - logger.error("Error loading labels '%s': %s", labels_path, exc) - return {} - reductions = {} loaded = 0 for level in levels: - data_file = f"{reduction_type}{level}.npz" - data_path = os.path.join(config.PATHS["DATASET"], data_file) + dataset_id = f"{reduction_type}{level}" + spec = { + "path": Path(config.PATHS["DATASET"]) / f"{dataset_id}.npz", + "label_path": Path(config.PATHS["INPUT"]) / "labels.npy", + "metadata": { + "family": "dimensionality", + "method": reduction_type, + "level": level, + }, + } try: - 
if not os.path.exists(data_path): - logger.warning( - "Expected file not found: %s\n" - " Place the reduced feature matrix at this path and retry.", - data_path, - ) - continue - - data = np.load(data_path) - logger.info("Keys in %s: %s", data_file, data.files) - - if "X" not in data.files: - raise ValueError( - f"Required key 'X' not found in {data_path}.\n" - f" Available keys: {list(data.files)}" - ) - - X = data["X"] - - if "y" in data.files: - y_from_file = data["y"] - if not np.array_equal(y_from_file, y): - n_diff = ( - int(np.sum(y_from_file != y)) - if y_from_file.shape == y.shape - else "N/A" - ) - raise ValueError( - f"Label mismatch in {data_path}:\n" - f" embedded y (shape={y_from_file.shape}) does not match\n" - f" {labels_path} (shape={y.shape}).\n" - f" Differing elements: {n_diff}/" - f"{y.shape[0] if y_from_file.shape == y.shape else '?'}." - ) - - reductions[f"{reduction_type}{level}"] = (X, y) + dataset = _load_one_dataset(dataset_id, spec) + reductions[dataset_id] = (dataset["X"], dataset["y"]) logger.info( - "Loaded %s: X shape=%s, y shape=%s", data_file, X.shape, y.shape + "Loaded %s: X shape=%s, y shape=%s", + dataset_id, + dataset["X"].shape, + dataset["y"].shape, ) loaded += 1 except ValueError: raise + except FileNotFoundError as exc: + logger.warning( + "Expected file not found: %s\n" + " Place the feature matrix and labels at the configured paths and retry.", + exc, + ) except Exception as exc: - logger.error("Error loading %s: %s", data_file, exc) + logger.error("Error loading %s: %s", dataset_id, exc) if loaded == 0: logger.warning( diff --git a/melite/main.py b/melite/main.py index f27a2de..1429600 100644 --- a/melite/main.py +++ b/melite/main.py @@ -7,11 +7,12 @@ """ import logging -import numpy as np from pathlib import Path +import numpy as np + from .config import Config -from .load_dataset import load_dataset +from .load_dataset import load_datasets from .model_training import MultiModelTrainer from .result_manager import 
ResultManager @@ -36,7 +37,7 @@ def __init__(self, config: Config): self.config = config self.model_trainer = MultiModelTrainer(config) - def run(self, X_train, y_train, reduction_type: str, level: int): + def run(self, X_train, y_train, reduction_type: str, level: int | None): """Train all models and return the best result for one dataset. Parameters @@ -46,9 +47,10 @@ def run(self, X_train, y_train, reduction_type: str, level: int): y_train : numpy.ndarray Label vector of shape ``(n_samples,)``. reduction_type : str - Reduction method prefix (e.g. ``"PCA"``). - level : int - Variance retention level (e.g. ``85``). + Legacy reduction method label when available; otherwise the + dataset id is passed through for trace logging. + level : int or None + Legacy variance retention level when available. Returns ------- @@ -88,12 +90,25 @@ def _clean_params(params): for k, v in params.items() } + @staticmethod + def _legacy_reduction_type(metadata: dict): + method = metadata.get("method") + level = metadata.get("level") + family = metadata.get("family") + if ( + family == "dimensionality" + and method in {"PCA", "UMAP"} + and level is not None + ): + return method + return None + def run(self) -> None: """Execute the benchmarking pipeline for all configured datasets. - Iterates over all reduction types and levels defined in the - configuration, trains all models for each dataset, and writes - ``output/results.txt`` and ``output/results.csv``. + Iterates over the normalized ``config.DATASETS`` registry, trains all + models for each dataset, and writes ``output/results.txt`` and + ``output/results.csv``. Notes ----- @@ -102,61 +117,78 @@ def run(self) -> None: results are not benchmark-quality. """ if self.config.SMOKE: - logger.info("SMOKE TEST — reduced grid and CV. Results are not benchmark-quality.") + logger.info( + "SMOKE TEST - reduced grid and CV. Results are not benchmark-quality." 
+ ) print(_SMOKE_WARNING) - for reduction_type in self.config.REDUCTION_TYPES: - logger.info("Running with %s...", reduction_type) - - dataset = load_dataset( - self.config, reduction_type, self.config.REDUCTION_LEVELS + datasets = load_datasets(self.config) + + for dataset_id, dataset in datasets.items(): + X_train = dataset["X"] + y_train = dataset["y"] + metadata = dataset.get("metadata", {}) + family = metadata.get("family") + method = metadata.get("method") + variant = metadata.get("variant") + level = metadata.get("level") + description = metadata.get("description") + reduction_type = self._legacy_reduction_type(metadata) + + logger.info("Training with dataset %s.", dataset_id) + + ( + best_model, best_params, + best_f1, f1_std, + best_acc, acc_std, + best_auc, auc_std, + ) = self.pipeline.run(X_train, y_train, reduction_type or dataset_id, level) + + params = self._clean_params(best_params) + model_name = best_model.__class__.__name__ + + metadata_lines = [ + f"Family: {family}" if family is not None else None, + f"Method: {method}" if method is not None else None, + f"Variant: {variant}" if variant is not None else None, + f"Level: {level}" if level is not None else None, + f"Description: {description}" if description is not None else None, + ] + + self.final_results.append( + "\n".join([ + f"Results for dataset {dataset_id}:", + *[line for line in metadata_lines if line is not None], + f"Model Selected: {model_name}", + f"Best ML-model Parameters: {params}", + f"F1-macro (CV mean): {round(best_f1, 4)} +/- {round(f1_std, 4)}", + f"Accuracy (CV mean): {round(best_acc, 4)} +/- {round(acc_std, 4)}", + ( + f"AUC-ROC (CV mean): {round(best_auc, 4)} +/- {round(auc_std, 4)}" + if best_auc is not None + else "AUC-ROC (CV mean): N/A" + ), + "------------------------------", + ]) ) - if not dataset: - logger.warning("No data found for %s. 
Skipping.", reduction_type) - continue - - for key, (X_train, y_train) in dataset.items(): - level = int(key.replace(reduction_type, "")) - logger.info("Training with %s (level=%d).", key, level) - - ( - best_model, best_params, - best_f1, f1_std, - best_acc, acc_std, - best_auc, auc_std, - ) = self.pipeline.run(X_train, y_train, reduction_type, level) - - params = self._clean_params(best_params) - model_name = best_model.__class__.__name__ - - self.final_results.append( - "\n".join([ - f"Results for {key} (level {level}):", - f"Model Selected: {model_name}", - f"Best ML-model Parameters: {params}", - f"F1-macro (CV mean): {round(best_f1, 4)} ± {round(f1_std, 4)}", - f"Accuracy (CV mean): {round(best_acc, 4)} ± {round(acc_std, 4)}", - ( - f"AUC-ROC (CV mean): {round(best_auc, 4)} ± {round(auc_std, 4)}" - if best_auc is not None - else "AUC-ROC (CV mean): N/A" - ), - "------------------------------", - ]) - ) - - self.csv_rows.append({ - "reduction_type": reduction_type, - "level": int(key.replace(reduction_type, "")), - "model_name": model_name, - "parameters": str(params), - "f1_macro": round(best_f1, 4), - "f1_std": round(f1_std, 4), - "accuracy": round(best_acc, 4), - "acc_std": round(acc_std, 4), - "auc_roc": round(best_auc, 4) if best_auc is not None else "N/A", - "auc_std": round(auc_std, 4) if auc_std is not None else "N/A", - }) + + self.csv_rows.append({ + "dataset": dataset_id, + "family": family, + "method": method, + "variant": variant, + "level": level, + "description": description, + "reduction_type": reduction_type, + "model_name": model_name, + "parameters": str(params), + "f1_macro": round(best_f1, 4), + "f1_std": round(f1_std, 4), + "accuracy": round(best_acc, 4), + "acc_std": round(acc_std, 4), + "auc_roc": round(best_auc, 4) if best_auc is not None else "N/A", + "auc_std": round(auc_std, 4) if auc_std is not None else "N/A", + }) final_report = "\n".join(self.final_results) self.result_manager.write_results(final_report) diff --git 
a/melite/result_manager.py b/melite/result_manager.py index 437ffb8..7dcdf7b 100644 --- a/melite/result_manager.py +++ b/melite/result_manager.py @@ -91,9 +91,8 @@ def write_csv(self, rows: list[dict], path: Path | str, smoke: bool = False) -> ---------- rows : list of dict List of result dictionaries, one per trained configuration. Each - dict must contain the keys ``reduction_type``, ``level``, - ``model_name``, ``parameters``, ``f1_macro``, ``f1_std``, - ``accuracy``, ``acc_std``, ``auc_roc``, and ``auc_std``. + dict may include dataset identity and metadata fields in addition + to model performance metrics. path : str or pathlib.Path Destination path for the CSV file. Parent directories are created automatically if they do not exist. @@ -114,9 +113,9 @@ def write_csv(self, rows: list[dict], path: Path | str, smoke: bool = False) -> path.parent.mkdir(parents=True, exist_ok=True) fieldnames = [ - "reduction_type", "level", "model_name", "parameters", - "f1_macro", "f1_std", "accuracy", "acc_std", "auc_roc", "auc_std", - "smoke", + "dataset", "family", "method", "variant", "level", "description", + "reduction_type", "model_name", "parameters", "f1_macro", "f1_std", + "accuracy", "acc_std", "auc_roc", "auc_std", "smoke", ] try: with open(path, mode="w", newline="", encoding="utf-8") as f: diff --git a/melite/version.py b/melite/version.py index ff394e4..3ce2a66 100644 --- a/melite/version.py +++ b/melite/version.py @@ -6,7 +6,7 @@ and imported by ``result_manager`` to stamp generated reports. 
""" -__version__ = "0.1.11" +__version__ = "0.2.0" PROJECT_NAME = "MELITE" PROJECT_VERSION = __version__ PROJECT_STATUS = "alpha" diff --git a/pyproject.toml b/pyproject.toml index cda1e4d..a034a66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "hatchling.build" [project] name = "melite" -version = "0.1.11" +dynamic = ["version"] description = "Tabular classification benchmarking toolkit for model selection, repeated stratified cross-validation, final model export, and artifact-based inference." readme = "README.md" license = "LGPL-3.0-or-later" diff --git a/tests/test_config.py b/tests/test_config.py index d7fa5b2..af948d8 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -78,3 +78,80 @@ def test_config_user_toml_falls_back_to_defaults_for_missing_keys(tmp_path): cfg = Config(user_config=user_toml) # random_state should still be the default (42) assert cfg.RANDOM_STATE == 42 + + +def test_config_synthesizes_legacy_dataset_registry(tmp_path): + toml_content = ( + '[paths]\ninput = "raw/"\ndataset = "data/"\noutput = "output/"\n' + '[benchmark]\nreduction_types = ["PCA", "UMAP"]\nlevels = [70, 75]\n' + ) + user_toml = tmp_path / "custom.toml" + user_toml.write_text(toml_content) + + cfg = Config(user_config=user_toml) + + assert set(cfg.DATASETS) == {"PCA70", "PCA75", "UMAP70", "UMAP75"} + assert Path(cfg.DATASETS["PCA70"]["path"]) == Path("data/PCA70.npz") + assert Path(cfg.DATASETS["PCA70"]["label_path"]) == Path("raw/labels.npy") + assert cfg.DATASETS["PCA70"]["metadata"] == { + "family": "dimensionality", + "method": "PCA", + "level": 70, + } + + +def test_config_uses_user_defined_dataset_registry(tmp_path): + toml_content = ''' +[datasets.morgan_r2_2048] +path = "data/morgan_r2_2048.npz" +label_path = "raw/labels.npy" +family = "fingerprints" +method = "Morgan" +variant = "r2_2048" +description = "Morgan radius 2 fingerprint" + +[datasets.rdkit_descriptors] +path = "data/rdkit_descriptors.npz" +label_path = 
"raw/labels.npy" +family = "descriptors" +''' + user_toml = tmp_path / "custom.toml" + user_toml.write_text(toml_content) + + cfg = Config(user_config=user_toml) + + assert set(cfg.DATASETS) == {"morgan_r2_2048", "rdkit_descriptors"} + assert cfg.DATASETS["morgan_r2_2048"] == { + "path": "data/morgan_r2_2048.npz", + "label_path": "raw/labels.npy", + "metadata": { + "family": "fingerprints", + "method": "Morgan", + "variant": "r2_2048", + "description": "Morgan radius 2 fingerprint", + }, + } + + +def test_config_user_dataset_requires_path(tmp_path): + toml_content = ''' +[datasets.maccs] +label_path = "raw/labels.npy" +''' + user_toml = tmp_path / "custom.toml" + user_toml.write_text(toml_content) + + with pytest.raises(ValueError, match="path"): + Config(user_config=user_toml) + + +def test_config_user_dataset_requires_label_path(tmp_path): + toml_content = ''' +[datasets.maccs] +path = "data/maccs.npz" +''' + user_toml = tmp_path / "custom.toml" + user_toml.write_text(toml_content) + + with pytest.raises(ValueError, match="label_path"): + Config(user_config=user_toml) diff --git a/tests/test_export.py b/tests/test_export.py index fa19810..629606b 100644 --- a/tests/test_export.py +++ b/tests/test_export.py @@ -1,10 +1,19 @@ # SPDX-License-Identifier: LGPL-3.0-or-later """Tests for melite.export_best_model.""" +import csv + +import numpy as np import pytest -from pathlib import Path -from melite.export_best_model import Finalizer + from melite.config import Config +from melite.export_best_model import Finalizer + + +class DummyModel: + def fit(self, X, y): + self.shape_ = X.shape + return self def _make_config(tmp_path): @@ -14,9 +23,35 @@ def _make_config(tmp_path): "DATASET": str(tmp_path / "data") + "/", "OUTPUT": str(tmp_path / "output") + "/", } + cfg.DATASETS = {} return cfg +def _write_labels(tmp_path, n_samples=20): + raw_dir = tmp_path / "raw" + raw_dir.mkdir(exist_ok=True) + y = np.array([0, 1] * (n_samples // 2), dtype=np.int64) + path = raw_dir / 
"labels.npy" + np.save(path, y) + return path, y + + +def _write_npz(tmp_path, name, X, y): + data_dir = tmp_path / "data" + data_dir.mkdir(exist_ok=True) + path = data_dir / f"{name}.npz" + np.savez(path, X=X, y=y) + return path + + +def _write_results_csv(path, fieldnames, row): + path.parent.mkdir(exist_ok=True) + with open(path, "w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerow(row) + + def test_missing_csv_raises_file_not_found_error(tmp_path): cfg = _make_config(tmp_path) missing_csv = tmp_path / "output" / "results.csv" @@ -63,7 +98,6 @@ def test_smoke_guard_allows_with_force(tmp_path, tmp_results_csv): output_dir = tmp_results_csv.parent finalizer = Finalizer(tmp_results_csv, output_dir, cfg, row_index=1, force=True) row = finalizer._get_selected_row() - # Should not raise finalizer._check_smoke_guard(row) @@ -72,5 +106,117 @@ def test_smoke_guard_allows_non_smoke_row(tmp_path, tmp_results_csv): output_dir = tmp_results_csv.parent finalizer = Finalizer(tmp_results_csv, output_dir, cfg, row_index=0, force=False) row = finalizer._get_selected_row() - # Row 0 is smoke=False — should not raise finalizer._check_smoke_guard(row) + + +def test_export_dataset_row_uses_dataset_id_for_artifact(monkeypatch, tmp_path): + label_path, y = _write_labels(tmp_path) + dataset_path = _write_npz(tmp_path, "morgan_r2_2048", np.ones((20, 5)), y) + cfg = _make_config(tmp_path) + cfg.DATASETS = { + "morgan_r2_2048": { + "path": str(dataset_path), + "label_path": str(label_path), + "metadata": {"family": "fingerprints", "method": "Morgan"}, + } + } + csv_path = tmp_path / "output" / "results.csv" + _write_results_csv( + csv_path, + [ + "dataset", "family", "method", "variant", "level", "description", + "reduction_type", "model_name", "parameters", "f1_macro", "accuracy", + "auc_roc", "smoke", + ], + { + "dataset": "morgan_r2_2048", + "family": "fingerprints", + "method": "Morgan", + "variant": "", + 
"level": "", + "description": "", + "reduction_type": "", + "model_name": "SVC", + "parameters": "{'kernel': 'linear', 'C': 1}", + "f1_macro": 0.8, + "accuracy": 0.8, + "auc_roc": 0.9, + "smoke": False, + }, + ) + monkeypatch.setattr(Finalizer, "_build_model", staticmethod(lambda *_: DummyModel())) + monkeypatch.setattr(Finalizer, "_cv_and_plot", lambda *args, **kwargs: None) + + Finalizer(csv_path, tmp_path / "output", cfg, row_index=0).run() + + assert (tmp_path / "output" / "Model_SVC_morgan_r2_2048.pkl").exists() + + +def test_export_legacy_row_falls_back_to_reduction_and_level(monkeypatch, tmp_path): + _, y = _write_labels(tmp_path) + _write_npz(tmp_path, "PCA70", np.ones((20, 5)), y) + cfg = _make_config(tmp_path) + csv_path = tmp_path / "output" / "results.csv" + _write_results_csv( + csv_path, + [ + "reduction_type", "level", "model_name", "parameters", + "f1_macro", "accuracy", "auc_roc", "smoke", + ], + { + "reduction_type": "PCA", + "level": 70, + "model_name": "SVC", + "parameters": "{'kernel': 'linear', 'C': 1}", + "f1_macro": 0.8, + "accuracy": 0.8, + "auc_roc": 0.9, + "smoke": False, + }, + ) + monkeypatch.setattr(Finalizer, "_build_model", staticmethod(lambda *_: DummyModel())) + monkeypatch.setattr(Finalizer, "_cv_and_plot", lambda *args, **kwargs: None) + + Finalizer(csv_path, tmp_path / "output", cfg, row_index=0).run() + + assert (tmp_path / "output" / "Model_SVC_PCA70.pkl").exists() + + +def test_cv_plot_uses_dataset_id_for_figure(monkeypatch, tmp_path): + import melite.export_best_model as export_module + + cfg = _make_config(tmp_path) + csv_path = tmp_path / "output" / "results.csv" + _write_results_csv( + csv_path, + ["dataset", "model_name", "parameters", "smoke"], + { + "dataset": "rdkit_descriptors", + "model_name": "SVC", + "parameters": "{'kernel': 'linear', 'C': 1}", + "smoke": False, + }, + ) + saved = {} + monkeypatch.setattr( + export_module, + "cross_validate", + lambda *args, **kwargs: { + "test_f1": np.array([0.8]), + "test_acc": 
np.array([0.8]), + "test_auc": np.array([0.9]), + }, + ) + monkeypatch.setattr( + export_module, + "plot_cv_distributions", + lambda *args, **kwargs: saved.update({"save_to": kwargs["save_to"]}), + ) + finalizer = Finalizer(csv_path, tmp_path / "output", cfg, row_index=0) + row = finalizer._get_selected_row() + + finalizer._cv_and_plot( + DummyModel(), np.ones((20, 5)), np.array([0, 1] * 10), row, tmp_path + ) + + assert saved["save_to"] == tmp_path / "SVC_rdkit_descriptors.png" diff --git a/tests/test_load_dataset.py b/tests/test_load_dataset.py index ad33cfc..a92bcc2 100644 --- a/tests/test_load_dataset.py +++ b/tests/test_load_dataset.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: LGPL-3.0-or-later """Tests for melite.load_dataset.""" +import numpy as np import pytest -from melite.load_dataset import load_dataset +from melite.load_dataset import load_datasets, _load_dataset_legacy def _make_config(tmp_path): @@ -16,9 +17,40 @@ def _make_config(tmp_path): return cfg +def _write_labels(tmp_path, n_samples=20): + raw_dir = tmp_path / "raw" + raw_dir.mkdir(exist_ok=True) + y = np.array([0, 1] * (n_samples // 2), dtype=np.int64) + if n_samples % 2: + y = np.append(y, 0) + label_path = raw_dir / "labels.npy" + np.save(label_path, y) + return label_path, y + + +def _write_dataset(tmp_path, name, X, y=None, embedded_y=None): + data_dir = tmp_path / "data" + data_dir.mkdir(exist_ok=True) + path = data_dir / f"{name}.npz" + if embedded_y is None: + if y is None: + np.savez(path, X=X) + else: + np.savez(path, X=X, y=y) + else: + np.savez(path, X=X, y=embedded_y) + return path + + +def _registry_config(tmp_path, datasets): + cfg = _make_config(tmp_path) + cfg.DATASETS = datasets + return cfg + + def test_valid_npz_with_matching_y_loads(tmp_path, tmp_labels, tmp_npz_valid): cfg = _make_config(tmp_path) - result = load_dataset(cfg, "PCA", [70]) + result = _load_dataset_legacy(cfg, "PCA", [70]) assert "PCA70" in result X, y = result["PCA70"] assert X.shape == (20, 5) @@ -27,7 
+59,7 @@ def test_valid_npz_with_matching_y_loads(tmp_path, tmp_labels, tmp_npz_valid): def test_valid_npz_without_y_loads(tmp_path, tmp_labels, tmp_npz_no_y): cfg = _make_config(tmp_path) - result = load_dataset(cfg, "PCA", [70]) + result = _load_dataset_legacy(cfg, "PCA", [70]) assert "PCA70" in result @@ -35,7 +67,7 @@ def test_missing_file_warns_and_skips(tmp_path, tmp_labels, caplog): import logging cfg = _make_config(tmp_path) with caplog.at_level(logging.WARNING, logger="melite.load_dataset"): - result = load_dataset(cfg, "PCA", [70]) + result = _load_dataset_legacy(cfg, "PCA", [70]) assert result == {} assert any("not found" in msg.lower() for msg in caplog.messages) @@ -43,28 +75,190 @@ def test_missing_file_warns_and_skips(tmp_path, tmp_labels, caplog): def test_missing_X_key_raises_value_error(tmp_path, tmp_labels, tmp_npz_missing_X): cfg = _make_config(tmp_path) with pytest.raises(ValueError, match="Required key 'X' not found"): - load_dataset(cfg, "PCA", [70]) + _load_dataset_legacy(cfg, "PCA", [70]) def test_missing_X_error_includes_available_keys(tmp_path, tmp_labels, tmp_npz_missing_X): cfg = _make_config(tmp_path) with pytest.raises(ValueError, match="Available keys"): - load_dataset(cfg, "PCA", [70]) + _load_dataset_legacy(cfg, "PCA", [70]) def test_mismatched_y_raises_value_error(tmp_path, tmp_labels, tmp_npz_mismatched_y): cfg = _make_config(tmp_path) with pytest.raises(ValueError, match="Label mismatch"): - load_dataset(cfg, "PCA", [70]) + _load_dataset_legacy(cfg, "PCA", [70]) def test_mismatched_y_error_includes_shapes(tmp_path, tmp_labels, tmp_npz_mismatched_y): cfg = _make_config(tmp_path) with pytest.raises(ValueError, match=r"shape=\(20,\)"): - load_dataset(cfg, "PCA", [70]) + _load_dataset_legacy(cfg, "PCA", [70]) def test_mismatched_y_error_includes_diff_count(tmp_path, tmp_labels, tmp_npz_mismatched_y): cfg = _make_config(tmp_path) with pytest.raises(ValueError, match="Differing elements"): - load_dataset(cfg, "PCA", [70]) + 
_load_dataset_legacy(cfg, "PCA", [70]) + + +def test_load_datasets_loads_arbitrary_dataset_ids_and_metadata(tmp_path): + label_path, y = _write_labels(tmp_path, n_samples=20) + datasets = {} + specs = { + "morgan_r2_2048": ("fingerprints", "Morgan", None), + "maccs": ("fingerprints", "MACCS", None), + "rdkit_descriptors": ("descriptors", "RDKit", None), + "pca85": ("dimensionality", "PCA", 85), + "umap90": ("dimensionality", "UMAP", 90), + } + for index, (dataset_id, (family, method, level)) in enumerate(specs.items()): + path = _write_dataset( + tmp_path, + dataset_id, + np.full((20, 3 + index), index, dtype=np.float32), + y=y, + ) + metadata = {"family": family, "method": method} + if level is not None: + metadata["level"] = level + datasets[dataset_id] = { + "path": str(path), + "label_path": str(label_path), + "metadata": metadata, + } + + result = load_datasets(_registry_config(tmp_path, datasets)) + + assert list(result) == [ + "morgan_r2_2048", + "maccs", + "rdkit_descriptors", + "pca85", + "umap90", + ] + assert result["maccs"]["X"].shape == (20, 4) + assert np.array_equal(result["rdkit_descriptors"]["y"], y) + assert result["pca85"]["metadata"] == { + "family": "dimensionality", + "method": "PCA", + "level": 85, + } + + +def test_load_datasets_arbitrary_id_is_not_treated_as_method_name(tmp_path): + label_path, y = _write_labels(tmp_path, n_samples=20) + path = _write_dataset(tmp_path, "anything_user_wants", np.ones((20, 2)), y=y) + cfg = _registry_config(tmp_path, { + "not_a_method_name": { + "path": str(path), + "label_path": str(label_path), + "metadata": {"family": "custom"}, + } + }) + + result = load_datasets(cfg) + + assert set(result) == {"not_a_method_name"} + assert result["not_a_method_name"]["metadata"] == {"family": "custom"} + + +def test_load_datasets_missing_X_key_raises_value_error(tmp_path): + label_path, y = _write_labels(tmp_path, n_samples=20) + data_dir = tmp_path / "data" + data_dir.mkdir(exist_ok=True) + path = data_dir / "maccs.npz" 
+ np.savez(path, y=y) + cfg = _registry_config(tmp_path, { + "maccs": {"path": str(path), "label_path": str(label_path), "metadata": {}} + }) + + with pytest.raises(ValueError, match="Required key 'X' not found"): + load_datasets(cfg) + + +def test_load_datasets_missing_npz_raises_file_not_found_error(tmp_path): + label_path, _ = _write_labels(tmp_path, n_samples=20) + cfg = _registry_config(tmp_path, { + "maccs": { + "path": str(tmp_path / "data" / "missing.npz"), + "label_path": str(label_path), + "metadata": {}, + } + }) + + with pytest.raises(FileNotFoundError, match="file not found"): + load_datasets(cfg) + + +def test_load_datasets_missing_label_path_raises_file_not_found_error(tmp_path): + path = _write_dataset(tmp_path, "maccs", np.ones((20, 2)), y=None) + cfg = _registry_config(tmp_path, { + "maccs": { + "path": str(path), + "label_path": str(tmp_path / "raw" / "missing.npy"), + "metadata": {}, + } + }) + + with pytest.raises(FileNotFoundError, match="label_path not found"): + load_datasets(cfg) + + +def test_load_datasets_non_2d_X_raises_value_error(tmp_path): + label_path, y = _write_labels(tmp_path, n_samples=20) + path = _write_dataset(tmp_path, "maccs", np.ones(20), y=y) + cfg = _registry_config(tmp_path, { + "maccs": {"path": str(path), "label_path": str(label_path), "metadata": {}} + }) + + with pytest.raises(ValueError, match="2D"): + load_datasets(cfg) + + +def test_load_datasets_non_numeric_X_raises_value_error(tmp_path): + label_path, y = _write_labels(tmp_path, n_samples=20) + X = np.array([["a", "b"]] * 20) + path = _write_dataset(tmp_path, "maccs", X, y=y) + cfg = _registry_config(tmp_path, { + "maccs": {"path": str(path), "label_path": str(label_path), "metadata": {}} + }) + + with pytest.raises(ValueError, match="numeric"): + load_datasets(cfg) + + +def test_load_datasets_X_y_length_mismatch_raises_value_error(tmp_path): + label_path, y = _write_labels(tmp_path, n_samples=20) + path = _write_dataset(tmp_path, "maccs", np.ones((19, 2)), 
y=None) + cfg = _registry_config(tmp_path, { + "maccs": {"path": str(path), "label_path": str(label_path), "metadata": {}} + }) + + with pytest.raises(ValueError, match="length mismatch"): + load_datasets(cfg) + + +def test_load_datasets_embedded_y_mismatch_raises_value_error(tmp_path): + label_path, y = _write_labels(tmp_path, n_samples=20) + bad_y = np.ones_like(y) + path = _write_dataset(tmp_path, "maccs", np.ones((20, 2)), embedded_y=bad_y) + cfg = _registry_config(tmp_path, { + "maccs": {"path": str(path), "label_path": str(label_path), "metadata": {}} + }) + + with pytest.raises(ValueError, match="Label mismatch"): + load_datasets(cfg) + + +def test_load_dataset_legacy_private_wrapper_remains_tuple_mapping(tmp_path): + label_path, y = _write_labels(tmp_path, n_samples=20) + _write_dataset(tmp_path, "PCA70", np.ones((20, 2)), y=y) + cfg = _make_config(tmp_path) + + result = _load_dataset_legacy(cfg, "PCA", [70]) + + assert set(result) == {"PCA70"} + X_loaded, y_loaded = result["PCA70"] + assert X_loaded.shape == (20, 2) + assert np.array_equal(y_loaded, np.load(label_path)) diff --git a/tests/test_main.py b/tests/test_main.py new file mode 100644 index 0000000..6d14278 --- /dev/null +++ b/tests/test_main.py @@ -0,0 +1,187 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Tests for melite.main orchestration.""" + +import csv + +import numpy as np +import pytest + +import melite.main as main_module +from melite.main import Main + + +class SVC: + pass + + +class DummyPipeline: + calls = [] + + def __init__(self, config): + self.config = config + + def run(self, X_train, y_train, reduction_type, level): + self.calls.append({ + "shape": X_train.shape, + "n_labels": len(y_train), + "reduction_type": reduction_type, + "level": level, + }) + return ( + SVC(), + {"C": 1.0, "kernel": "linear"}, + 0.8, + 0.01, + 0.82, + 0.02, + 0.9, + 0.03, + ) + + +def _write_labels(path, n_samples=8): + path.mkdir(parents=True, exist_ok=True) + y = np.array([0, 1] * (n_samples // 
2), dtype=np.int64) + label_path = path / "labels.npy" + np.save(label_path, y) + return label_path, y + + +def _write_npz(path, name, X, y): + path.mkdir(parents=True, exist_ok=True) + data_path = path / f"{name}.npz" + np.savez(data_path, X=X, y=y) + return data_path + + +def _rows(csv_path): + with open(csv_path, encoding="utf-8") as f: + return list(csv.DictReader(f)) + + +def test_main_run_uses_arbitrary_dataset_ids(monkeypatch, tmp_path): + DummyPipeline.calls = [] + monkeypatch.setattr(main_module, "Pipeline", DummyPipeline) + + raw_dir = tmp_path / "raw" + data_dir = tmp_path / "data" + output_dir = tmp_path / "output" + label_path, y = _write_labels(raw_dir) + morgan_path = _write_npz(data_dir, "morgan_r2_2048", np.ones((8, 4)), y) + desc_path = _write_npz(data_dir, "rdkit_descriptors", np.ones((8, 3)), y) + pca_path = _write_npz(data_dir, "PCA85", np.ones((8, 2)), y) + config_path = tmp_path / "config.toml" + config_path.write_text(f''' +[paths] +input = "{raw_dir.as_posix()}/" +dataset = "{data_dir.as_posix()}/" +output = "{output_dir.as_posix()}/" + +[datasets.morgan_r2_2048] +path = "{morgan_path.as_posix()}" +label_path = "{label_path.as_posix()}" +family = "fingerprints" +method = "Morgan" +variant = "r2_2048" +description = "Morgan radius 2 fingerprint" + +[datasets.rdkit_descriptors] +path = "{desc_path.as_posix()}" +label_path = "{label_path.as_posix()}" +family = "descriptors" +method = "RDKit" + +[datasets.pca85] +path = "{pca_path.as_posix()}" +label_path = "{label_path.as_posix()}" +family = "dimensionality" +method = "PCA" +level = 85 +''') + + Main(user_config=config_path).run() + + rows = _rows(output_dir / "results.csv") + assert [row["dataset"] for row in rows] == [ + "morgan_r2_2048", + "rdkit_descriptors", + "pca85", + ] + assert rows[0]["family"] == "fingerprints" + assert rows[0]["method"] == "Morgan" + assert rows[0]["variant"] == "r2_2048" + assert rows[0]["description"] == "Morgan radius 2 fingerprint" + assert 
rows[0]["reduction_type"] == "" + assert rows[1]["method"] == "RDKit" + assert rows[1]["reduction_type"] == "" + assert rows[2]["method"] == "PCA" + assert rows[2]["reduction_type"] == "PCA" + assert rows[2]["level"] == "85" + assert DummyPipeline.calls[0]["reduction_type"] == "morgan_r2_2048" + assert DummyPipeline.calls[1]["reduction_type"] == "rdkit_descriptors" + assert DummyPipeline.calls[2]["reduction_type"] == "PCA" + assert DummyPipeline.calls[2]["level"] == 85 + + +def test_main_run_uses_legacy_registry_metadata(monkeypatch, tmp_path): + DummyPipeline.calls = [] + monkeypatch.setattr(main_module, "Pipeline", DummyPipeline) + + raw_dir = tmp_path / "raw" + data_dir = tmp_path / "data" + output_dir = tmp_path / "output" + label_path, y = _write_labels(raw_dir) + _write_npz(data_dir, "PCA70", np.ones((8, 2)), y) + config_path = tmp_path / "legacy.toml" + config_path.write_text(f''' +[paths] +input = "{raw_dir.as_posix()}/" +dataset = "{data_dir.as_posix()}/" +output = "{output_dir.as_posix()}/" + +[benchmark] +reduction_types = ["PCA"] +levels = [70] +''') + + Main(user_config=config_path).run() + + rows = _rows(output_dir / "results.csv") + assert rows[0]["dataset"] == "PCA70" + assert rows[0]["family"] == "dimensionality" + assert rows[0]["method"] == "PCA" + assert rows[0]["reduction_type"] == "PCA" + assert rows[0]["level"] == "70" + assert DummyPipeline.calls == [{ + "shape": (8, 2), + "n_labels": 8, + "reduction_type": "PCA", + "level": 70, + }] + + +def test_main_run_missing_registered_dataset_fails(monkeypatch, tmp_path): + DummyPipeline.calls = [] + monkeypatch.setattr(main_module, "Pipeline", DummyPipeline) + + raw_dir = tmp_path / "raw" + data_dir = tmp_path / "data" + output_dir = tmp_path / "output" + label_path, _ = _write_labels(raw_dir) + missing_path = data_dir / "missing.npz" + config_path = tmp_path / "missing.toml" + config_path.write_text(f''' +[paths] +input = "{raw_dir.as_posix()}/" +dataset = "{data_dir.as_posix()}/" +output = 
"{output_dir.as_posix()}/" + +[datasets.maccs] +path = "{missing_path.as_posix()}" +label_path = "{label_path.as_posix()}" +family = "fingerprints" +''') + + with pytest.raises(FileNotFoundError, match="maccs"): + Main(user_config=config_path).run() + assert DummyPipeline.calls == [] diff --git a/tests/test_public_api.py b/tests/test_public_api.py index 12ee216..494ce86 100644 --- a/tests/test_public_api.py +++ b/tests/test_public_api.py @@ -9,14 +9,9 @@ def test_config_importable_from_melite(): assert Config is not None -def test_load_dataset_importable_from_melite(): - from melite import load_dataset - assert callable(load_dataset) - - -def test_result_manager_importable_from_melite(): - from melite import ResultManager - assert ResultManager is not None +def test_load_datasets_importable_from_melite(): + from melite import load_datasets + assert callable(load_datasets) def test_plot_cv_distributions_importable_from_melite(): @@ -37,8 +32,7 @@ def test_version_importable_from_melite(): def test_dunder_all_contains_expected_symbols(): expected = { "Config", - "load_dataset", - "ResultManager", + "load_datasets", "plot_cv_distributions", "predict", "__version__", @@ -50,4 +44,11 @@ def test_private_helpers_not_in_dunder_all(): assert "_load_toml" not in melite.__all__ assert "_deep_merge" not in melite.__all__ assert "_scatter_with_jitter" not in melite.__all__ + assert "load_dataset" not in melite.__all__ + assert "ResultManager" not in melite.__all__ assert "Pipeline" not in melite.__all__ + + +def test_removed_top_level_symbols_not_exposed(): + assert not callable(getattr(melite, "load_dataset", None)) + assert not hasattr(melite, "ResultManager") diff --git a/tests/test_result_manager.py b/tests/test_result_manager.py index 8782d1a..3b031a5 100644 --- a/tests/test_result_manager.py +++ b/tests/test_result_manager.py @@ -57,6 +57,9 @@ def test_write_csv_correct_fieldnames(tmp_path): rm.write_csv(SAMPLE_ROWS, csv_path) with open(csv_path, encoding="utf-8") as f: 
reader = csv.DictReader(f) + assert "dataset" in reader.fieldnames + assert "family" in reader.fieldnames + assert "method" in reader.fieldnames assert "smoke" in reader.fieldnames assert "f1_macro" in reader.fieldnames