NanoBiostructuresRG · NanoBiostructuresRG · May 27, 2026 · May 26, 2026 · May 26, 2026 · May 27, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -43,14 +43,16 @@ jobs:
         run: |
           python -c "
           import melite
-          assert hasattr(melite, '__version__'),           '__version__ missing'
-          assert hasattr(melite, 'Config'),                'Config missing'
-          assert hasattr(melite, 'load_dataset'),          'load_dataset missing'
-          assert hasattr(melite, 'ResultManager'),         'ResultManager missing'
-          assert hasattr(melite, 'plot_cv_distributions'), 'plot_cv_distributions missing'
-          assert hasattr(melite, 'predict'),               'predict missing'
+          expected = ['Config', 'load_datasets', 'plot_cv_distributions', 'predict', '__version__']
+          assert melite.__all__ == expected, melite.__all__
+          for name in expected:
+              assert hasattr(melite, name), f'{name} missing'
+          assert 'load_dataset' not in melite.__all__,     'load_dataset must not be top-level public API'
+          assert 'ResultManager' not in melite.__all__,    'ResultManager must not be top-level public API'
           assert not hasattr(melite, 'Pipeline'),          'Pipeline must not be public'
           assert not hasattr(melite, '_load_toml'),        '_load_toml must not be public'
+          from melite.result_manager import ResultManager
+          assert ResultManager is not None,                'ResultManager internal import missing'
           print(melite.__version__, 'OK')
           "
 
@@ -73,13 +75,15 @@ jobs:
           ../.smoke_venv/bin/melite --version
           ../.smoke_venv/bin/python -c "
           import melite
-          assert hasattr(melite, '__version__'),           '__version__ missing'
-          assert hasattr(melite, 'Config'),                'Config missing'
-          assert hasattr(melite, 'load_dataset'),          'load_dataset missing'
-          assert hasattr(melite, 'ResultManager'),         'ResultManager missing'
-          assert hasattr(melite, 'plot_cv_distributions'), 'plot_cv_distributions missing'
-          assert hasattr(melite, 'predict'),               'predict missing'
+          expected = ['Config', 'load_datasets', 'plot_cv_distributions', 'predict', '__version__']
+          assert melite.__all__ == expected, melite.__all__
+          for name in expected:
+              assert hasattr(melite, name), f'{name} missing'
+          assert 'load_dataset' not in melite.__all__,     'load_dataset must not be top-level public API'
+          assert 'ResultManager' not in melite.__all__,    'ResultManager must not be top-level public API'
           assert not hasattr(melite, 'Pipeline'),          'Pipeline must not be public'
+          from melite.result_manager import ResultManager
+          assert ResultManager is not None,                'ResultManager internal import missing'
           print(melite.__version__, 'wheel OK')
           "
 
@@ -96,12 +100,14 @@ jobs:
           ../.smoke_sdist_venv/bin/melite --version
           ../.smoke_sdist_venv/bin/python -c "
           import melite
-          assert hasattr(melite, '__version__'),           '__version__ missing'
-          assert hasattr(melite, 'Config'),                'Config missing'
-          assert hasattr(melite, 'load_dataset'),          'load_dataset missing'
-          assert hasattr(melite, 'ResultManager'),         'ResultManager missing'
-          assert hasattr(melite, 'plot_cv_distributions'), 'plot_cv_distributions missing'
-          assert hasattr(melite, 'predict'),               'predict missing'
+          expected = ['Config', 'load_datasets', 'plot_cv_distributions', 'predict', '__version__']
+          assert melite.__all__ == expected, melite.__all__
+          for name in expected:
+              assert hasattr(melite, name), f'{name} missing'
+          assert 'load_dataset' not in melite.__all__,     'load_dataset must not be top-level public API'
+          assert 'ResultManager' not in melite.__all__,    'ResultManager must not be top-level public API'
+          from melite.result_manager import ResultManager
+          assert ResultManager is not None,                'ResultManager internal import missing'
           print(melite.__version__, 'sdist OK')
           "
 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,31 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ---
 
+## [0.2.0] - 2026-05-26
+
+### Added
+- Added canonical `[datasets.<dataset_id>]` TOML registry entries for
+  user-defined numeric tabular datasets.
+- Added strict generalized dataset loading through `load_datasets(config)`.
+- Added dataset-aware benchmark result rows with `dataset`, `family`,
+  `method`, `variant`, `level`, and `description` fields.
+- Added dataset-aware final export naming, such as
+  `Model_SVC_morgan_r2_2048.pkl` and `SVC_morgan_r2_2048.png`.
+
+### Changed
+- `melite run` now consumes `cfg.DATASETS` as the canonical execution path.
+- PCA and UMAP inputs are treated as ordinary dataset registry entries.
+- Legacy `[benchmark].reduction_types` and `levels` are normalized into
+  dataset entries when `[datasets]` is absent.
+- `melite export` prefers the new `dataset` column and falls back to legacy
+  `reduction_type` + `level` rows for older CSV files.
+
+### Fixed
+- Registered datasets now fail clearly on missing files, missing `X`,
+  non-2D or non-numeric `X`, X/y length mismatch, and embedded-y mismatch.
+
+---
+
 ## [0.1.11] - 2026-05-26
 
 ### Changed

diff --git a/CITATION.cff b/CITATION.cff
@@ -2,7 +2,7 @@ cff-version: 1.2.0
 message: "If you use this software, please cite it as below."
 type: software
 title: "MELITE: Multi-model Evaluation and Learning for Inference-ready Tabular Experiments"
-version: "0.1.11"
+version: "0.2.0"
 date-released: "2026-05-26"
 authors:
   - family-names: "Contreras-Torres"

diff --git a/README.md b/README.md
@@ -21,7 +21,7 @@ Project: MELITE
 PyPI distribution: melite
 Import package: melite
 CLI: melite
-Version: 0.1.11
+Version: 0.2.0
 License: LGPL-3.0-or-later
 Status: alpha / pre-stable
 ```
@@ -84,7 +84,7 @@ import numpy as np
 from melite import predict
 
 X_new = np.load("examples/sample_PCA70.npz")["X"]
-result = predict("examples/output/Model_SVC_PCA70.pkl", X_new)
+result = predict("examples/output/Model_SVC_sample_pca70.pkl", X_new)
 print(result["predictions"])
 print(result["probabilities"])
 ```
@@ -98,31 +98,41 @@ print(result["probabilities"])
 | Select the best row by F1-macro. | Generate PCA or UMAP reductions from raw data. |
 | Export a final retrained `.pkl` model. | Act as a general AutoML framework. |
 | Run artifact-based inference through `predict()`. | Promise a stable 1.0 API yet. |
-| Handle any numeric tabular matrix. | Use a generalized dataset layer yet; PCA/UMAP naming is historical. |
+| Handle any numeric tabular matrix. | Generate or validate domain-specific descriptors. |
 
-!!! note "Current dataset orchestration"
-    The current dataset orchestration still reflects MELITE's PCA/UMAP origin
-    and uses concepts such as reduction type and level. Future versions will
-    generalize dataset definitions so arbitrary prepared tabular matrices can
-    be registered directly.
-
-Future configuration may look conceptually like this; it is not current
-behavior:
+Datasets are registered as concrete tabular matrix candidates under
+`[datasets.<dataset_id>]`. The `dataset_id` is user-defined and is used in
+`results.csv`, figures, and exported model filenames.
 
 ```toml
-[datasets.morgan]
-path = "data/morgan.npz"
+[datasets.morgan_r2_2048]
+path = "data/morgan_r2_2048.npz"
 label_path = "raw/labels.npy"
+family = "fingerprints"
+method = "Morgan"
+variant = "r2_2048"
 
-[datasets.descriptors]
-path = "data/descriptors.npz"
+[datasets.rdkit_descriptors]
+path = "data/rdkit_descriptors.npz"
 label_path = "raw/labels.npy"
+family = "descriptors"
+method = "RDKit"
 
 [datasets.pca85]
 path = "data/PCA85.npz"
 label_path = "raw/labels.npy"
+family = "dimensionality"
+method = "PCA"
+level = 85
 ```
 
+Each registered dataset must define `path` and `label_path`. Optional metadata
+fields are `family`, `method`, `variant`, `level`, and `description`; they are
+reported for traceability and do not drive special-case model execution.
+Registered datasets are loaded strictly: missing files, missing `X`, non-2D or
+non-numeric `X`, `X`/`y` length mismatches, and embedded `y` mismatches fail the run.
+Legacy `[benchmark].reduction_types` and `levels` configs are still accepted
+and are normalized into equivalent dataset entries such as `PCA70` and `UMAP90`.
 
 ## CLI
 
@@ -148,8 +158,7 @@ melite export --row 0 --force
 
 ```python
 from melite import Config
-from melite import load_dataset
-from melite import ResultManager
+from melite import load_datasets
 from melite import plot_cv_distributions
 from melite import predict
 from melite import __version__
@@ -162,32 +171,32 @@ contract and may change before 0.2.0.
 
 ```text
 raw/labels.npy          <- target vector y, shape (n_samples,)
-data/PCA70.npz          <- required key: X, optional key: y
+data/morgan_r2_2048.npz <- required key: X, optional key: y
+data/rdkit_descriptors.npz
 data/PCA85.npz
-data/UMAP70.npz
-data/UMAP85.npz
+data/UMAP90.npz
 ```
 
 Each `.npz` file must contain an `X` array. If an embedded `y` array is present,
-MELITE validates it against `raw/labels.npy`.
+MELITE validates it against the configured `label_path`.
 
 ## Outputs
 
 ```text
 output/
 |-- results.txt
 |-- results.csv
-|-- Model_<model>_<reduction><level>.pkl
+|-- Model_<model>_<dataset>.pkl
 `-- figures/
-    `-- <model>_<reduction><level>.png
+    `-- <model>_<dataset>.png
 ```
 
 Local inputs and generated artifacts such as `raw/`, `data/`, `output/`,
 `.pkl`, and `.joblib` files are intentionally ignored by Git.
 
 ## Validation
 
-The current `dev/v0.1.11` branch targets:
+The current `dev/v0.2.0` branch targets:
 
 ```bash
 python -m pytest tests/ -v --basetemp=.review_pytest_tmp -o cache_dir=.review_pytest_cache

diff --git a/docs/api.md b/docs/api.md
@@ -1,13 +1,12 @@
 # API Reference
 
-MELITE exposes an intended public API through six symbols. The project is
-pre-stable, so this API may change before 0.2.0. Internal modules are importable
+MELITE exposes an intended public API through five symbols. The project is
+pre-stable, so this API may change before 1.0. Internal modules are importable
 directly but are not part of the public contract.
 
 ```python
 from melite import Config
-from melite import load_dataset
-from melite import ResultManager
+from melite import load_datasets
 from melite import plot_cv_distributions
 from melite import predict
 from melite import __version__
@@ -21,15 +20,9 @@ from melite import __version__
 
 ---
 
-## load_dataset
+## load_datasets
 
-::: melite.load_dataset.load_dataset
-
----
-
-## ResultManager
-
-::: melite.result_manager.ResultManager
+::: melite.load_dataset.load_datasets
 
 ---
 

diff --git a/docs/configuration.md b/docs/configuration.md
@@ -9,8 +9,12 @@ override only the settings that need to change.
 [paths]
 output = "my_output/"
 
-[benchmark]
-levels = [70, 85, 95]
+[datasets.morgan_r2_2048]
+path = "data/morgan_r2_2048.npz"
+label_path = "raw/labels.npy"
+family = "fingerprints"
+method = "Morgan"
+variant = "r2_2048"
 
 [models]
 active = ["svc", "rf"]
@@ -29,41 +33,70 @@ MELITE consumes pre-computed feature matrices and labels:
 
 ```text
 raw/labels.npy          <- target vector y, shape (n_samples,)
-data/PCA70.npz          <- required key: X, optional key: y
+data/morgan_r2_2048.npz <- required key: X, optional key: y
+data/maccs.npz
+data/rdkit_descriptors.npz
 data/PCA85.npz
-data/UMAP70.npz
-data/UMAP85.npz
+data/UMAP90.npz
 ```
 
 Each `.npz` file must contain an `X` array. If an embedded `y` array is present,
-MELITE validates it against `raw/labels.npy` to avoid silent feature-label
+MELITE validates it against the configured `label_path` to avoid silent feature-label
 mismatches.
 
 MELITE is tabular at the modeling level. The learning algorithms only consume
 numeric `X` and `y` arrays, so the feature matrix may come from PCA, UMAP,
 fingerprints, descriptors, clinical variables, experimental measurements,
 industrial features, or manually selected numeric features.
 
-The current dataset orchestration still reflects MELITE's PCA/UMAP origin and
-uses concepts such as reduction type and level. Future versions will generalize
-dataset definitions so arbitrary prepared tabular matrices can be registered
-directly. Future configuration may look conceptually like this; it is not
-current behavior:
+Each concrete matrix candidate is registered under `[datasets.<dataset_id>]`.
+Required fields are `path` and `label_path`. Optional metadata fields are
+`family`, `method`, `variant`, `level`, and `description`; they are preserved
+in reports for traceability and do not control special-case execution logic.
 
 ```toml
-[datasets.morgan]
-path = "data/morgan.npz"
+[datasets.morgan_r2_2048]
+path = "data/morgan_r2_2048.npz"
+label_path = "raw/labels.npy"
+family = "fingerprints"
+method = "Morgan"
+variant = "r2_2048"
+
+[datasets.maccs]
+path = "data/maccs.npz"
 label_path = "raw/labels.npy"
+family = "fingerprints"
+method = "MACCS"
 
-[datasets.descriptors]
-path = "data/descriptors.npz"
+[datasets.rdkit_descriptors]
+path = "data/rdkit_descriptors.npz"
 label_path = "raw/labels.npy"
+family = "descriptors"
+method = "RDKit"
 
 [datasets.pca85]
 path = "data/PCA85.npz"
 label_path = "raw/labels.npy"
+family = "dimensionality"
+method = "PCA"
+level = 85
+
+[datasets.umap90]
+path = "data/UMAP90.npz"
+label_path = "raw/labels.npy"
+family = "dimensionality"
+method = "UMAP"
+level = 90
 ```
 
+Registered datasets are loaded strictly. A missing dataset file, missing
+`label_path`, missing `X`, non-2D `X`, non-numeric `X`, `X`/`y` length mismatch, or
+embedded `y` mismatch raises an error instead of silently skipping the entry.
+
+Legacy `[benchmark].reduction_types` and `levels` remain supported for
+compatibility. When `[datasets]` is absent, MELITE synthesizes entries such as
+`PCA70` and `UMAP90` with dimensionality metadata.
+
 ## Outputs
 
 By default, MELITE writes results under `output/`:
@@ -72,9 +105,9 @@ By default, MELITE writes results under `output/`:
 output/
 |-- results.txt
 |-- results.csv
-|-- Model_<model>_<reduction><level>.pkl
+|-- Model_<model>_<dataset>.pkl
 `-- figures/
-    `-- <model>_<reduction><level>.png
+    `-- <model>_<dataset>.png
 ```
 
 | Output | Purpose |