def convert_units(df, source_units=None, units=None):
    """Convert biomarker columns to the registry's canonical units.

    Parameters
    ----------
    df : DataFrame
        DataFrame with biomarker columns (samples as rows).
    source_units : str, optional
        Named source preset (e.g. 'ukbiobank'). Applies the preset unit
        mapping for every biomarker listed by that source.
    units : dict, optional
        Per-biomarker unit overrides. Keys are biomarker names, values
        are source unit strings (e.g. ``{"creatinine": "umol/L"}``).
        Entries here take precedence over the ``source_units`` preset.

    Returns
    -------
    DataFrame
        Copy of ``df`` with converted columns. The input is never mutated,
        even when no conversion is requested.

    Raises
    ------
    ValueError
        If ``source_units`` is not a known preset, or if a requested
        source unit has no registered conversion for its biomarker.
    """
    # Preset first, explicit overrides second, so ``units`` wins on conflict.
    unit_map = {}
    if source_units is not None:
        unit_map.update(BIOMARKER_REGISTRY.get_source_preset(source_units))
    if units is not None:
        unit_map.update(units)

    result = df.copy()
    for biomarker, source_unit in unit_map.items():
        # A mapping may mention biomarkers this frame does not carry
        # (presets usually do); those are skipped silently.
        if biomarker not in result.columns:
            continue
        if biomarker not in BIOMARKER_REGISTRY:
            warnings.warn(
                f"Biomarker '{biomarker}' not in registry, skipping conversion.",
                stacklevel=2,  # attribute the warning to the caller
            )
            continue

        entry = BIOMARKER_REGISTRY.get(biomarker)
        if source_unit == entry["unit"]:
            continue  # already in canonical units

        if source_unit not in entry["conversions"]:
            raise ValueError(
                f"No conversion from '{source_unit}' to '{entry['unit']}' "
                f"for biomarker '{biomarker}'. "
                f"Known source units: {list(entry['conversions'].keys())}"
            )

        converter = entry["conversions"][source_unit]
        # Registry converters are plain arithmetic, so they can be applied
        # to the whole column at once instead of element by element.
        result[biomarker] = converter(result[biomarker])

    return result


def validate_ranges(df, warn=True):
    """Count biomarker values that fall outside the registry's valid range.

    Only columns whose name is a registered biomarker are checked; every
    other column is ignored. Missing values (NaN) compare false against
    both bounds and are therefore not counted.

    Parameters
    ----------
    df : DataFrame
        DataFrame with biomarker columns (samples as rows).
    warn : bool
        If True, emit one warning per biomarker that has out-of-range values.

    Returns
    -------
    dict
        Mapping of biomarker name to count of out-of-range values.
        Biomarkers with no out-of-range values are omitted.
    """
    out_of_range = {}
    for col in df.columns:
        if col not in BIOMARKER_REGISTRY:
            continue
        lo, hi = BIOMARKER_REGISTRY.valid_range(col)
        mask = (df[col] < lo) | (df[col] > hi)
        count = mask.sum()
        if count > 0:
            out_of_range[col] = int(count)
            if warn:
                warnings.warn(
                    f"Biomarker '{col}': {count} values outside expected "
                    f"range [{lo}, {hi}] (unit: {BIOMARKER_REGISTRY.canonical_unit(col)}). "
                    f"Check units.",
                    stacklevel=2,  # attribute the warning to the caller
                )
    return out_of_range
+""" + +_REGISTRY = { + "albumin": { + "unit": "g/dL", + "range": (1.0, 6.0), + "description": "Serum albumin", + "conversions": { + "g/L": lambda x: x / 10.0, + }, + }, + "creatinine": { + "unit": "mg/dL", + "range": (0.1, 15.0), + "description": "Serum creatinine", + "conversions": { + "umol/L": lambda x: x / 88.42, + }, + }, + "glucose": { + "unit": "mmol/L", + "range": (1.0, 40.0), + "description": "Fasting glucose", + "conversions": { + "mg/dL": lambda x: x * 0.05551, + }, + }, + "c_reactive_protein": { + "unit": "mg/dL", + "range": (0.01, 30.0), + "description": "C-reactive protein", + "conversions": { + "mg/L": lambda x: x / 10.0, + "nmol/L": lambda x: x / 95.24, + }, + }, + "white_blood_cell_count": { + "unit": "1000 cells/uL", + "range": (1.0, 50.0), + "description": "White blood cell count", + "conversions": {}, + }, + "lymphocyte_percent": { + "unit": "%", + "range": (1.0, 80.0), + "description": "Lymphocyte percentage", + "conversions": {}, + }, + "red_blood_cell_distribution_width": { + "unit": "%", + "range": (8.0, 30.0), + "description": "Red blood cell distribution width", + "conversions": {}, + }, + "mean_cell_volume": { + "unit": "fL", + "range": (50.0, 130.0), + "description": "Mean corpuscular volume", + "conversions": {}, + }, + "alkaline_phosphate": { + "unit": "U/L", + "range": (10.0, 500.0), + "description": "Alkaline phosphatase", + "conversions": {}, + }, + "hdl_cholesterol": { + "unit": "mmol/L", + "range": (0.2, 5.0), + "description": "HDL cholesterol", + "conversions": { + "mg/dL": lambda x: x / 38.67, + }, + }, + "hemoglobin": { + "unit": "g/dL", + "range": (4.0, 22.0), + "description": "Hemoglobin", + "conversions": { + "g/L": lambda x: x / 10.0, + }, + }, + "platelet_count": { + "unit": "1000 cells/uL", + "range": (10.0, 1000.0), + "description": "Platelet count", + "conversions": {}, + }, + "mean_cell_hemoglobin": { + "unit": "pg", + "range": (15.0, 45.0), + "description": "Mean corpuscular hemoglobin", + "conversions": {}, + }, + 
"basophil_percent": { + "unit": "%", + "range": (0.0, 10.0), + "description": "Basophil percentage", + "conversions": {}, + }, + "lymphocyte_number": { + "unit": "1000 cells/uL", + "range": (0.1, 20.0), + "description": "Lymphocyte count", + "conversions": {}, + }, + "red_blood_cell_count": { + "unit": "million cells/uL", + "range": (1.0, 10.0), + "description": "Red blood cell count", + "conversions": {}, + }, +} + +# Source unit presets for common data sources +_SOURCE_PRESETS = { + "nhanes": {}, # NHANES already uses canonical units after load.py processing + "ukbiobank": { + "creatinine": "umol/L", + "c_reactive_protein": "mg/L", + "albumin": "g/L", + "hemoglobin": "g/L", + "hdl_cholesterol": "mg/dL", + }, +} + + +class BiomarkerRegistry: + """Registry of canonical biomarker definitions for clinical clocks. + + Provides lookup of units, valid ranges, and conversion functions + for all biomarkers used by clinical aging clocks. + """ + + def __init__(self, registry=None): + self._registry = registry or _REGISTRY + + def get(self, name): + """Get biomarker definition by canonical name. + + Parameters + ---------- + name : str + Canonical biomarker name (e.g. 'albumin', 'creatinine'). + + Returns + ------- + dict + Biomarker definition with keys: unit, range, description, conversions. + + Raises + ------ + KeyError + If the biomarker name is not in the registry. + """ + if name not in self._registry: + raise KeyError( + f"Unknown biomarker: '{name}'. 
" + f"Known biomarkers: {', '.join(sorted(self._registry.keys()))}" + ) + return self._registry[name] + + def canonical_unit(self, name): + """Return the canonical unit for a biomarker.""" + return self.get(name)["unit"] + + def valid_range(self, name): + """Return the (min, max) valid range for a biomarker.""" + return self.get(name)["range"] + + def known_biomarkers(self): + """Return sorted list of all known biomarker names.""" + return sorted(self._registry.keys()) + + def get_source_preset(self, source_name): + """Return unit mapping for a named data source. + + Parameters + ---------- + source_name : str + Source preset name (e.g. 'nhanes', 'ukbiobank'). + + Returns + ------- + dict + Mapping of biomarker name to source unit string. + """ + if source_name not in _SOURCE_PRESETS: + raise ValueError( + f"Unknown source preset: '{source_name}'. " + f"Known presets: {', '.join(sorted(_SOURCE_PRESETS.keys()))}" + ) + return _SOURCE_PRESETS[source_name] + + def __contains__(self, name): + return name in self._registry + + def __len__(self): + return len(self._registry) + + +BIOMARKER_REGISTRY = BiomarkerRegistry() diff --git a/biolearn/data_library.py b/biolearn/data_library.py index dc3eeae..4f2f034 100644 --- a/biolearn/data_library.py +++ b/biolearn/data_library.py @@ -183,6 +183,7 @@ def __init__( rna=None, protein_alamar=None, protein_olink=None, + clinical=None, ): """ Initializes the GeoData instance. @@ -190,12 +191,15 @@ def __init__( Args: metadata (DataFrame): Metadata associated with genomic samples. dnam (DataFrame): Methylation data associated with genomic samples. + clinical (DataFrame): Clinical biomarker data with features as rows + and samples as columns (same orientation as dnam). 
""" self.metadata = metadata self.dnam = dnam self.rna = rna self.protein_alamar = protein_alamar self.protein_olink = protein_olink + self.clinical = clinical def _validate_metadata_omics_consistency(self): """Validate that metadata exists for all omics samples and vice versa.""" @@ -216,6 +220,9 @@ def _validate_metadata_omics_consistency(self): if self.protein_olink is not None: omics_samples.update(self.protein_olink.columns) omics_types.append("protein_olink") + if self.clinical is not None: + omics_samples.update(self.clinical.columns) + omics_types.append("clinical") if not omics_samples: return @@ -266,6 +273,11 @@ def copy(self): if self.protein_olink is not None else None ), + clinical=( + self.clinical.copy(deep=True) + if self.clinical is not None + else None + ), ) def quality_report(self, sites=None): @@ -358,6 +370,60 @@ def from_methylation_matrix(cls, matrix): return cls(metadata, dnam) + @classmethod + def from_clinical_matrix(cls, df, source_units=None, units=None): + """Creates a GeoData instance from a clinical biomarker DataFrame. + + Separates metadata columns (age, sex, mortality) from biomarker + columns, converts units if needed, and transposes biomarkers to + features-as-rows (matching GeoData's internal convention). + + Parameters + ---------- + df : DataFrame + DataFrame with samples as rows and biomarkers/metadata as columns. + Index should be sample identifiers. + source_units : str, optional + Named source preset for unit conversion (e.g. 'ukbiobank'). + units : dict, optional + Per-biomarker unit overrides (e.g. ``{"creatinine": "umol/L"}``). + + Returns + ------- + GeoData + Instance with clinical and metadata layers populated. 
+ """ + from biolearn.clinical.convert import convert_units, validate_ranges + + df = df.copy() + + # Separate metadata columns from biomarker columns + metadata_cols = ["age", "sex", "is_dead", "months_until_death"] + existing_meta = [c for c in metadata_cols if c in df.columns] + biomarker_cols = [c for c in df.columns if c not in metadata_cols] + + metadata = ( + df[existing_meta] + if existing_meta + else pd.DataFrame(index=df.index) + ) + + biomarkers = df[biomarker_cols] + + # Convert units if requested + if source_units is not None or units is not None: + biomarkers = convert_units( + biomarkers, source_units=source_units, units=units + ) + + # Warn about out-of-range values + validate_ranges(biomarkers, warn=True) + + # Transpose to features-as-rows, samples-as-columns + clinical = biomarkers.T + + return cls(metadata=metadata, clinical=clinical) + def save_csv(self, folder_path, name): """ Saves the GeoData instance to CSV files according to the DNA Methylation Array Data Standard V-2410. 
def load_nhanes_as_geodata(year):
    """Load an NHANES cycle and wrap it in a GeoData object.

    The frame returned by ``load_nhanes(year)`` is passed unchanged to
    ``GeoData.from_clinical_matrix()``, with no unit conversion requested.

    Parameters
    ----------
    year : int
        NHANES cycle year, forwarded as-is to ``load_nhanes``.

    Returns
    -------
    GeoData
        GeoData with ``clinical`` and ``metadata`` layers populated.
    """
    # Imported at call time rather than at module import.
    from biolearn.data_library import GeoData

    return GeoData.from_clinical_matrix(load_nhanes(year))
@@ -1662,6 +1696,12 @@ class LinearTranscriptomicModel(LinearModel): def _get_data_matrix(self, geo_data): return geo_data.rna + def required_features(self): + features = [ + idx for idx in self.coefficients.index if idx != "intercept" + ] + return {"layer": "rna", "features": features, "metadata": []} + class GrimageModel: def __init__(self, coefficient_file, **details): @@ -1803,6 +1843,13 @@ def methylation_sites(self): unique_vars = set(filtered_df["var"]) - {"Intercept", "Age", "Female"} return list(unique_vars) + def required_features(self): + return { + "layer": "dnam", + "features": self.methylation_sites(), + "metadata": ["age", "sex"], + } + class LinearMultipartProteomicModel: def __init__( @@ -1868,6 +1915,15 @@ def predict(self, geo_data): # Apply transformation to results return self.transform(pd.DataFrame(results)) + def required_features(self): + proteins = list( + self.coefficients.loc[ + self.coefficients["Protein"].str.lower() != "intercept", + "Protein", + ].unique() + ) + return {"layer": "protein_olink", "features": proteins, "metadata": []} + def methylation_sites(self): return [] @@ -1931,6 +1987,13 @@ def predict(self, geo_data): return pred_df + def required_features(self): + return { + "layer": "dnam", + "features": list(self.coefficients.index), + "metadata": [], + } + def methylation_sites(self): return list(self.coefficients.index) @@ -1970,6 +2033,13 @@ def predict(self, geo_data): return pd.DataFrame(vals, index=dnam.columns, columns=["Predicted"]) + def required_features(self): + return { + "layer": "dnam", + "features": list(self.CpG_names), + "metadata": [], + } + def methylation_sites(self): return list(self.CpG_names) @@ -2232,6 +2302,13 @@ def predict( except Exception as e: raise Exception(f"API error: {str(e)}") + def required_features(self): + return { + "layer": "dnam", + "features": self.required_cpgs if self.required_cpgs else [], + "metadata": ["age", "sex"], + } + def methylation_sites(self): """Return list of required 
CpG sites for imputation compatibility.""" return self.required_cpgs if self.required_cpgs else [] @@ -2283,6 +2360,9 @@ def predict(self, geo_data): predictions, index=dnam.columns, columns=["Predicted"] ) + def required_features(self): + return {"layer": "dnam", "features": self._sites, "metadata": []} + def methylation_sites(self): return self._sites @@ -2434,6 +2514,9 @@ def predict(self, geo_data): predictions, index=methylation_data.columns, columns=["Predicted"] ) + def required_features(self): + return {"layer": "dnam", "features": self.cpg_sites, "metadata": []} + def methylation_sites(self): """Return list of required CpG sites""" return self.cpg_sites diff --git a/biolearn/test/test_clinical_layer.py b/biolearn/test/test_clinical_layer.py new file mode 100644 index 0000000..7b77c9d --- /dev/null +++ b/biolearn/test/test_clinical_layer.py @@ -0,0 +1,171 @@ +import os +import numpy as np +import pandas as pd +import pytest +from biolearn.data_library import GeoData + + +def _make_clinical_df(): + """Create a small clinical DataFrame (samples as rows).""" + return pd.DataFrame( + { + "age": [45, 62, 38], + "sex": [1, 0, 1], + "albumin": [4.2, 3.8, 4.5], + "creatinine": [0.9, 1.1, 0.8], + "glucose": [5.1, 6.2, 4.8], + "white_blood_cell_count": [6.5, 8.0, 5.5], + "lymphocyte_percent": [30.0, 25.0, 35.0], + "mean_cell_volume": [88.0, 92.0, 86.0], + "red_blood_cell_distribution_width": [12.5, 14.0, 12.0], + "alkaline_phosphate": [65.0, 80.0, 55.0], + }, + index=["P1", "P2", "P3"], + ) + + +def test_geodata_with_clinical_layer(): + """GeoData accepts a clinical DataFrame.""" + clinical = pd.DataFrame( + {"P1": [4.2, 0.9], "P2": [3.8, 1.1]}, + index=["albumin", "creatinine"], + ) + metadata = pd.DataFrame({"age": [45, 62]}, index=["P1", "P2"]) + geo = GeoData(metadata=metadata, clinical=clinical) + + assert geo.clinical is not None + assert list(geo.clinical.columns) == ["P1", "P2"] + assert list(geo.clinical.index) == ["albumin", "creatinine"] + assert geo.dnam 
is None + + +def test_geodata_clinical_defaults_to_none(): + """Clinical layer defaults to None for backward compatibility.""" + metadata = pd.DataFrame({"age": [45]}, index=["P1"]) + geo = GeoData(metadata=metadata) + assert geo.clinical is None + + +def test_from_clinical_matrix_basic(): + """from_clinical_matrix separates metadata and transposes biomarkers.""" + df = _make_clinical_df() + geo = GeoData.from_clinical_matrix(df) + + # Metadata should contain age and sex + assert "age" in geo.metadata.columns + assert "sex" in geo.metadata.columns + assert len(geo.metadata) == 3 + + # Clinical should be features-as-rows, samples-as-columns + assert geo.clinical is not None + assert set(geo.clinical.columns) == {"P1", "P2", "P3"} + assert "albumin" in geo.clinical.index + assert "creatinine" in geo.clinical.index + + # Metadata columns should NOT be in clinical + assert "age" not in geo.clinical.index + assert "sex" not in geo.clinical.index + + +def test_from_clinical_matrix_preserves_values(): + """Values survive the transpose correctly.""" + df = _make_clinical_df() + geo = GeoData.from_clinical_matrix(df) + + assert geo.clinical.loc["albumin", "P1"] == 4.2 + assert geo.clinical.loc["creatinine", "P2"] == 1.1 + assert geo.metadata.loc["P1", "age"] == 45 + + +def test_from_clinical_matrix_no_metadata_cols(): + """Works when input has no metadata columns.""" + df = pd.DataFrame( + {"albumin": [4.2, 3.8], "creatinine": [0.9, 1.1]}, + index=["P1", "P2"], + ) + geo = GeoData.from_clinical_matrix(df) + + assert len(geo.metadata.columns) == 0 + assert geo.clinical is not None + assert "albumin" in geo.clinical.index + + +def test_from_clinical_matrix_unit_conversion(): + """Unit conversion via the units parameter works.""" + df = pd.DataFrame( + {"creatinine": [79.56, 97.24]}, # umol/L values + index=["P1", "P2"], + ) + geo = GeoData.from_clinical_matrix(df, units={"creatinine": "umol/L"}) + + # Should be converted to mg/dL (divide by 88.42) + converted = 
geo.clinical.loc["creatinine", "P1"] + assert abs(converted - 79.56 / 88.42) < 0.01 + + +def test_from_clinical_matrix_source_preset(): + """source_units preset applies correct conversions.""" + df = pd.DataFrame( + { + "albumin": [42.0], # g/L (UK Biobank) + "creatinine": [88.42], # umol/L + }, + index=["P1"], + ) + geo = GeoData.from_clinical_matrix(df, source_units="ukbiobank") + + # albumin: 42 g/L -> 4.2 g/dL + assert abs(geo.clinical.loc["albumin", "P1"] - 4.2) < 0.01 + # creatinine: 88.42 umol/L -> 1.0 mg/dL + assert abs(geo.clinical.loc["creatinine", "P1"] - 1.0) < 0.01 + + +def test_copy_preserves_clinical(): + """GeoData.copy() deep-copies the clinical layer.""" + df = _make_clinical_df() + geo = GeoData.from_clinical_matrix(df) + geo_copy = geo.copy() + + # Modify original + geo.clinical.iloc[0, 0] = -999 + + # Copy should be unaffected + assert geo_copy.clinical.iloc[0, 0] != -999 + + +def test_save_load_roundtrip_with_clinical(tmp_path): + """Clinical data survives save_csv / load_csv roundtrip.""" + df = _make_clinical_df() + geo = GeoData.from_clinical_matrix(df) + + folder = str(tmp_path) + geo.save_csv(folder, "test") + + # Verify clinical file was created + assert os.path.exists(os.path.join(folder, "test_clinical.csv")) + + # Load it back + loaded = GeoData.load_csv(folder, "test", validate=False) + assert loaded.clinical is not None + assert set(loaded.clinical.index) == set(geo.clinical.index) + assert set(loaded.clinical.columns) == set(geo.clinical.columns) + + # Values should match + pd.testing.assert_frame_equal( + loaded.clinical.sort_index(axis=0).sort_index(axis=1), + geo.clinical.sort_index(axis=0).sort_index(axis=1), + atol=1e-10, + ) + + +def test_validate_metadata_omics_includes_clinical(): + """Validation includes clinical samples in consistency check.""" + clinical = pd.DataFrame( + {"P1": [4.2], "P2": [3.8], "P3": [4.5]}, + index=["albumin"], + ) + metadata = pd.DataFrame({"age": [45, 62]}, index=["P1", "P2"]) + geo = 
GeoData(metadata=metadata, clinical=clinical) + + with pytest.warns(UserWarning, match="without metadata"): + geo._validate_metadata_omics_consistency() diff --git a/biolearn/test/test_registry.py b/biolearn/test/test_registry.py new file mode 100644 index 0000000..cf4b3f4 --- /dev/null +++ b/biolearn/test/test_registry.py @@ -0,0 +1,109 @@ +import pandas as pd +import pytest +from biolearn.clinical.registry import BIOMARKER_REGISTRY +from biolearn.clinical.convert import convert_units, validate_ranges + + +class TestBiomarkerRegistry: + def test_known_biomarkers_not_empty(self): + assert len(BIOMARKER_REGISTRY) > 0 + + def test_get_albumin(self): + entry = BIOMARKER_REGISTRY.get("albumin") + assert entry["unit"] == "g/dL" + assert "range" in entry + assert "conversions" in entry + + def test_get_unknown_raises(self): + with pytest.raises(KeyError, match="Unknown biomarker"): + BIOMARKER_REGISTRY.get("nonexistent_biomarker") + + def test_canonical_unit(self): + assert BIOMARKER_REGISTRY.canonical_unit("glucose") == "mmol/L" + assert BIOMARKER_REGISTRY.canonical_unit("creatinine") == "mg/dL" + + def test_valid_range(self): + lo, hi = BIOMARKER_REGISTRY.valid_range("albumin") + assert lo < hi + + def test_contains(self): + assert "albumin" in BIOMARKER_REGISTRY + assert "fake_marker" not in BIOMARKER_REGISTRY + + def test_known_biomarkers_list(self): + names = BIOMARKER_REGISTRY.known_biomarkers() + assert isinstance(names, list) + assert "albumin" in names + assert names == sorted(names) # should be sorted + + def test_source_preset_nhanes(self): + preset = BIOMARKER_REGISTRY.get_source_preset("nhanes") + assert isinstance(preset, dict) + + def test_source_preset_ukbiobank(self): + preset = BIOMARKER_REGISTRY.get_source_preset("ukbiobank") + assert "creatinine" in preset + assert preset["creatinine"] == "umol/L" + + def test_source_preset_unknown_raises(self): + with pytest.raises(ValueError, match="Unknown source preset"): + 
class TestValidateRanges:
    """Tests for ``validate_ranges``: returned counts and warning behaviour."""

    def test_in_range_no_warnings(self):
        # Run with warn=True and promote UserWarning to an error, so the
        # test fails if an in-range value triggers a warning.
        import warnings

        df = pd.DataFrame({"albumin": [4.0]}, index=["P1"])
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            result = validate_ranges(df, warn=True)
        assert len(result) == 0

    def test_out_of_range_detected(self):
        df = pd.DataFrame({"albumin": [0.1]}, index=["P1"])  # below range
        result = validate_ranges(df, warn=False)
        assert "albumin" in result
        assert result["albumin"] == 1

    def test_out_of_range_warns(self):
        # The warn=True path must emit a warning as well as count the value.
        df = pd.DataFrame({"albumin": [0.1]}, index=["P1"])  # below range
        with pytest.warns(UserWarning, match="outside expected range"):
            result = validate_ranges(df, warn=True)
        assert result == {"albumin": 1}

    def test_unknown_columns_ignored(self):
        df = pd.DataFrame({"unknown_col": [999]}, index=["P1"])
        result = validate_ranges(df, warn=False)
        assert len(result) == 0
"protein_olink", + "clinical", + } + assert ( + result["layer"] in valid_layers + ), f"{model_name}: 'layer' must be one of {valid_layers}, got '{result['layer']}'" + + +@pytest.mark.parametrize( + "model_name, model_entry", model.model_definitions.items() +) +def test_required_features_consistency_with_methylation_sites( + model_name, model_entry +): + """For dnam models, required_features() features should match methylation_sites().""" + model_type = model_entry["model"]["type"] + + if model_type in ["NotImplemented", "HurdleAPIModel"]: + pytest.skip(f"Model type {model_type} requires special setup") + + model_class = getattr(model, model_type) + instance = model_class.from_definition(model_entry) + + if not hasattr(instance, "methylation_sites"): + pytest.skip(f"{model_name} does not have methylation_sites()") + + result = instance.required_features() + sites = instance.methylation_sites() + + if result["layer"] == "dnam" and sites: + assert set(result["features"]) == set( + sites + ), f"{model_name}: required_features() features should match methylation_sites()"