bio-learn · marcbal77 · Mar 29, 2026
diff --git a/biolearn/clinical/__init__.py b/biolearn/clinical/__init__.py
@@ -0,0 +1,4 @@
+from biolearn.clinical.registry import BIOMARKER_REGISTRY
+from biolearn.clinical.convert import convert_units, validate_ranges
+
+__all__ = ["BIOMARKER_REGISTRY", "convert_units", "validate_ranges"]  # public API
diff --git a/biolearn/clinical/convert.py b/biolearn/clinical/convert.py
@@ -0,0 +1,99 @@
+"""Unit conversion utilities for clinical biomarker data."""
+
+import warnings
+import pandas as pd
+from biolearn.clinical.registry import BIOMARKER_REGISTRY
+
+
+def convert_units(df, source_units=None, units=None):
+    """Convert biomarker columns to canonical units.
+
+    Parameters
+    ----------
+    df : DataFrame
+        DataFrame with biomarker columns (samples as rows).
+    source_units : str, optional
+        Named source preset (e.g. 'ukbiobank'). Applies preset unit
+        mappings for all biomarkers from that source.
+    units : dict, optional
+        Per-biomarker unit overrides. Keys are biomarker names, values
+        are source unit strings (e.g. ``{"creatinine": "umol/L"}``).
+        Overrides any preset from ``source_units``.
+
+    Returns
+    -------
+    DataFrame
+        Copy of df with converted columns; ``df`` is left untouched.
+
+    Raises
+    ------
+    ValueError
+        If ``source_units`` is not a known preset, or a specified unit
+        has no known conversion for a registered biomarker in ``df``.
+    """
+    unit_map = {}
+    if source_units is not None:
+        unit_map.update(BIOMARKER_REGISTRY.get_source_preset(source_units))
+    if units is not None:
+        unit_map.update(units)
+
+    result = df.copy()
+    for biomarker, source_unit in unit_map.items():
+        if biomarker not in result.columns:
+            continue
+        if biomarker not in BIOMARKER_REGISTRY:
+            warnings.warn(
+                f"Biomarker '{biomarker}' not in registry, skipping conversion.",
+                stacklevel=2,  # report the caller's line, not this module
+            )
+            continue
+
+        entry = BIOMARKER_REGISTRY.get(biomarker)
+        if source_unit == entry["unit"]:
+            continue  # already in canonical units
+
+        if source_unit not in entry["conversions"]:
+            raise ValueError(
+                f"No conversion from '{source_unit}' to '{entry['unit']}' "
+                f"for biomarker '{biomarker}'. "
+                f"Known source units: {list(entry['conversions'].keys())}"
+            )
+
+        converter = entry["conversions"][source_unit]
+        # Converters are plain arithmetic: run on the whole column, not per row.
+        result[biomarker] = converter(result[biomarker])
+
+    return result
+
+
+def validate_ranges(df, warn=True):
+    """Check biomarker values against expected ranges.
+
+    Parameters
+    ----------
+    df : DataFrame
+        DataFrame with biomarker columns (samples as rows).
+    warn : bool
+        If True, emit warnings for out-of-range values.
+
+    Returns
+    -------
+    dict
+        Mapping of biomarker name to count of out-of-range values.
+    """
+    out_of_range = {}
+    for col in df.columns:
+        if col not in BIOMARKER_REGISTRY:
+            continue  # not a registered biomarker, nothing to check
+        lo, hi = BIOMARKER_REGISTRY.valid_range(col)
+        count = int(((df[col] < lo) | (df[col] > hi)).sum())
+        if count > 0:
+            out_of_range[col] = count
+            if warn:
+                warnings.warn(
+                    f"Biomarker '{col}': {count} values outside expected "
+                    f"range [{lo}, {hi}] (unit: {BIOMARKER_REGISTRY.canonical_unit(col)}). "
+                    f"Check units.",
+                    stacklevel=2,  # report the caller's line, not this module
+                )
+    return out_of_range
diff --git a/biolearn/clinical/registry.py b/biolearn/clinical/registry.py
@@ -0,0 +1,206 @@
+"""Biomarker registry defining canonical names, units, valid ranges, and conversions.
+
+The canonical units match NHANES conventions established in biolearn.load.
+All clinical clocks expect data in these units.
+"""
+
+_REGISTRY = {  # biomarker name -> unit, range, description, conversions
+    "albumin": {
+        "unit": "g/dL",  # canonical unit that conversions produce
+        "range": (1.0, 6.0),  # (min, max) bounds used by validate_ranges
+        "description": "Serum albumin",
+        "conversions": {  # source unit string -> callable giving canonical value
+            "g/L": lambda x: x / 10.0,
+        },
+    },
+    "creatinine": {
+        "unit": "mg/dL",
+        "range": (0.1, 15.0),
+        "description": "Serum creatinine",
+        "conversions": {
+            "umol/L": lambda x: x / 88.42,
+        },
+    },
+    "glucose": {
+        "unit": "mmol/L",
+        "range": (1.0, 40.0),
+        "description": "Fasting glucose",
+        "conversions": {
+            "mg/dL": lambda x: x * 0.05551,
+        },
+    },
+    "c_reactive_protein": {
+        "unit": "mg/dL",
+        "range": (0.01, 30.0),
+        "description": "C-reactive protein",
+        "conversions": {
+            "mg/L": lambda x: x / 10.0,
+            "nmol/L": lambda x: x / 95.24,
+        },
+    },
+    "white_blood_cell_count": {
+        "unit": "1000 cells/uL",
+        "range": (1.0, 50.0),
+        "description": "White blood cell count",
+        "conversions": {},  # empty: no alternative source units registered
+    },
+    "lymphocyte_percent": {
+        "unit": "%",
+        "range": (1.0, 80.0),
+        "description": "Lymphocyte percentage",
+        "conversions": {},
+    },
+    "red_blood_cell_distribution_width": {
+        "unit": "%",
+        "range": (8.0, 30.0),
+        "description": "Red blood cell distribution width",
+        "conversions": {},
+    },
+    "mean_cell_volume": {
+        "unit": "fL",
+        "range": (50.0, 130.0),
+        "description": "Mean corpuscular volume",
+        "conversions": {},
+    },
+    "alkaline_phosphate": {  # NOTE(review): key says "phosphate", description "phosphatase" - confirm
+        "unit": "U/L",
+        "range": (10.0, 500.0),
+        "description": "Alkaline phosphatase",
+        "conversions": {},
+    },
+    "hdl_cholesterol": {
+        "unit": "mmol/L",
+        "range": (0.2, 5.0),
+        "description": "HDL cholesterol",
+        "conversions": {
+            "mg/dL": lambda x: x / 38.67,
+        },
+    },
+    "hemoglobin": {
+        "unit": "g/dL",
+        "range": (4.0, 22.0),
+        "description": "Hemoglobin",
+        "conversions": {
+            "g/L": lambda x: x / 10.0,
+        },
+    },
+    "platelet_count": {
+        "unit": "1000 cells/uL",
+        "range": (10.0, 1000.0),
+        "description": "Platelet count",
+        "conversions": {},
+    },
+    "mean_cell_hemoglobin": {
+        "unit": "pg",
+        "range": (15.0, 45.0),
+        "description": "Mean corpuscular hemoglobin",
+        "conversions": {},
+    },
+    "basophil_percent": {
+        "unit": "%",
+        "range": (0.0, 10.0),
+        "description": "Basophil percentage",
+        "conversions": {},
+    },
+    "lymphocyte_number": {
+        "unit": "1000 cells/uL",
+        "range": (0.1, 20.0),
+        "description": "Lymphocyte count",
+        "conversions": {},
+    },
+    "red_blood_cell_count": {
+        "unit": "million cells/uL",
+        "range": (1.0, 10.0),
+        "description": "Red blood cell count",
+        "conversions": {},
+    },
+}
+
+# Source unit presets for common data sources (biomarker name -> source unit)
+_SOURCE_PRESETS = {
+    "nhanes": {},  # NHANES already uses canonical units after load.py processing
+    "ukbiobank": {
+        "creatinine": "umol/L",
+        "c_reactive_protein": "mg/L",
+        "albumin": "g/L",
+        "hemoglobin": "g/dL",  # UKB field 30020 is g/dL: already canonical
+        "hdl_cholesterol": "mmol/L",  # UKB field 30760 is mmol/L: already canonical
+    },
+}
+
+
+class BiomarkerRegistry:
+    """Registry of canonical biomarker definitions for clinical clocks.
+
+    Provides lookup of units, valid ranges, and conversion functions.
+    ``registry`` replaces the built-in table (an empty dict is honoured).
+    """
+
+    def __init__(self, registry=None):
+        self._registry = _REGISTRY if registry is None else registry
+
+    def get(self, name):
+        """Get biomarker definition by canonical name.
+
+        Parameters
+        ----------
+        name : str
+            Canonical biomarker name (e.g. 'albumin', 'creatinine').
+
+        Returns
+        -------
+        dict
+            Biomarker definition with keys: unit, range, description, conversions.
+
+        Raises
+        ------
+        KeyError
+            If the biomarker name is not in the registry.
+        """
+        if name not in self._registry:
+            raise KeyError(
+                f"Unknown biomarker: '{name}'. "
+                f"Known biomarkers: {', '.join(sorted(self._registry))}"
+            )
+        return self._registry[name]
+
+    def canonical_unit(self, name):
+        """Return the canonical unit for a biomarker."""
+        return self.get(name)["unit"]
+
+    def valid_range(self, name):
+        """Return the (min, max) valid range for a biomarker."""
+        return self.get(name)["range"]
+
+    def known_biomarkers(self):
+        """Return sorted list of all known biomarker names."""
+        return sorted(self._registry)
+
+    def get_source_preset(self, source_name):
+        """Return unit mapping for a named data source.
+
+        Parameters
+        ----------
+        source_name : str
+            Preset name, e.g. 'nhanes', 'ukbiobank' (ValueError if unknown).
+
+        Returns
+        -------
+        dict
+            Copy of the biomarker name -> source unit string mapping.
+        """
+        if source_name not in _SOURCE_PRESETS:
+            raise ValueError(
+                f"Unknown source preset: '{source_name}'. "
+                f"Known presets: {', '.join(sorted(_SOURCE_PRESETS))}"
+            )
+        return dict(_SOURCE_PRESETS[source_name])  # copy protects the preset
+
+    def __contains__(self, name):
+        return name in self._registry
+
+    def __len__(self):
+        return len(self._registry)
+
+
+BIOMARKER_REGISTRY = BiomarkerRegistry()  # shared default instance over _REGISTRY