diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 2bf14d2..d7890e8 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -17,10 +17,28 @@ on: workflow_dispatch: jobs: + tests: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: pip + - run: pip install -e '.[testing]' + - uses: actions/cache@v4 + with: + path: tests/fixtures + key: test-fixtures + - run: pytest tests/ --tb=short + build: + needs: tests runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 # setuptools_scm needs full history + tags - uses: actions/setup-python@v5 with: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..e3f0b37 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,32 @@ +name: Tests + +on: + push: + branches: [main, imports] + pull_request: + workflow_dispatch: + +jobs: + pytest: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: pip + + - name: Install package + run: | + python -m pip install --upgrade pip + pip install -e '.[testing]' + + - name: Cache test fixtures + uses: actions/cache@v4 + with: + path: tests/fixtures + key: test-fixtures + + - name: Run pytest + run: pytest tests/ -v --tb=short diff --git a/.gitignore b/.gitignore index f34cfdb..1b2aa61 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ __pycache__/ +mutopia/_version.py data/ *.db *.ipynb diff --git a/encode_analysis/Makefile b/encode_analysis/Makefile index 6bef139..5cf4a95 100644 --- a/encode_analysis/Makefile +++ b/encode_analysis/Makefile @@ -140,7 +140,12 @@ studies/%/presets.02/: staged/%.train.nc staged/%.test.nc topo-model study create $@ -ds $^ -lsub 0.2 -creg 0.00025 --save-model -ee -stop 100 $$PARAMS studies/%/presets.04/: staged/%.train.nc staged/%.test.nc - @mkdir -p 
studies/$*/presets.02 + @mkdir -p studies/$*/presets.04 + PARAMS=$$(python bin/params_from_config.py $*); \ + topo-model study create $@ -ds $^ -lsub 0.2 --save-model -ee -stop 50 $$PARAMS + +studies/%/presets.05/: staged/%.train.nc staged/%.test.nc + @mkdir -p studies/$*/presets.05 + PARAMS=$$(python bin/params_from_config.py $*); \ + topo-model study create $@ -ds $^ -lsub 0.2 --save-model -ee -stop 50 $$PARAMS @@ -331,9 +336,67 @@ analyses/%/base_annotation.nc: --wait \ --wrap="python bin/base_annotation.py --model studies/$$STUDY_NAME/$$STUDY_ID/trial=$$MODEL_ID.pkl --base-gtensor gtensors/Lung-All.nc --output $@" +# Alternative base annotations: impute mutation rates onto other tumor types' epigenomes. +# Add new bases to ALT_BASES to extend coverage; each generates +# analyses/%/base_annotations/<BASE>.nc via the shared recipe below. +ALT_BASES := Breast-All Kidney-All + +define ALT_BASE_RECIPE + @mkdir -p $(dir $@) + @FULL_PATH="$*"; \ + STUDY_NAME=$$(echo $$FULL_PATH | cut -d'/' -f1); \ + STUDY_ID=$$(echo $$FULL_PATH | cut -d'/' -f2); \ + MODEL_ID=$$(echo $$FULL_PATH | cut -d'/' -f3); \ + BASE=$(notdir $(basename $@)); \ + sbatch \ + --mem=5G \ + --cpus-per-task=5 \ + --ntasks=1 \ + --time=30:00 \ + --job-name=$@ \ + --output=%x.log \ + --partition=short,park \ + --account=park \ + --wait \ + --wrap="python bin/base_annotation.py \ + --model studies/$$STUDY_NAME/$$STUDY_ID/trial=$$MODEL_ID.pkl \ + --base-gtensor gtensors/$$BASE.nc \ + --output $@" +endef + +analyses/%/base_annotations/Breast-All.nc: + $(ALT_BASE_RECIPE) + +analyses/%/base_annotations/Kidney-All.nc: + $(ALT_BASE_RECIPE) + joint_summary.tar.gz: tar -cvf $@ $$(for model in $$(cat models.txt); do echo "analyses/$$model/annotated.shap.nc" "analyses/$$model/base_annotation.nc analyses/$$model/analysis.ipynb"; done) meta_analysis.ipynb models.txt joint_summary.doga.tar.gz: tar -cvf $@ $$(for model in $$(cat models.txt); do echo "analyses/$$model/base_annotation.nc*.gz analyses/$$model/annotated.shap.nc"; 
done) analyses/joint_collection/* +annotated.tar.gz: + tar -cvf $@ $$(for model in $$(cat models.txt); do echo "analyses/$$model/annotated.nc"; done) + +# Packages Lung-All base annotation (original), two alt-base annotations (Breast-All, Kidney-All), +# and annotated.nc (same-tumor imputation — already contains component_distributions via annot_data). +multi_base_annotations.tar.gz: + tar -cvf $@ $$(for model in $$(cat models.txt); do \ + echo "analyses/$$model/base_annotation.nc"; \ + echo "analyses/$$model/base_annotations/Breast-All.nc"; \ + echo "analyses/$$model/base_annotations/Kidney-All.nc"; \ + echo "analyses/$$model/annotated.nc"; \ + done) + +# a model has the format "tumor_type/preset.04/model_id", and we need to add "studies/tumor_type/presets.04/trial=model_id.pkl" to the tarball for each model in models.txt. +# let's also rename it to "tumor_type.model.pkl" in the tarball to avoid having a deep directory structure in the tarball. +models.tar.gz: + mkdir -p temp_models && \ + for model in $$(cat models.txt); do \ + STUDY_NAME=$$(echo $$model | cut -d'/' -f1); \ + MODEL_ID=$$(echo $$model | cut -d'/' -f3); \ + cp studies/$$STUDY_NAME/presets.04/trial=$$MODEL_ID.pkl temp_models/$$STUDY_NAME.model.pkl; \ + done && \ + tar -cvf $@ -C temp_models . 
&& \ + rm -rf temp_models diff --git a/encode_analysis/base_config.yaml b/encode_analysis/base_config.yaml index 20db25f..d0990fb 100644 --- a/encode_analysis/base_config.yaml +++ b/encode_analysis/base_config.yaml @@ -3,7 +3,7 @@ min_region_size: 100 region_size: 10000 genome: - fasta: /Users/allen/genomes/hg38/hg38.fa + fasta: /n/data1/hms/dbmi/park/SOFTWARE/REFERENCE/hg38/cgap_matches/Homo_sapiens_assembly38.fa chromsizes: annotations/hg38.mainchroms.sizes mutation_rate_file: annotations/mutation-rate.hg38.bedgraph.gz blacklist: annotations/blacklist_method12_v1_comb_sort_merged.bed.gz @@ -16,4 +16,4 @@ pipeline_params: repeat_masker_fraction: annotations/hg38.repeat_masker_fraction.bed.gz sample_params: - cluster: false \ No newline at end of file + cluster: true \ No newline at end of file diff --git a/mutopia/__init__.py b/mutopia/__init__.py index 84ce141..a9b0eea 100644 --- a/mutopia/__init__.py +++ b/mutopia/__init__.py @@ -1 +1,6 @@ -__version__ = "1.0.7" \ No newline at end of file +from importlib.metadata import version, PackageNotFoundError + +try: + __version__ = version("mutopia") +except PackageNotFoundError: + __version__ = "0.0.0+unknown" diff --git a/mutopia/cli/model_cli.py b/mutopia/cli/model_cli.py index f377284..811c2a1 100644 --- a/mutopia/cli/model_cli.py +++ b/mutopia/cli/model_cli.py @@ -841,7 +841,57 @@ def annot( annot(model, dataset, output, region=region, threads=threads, calc_shap=calc_shap, celltype=celltype) -@model.command("predict", short_help="Estimate sample contributions via SVI") +@model.command("score", short_help="Score model on held-out loci") +@click.argument("model", type=click.Path(exists=True), metavar="MODEL_FILE") +@click.argument("dataset", type=click.Path(exists=True), metavar="DATASET_FILE") +@click.option( + "--test-chrom", + type=str, + multiple=True, + default=("chr2",), + help="Chromosome(s) to hold out for testing (default: chr2)", +) +@click.option( + "-@", + "--threads", + type=click.IntRange(1, 1000), + 
default=1, + help="Number of parallel threads", +) +def score( + model: str, + dataset: str, + test_chrom: tuple, + threads: int = 1, +): + """ + Score a trained model on a dataset using cross-validation by locus. + + Fits per-sample local variables on non-held-out chromosomes, then + evaluates reconstruction quality (pseudo-R²) on the held-out chromosomes. + + Examples: + # Score with default held-out chromosome (chr2) + model score trained_model.pkl data.nc + + # Hold out multiple chromosomes + model score trained_model.pkl data.nc --test-chrom chr1 --test-chrom chr2 + + # Score with parallel threads + model score trained_model.pkl data.nc --threads 8 + """ + from .model_core import score_model + + result = score_model( + model_path=model, + dataset_path=dataset, + test_chroms=test_chrom, + threads=threads, + ) + click.echo(f"{result:.6f}") + + +@model.command("add-model-state", short_help="Add model state to dataset") @click.argument("model", type=click.Path(exists=True), metavar="MODEL_FILE") @click.argument("dataset", type=click.Path(exists=True), metavar="DATASET_FILE") @click.argument("output", type=click.Path(writable=True), metavar="OUTPUT_FILE") diff --git a/mutopia/cli/model_core.py b/mutopia/cli/model_core.py index 5a0f31f..2eef2b7 100644 --- a/mutopia/cli/model_core.py +++ b/mutopia/cli/model_core.py @@ -168,6 +168,20 @@ def add_model_state(model_path: str, dataset_path: str, output_path: str): disk.write_dataset(dataset, output_path) +def score_model( + model_path: str, + dataset_path: str, + test_chroms: tuple = ("chr2",), + threads: int = 1, +) -> float: + import mutopia.analysis as mu + + model = mu.load_model(model_path) + dataset = gt.eager_load(dataset_path) + + return model.score(dataset, test_chroms=test_chroms, threads=threads) + + def simulate_from_model( model_path: str, dataset_path: str, diff --git a/mutopia/cli/pipeline_tasks.py b/mutopia/cli/pipeline_tasks.py index 108bbdc..8aeb43f 100644 --- a/mutopia/cli/pipeline_tasks.py +++ 
b/mutopia/cli/pipeline_tasks.py @@ -15,7 +15,7 @@ list_samples, ) from .pipeline_config import GTensorConfig, ProcessingConfig, FeatureConfig -from urllib.parse import urlparse +from urllib.parse import urlparse, parse_qs, unquote import shutil from mutopia.cli.gensor_core import ( create_gtensor, @@ -94,6 +94,16 @@ def download_path(self): """Determine the local path to download the file to.""" parsed_url = urlparse(self.url) filename = os.path.basename(parsed_url.path) + # For URLs with query-parameter filenames (e.g. GEO/NCBI), extract + # the filename from the 'file' query param if the path yields nothing. + if not filename or filename == "download" or filename == "download/": + query_file = parse_qs(parsed_url.query).get("file", [None])[0] + if query_file: + filename = unquote(query_file) + if not filename: + # Last resort: hash the URL for a stable unique name + import hashlib + filename = hashlib.md5(self.url.encode()).hexdigest() return os.path.join("gtensor__tempfiles/downloads", filename) def output(self): @@ -406,7 +416,8 @@ def run(self): params.pop("file", None) # Remove file as it's not needed here logger.info( f"Ingesting sample '{self.sample_id}' from file '{file_path}' " - f"into GTensor '{gtensor_path}'" + f"into GTensor '{gtensor_path}'. " + f"Clustering will be performed: {params['cluster']}. 
" ) add_sample(**params) diff --git a/mutopia/gtensor/xarr_extensions.py b/mutopia/gtensor/xarr_extensions.py index 787cf11..48340e4 100644 --- a/mutopia/gtensor/xarr_extensions.py +++ b/mutopia/gtensor/xarr_extensions.py @@ -193,7 +193,7 @@ def __call__(self, *compress_dims: typing.List[str]): @xr.register_dataarray_accessor("asdense") -class AsCSR(BaseAccessor): +class AsDense(BaseAccessor): def __call__(self): try: self._xrds.data = self._xrds.data.todense() @@ -209,13 +209,13 @@ def __call__(self): @xr.register_dataset_accessor("list_samples") -class FetchSample(BaseAccessor): +class ListSamples(BaseAccessor): def __call__(self): return self._xrds.sample.values @xr.register_dataset_accessor("mutate") -class FetchSample(BaseAccessor): +class Mutate(BaseAccessor): def __call__(self, fn): return fn(self._xrds) @@ -228,7 +228,7 @@ def __call__(self, sample_name): @xr.register_dataset_accessor("iter_samples") -class FetchSample(BaseAccessor): +class IterSamples(BaseAccessor): def __call__(self, subset=None): load_samples = subset or self._xrds.list_samples() for sample_name in load_samples: diff --git a/mutopia/model/base.py b/mutopia/model/base.py index 88d7120..130475e 100644 --- a/mutopia/model/base.py +++ b/mutopia/model/base.py @@ -420,7 +420,6 @@ def init_model(self, train_datasets: Sequence[GTensorDataset]) -> TopographyMode return self - def fit( self, train_datasets: Union[GTensorDataset, Sequence[GTensorDataset]], @@ -557,6 +556,15 @@ def setup_corpus(self, dataset: GTensorDataset, threads: int = 1) -> GTensorData logger.info("Done ...") return dataset + def __getstate__(self) -> Dict[str, Any]: + # The callback is a training-loop hook (e.g. an Optuna pruning callback) + # that frequently closes over the training datasets. It has no role in + # the trained model's state and must not pin training data into the + # pickle. 
+ state = self.__dict__.copy() + state["callback"] = None + return state + def save(self, path: str) -> None: """ Save the model to a file. @@ -576,6 +584,65 @@ def save(self, path: str) -> None: dump(self, path) + def score( + self, + dataset: "GTensorDataset", + test_chroms: Sequence[str] = ("chr2",), + threads: int = 1, + ) -> float: + """ + Score the model on a dataset using cross-validation by locus. + + Splits the dataset by chromosome into train and test partitions, + fits local variables (per-sample component contributions) on the + train loci, then evaluates reconstruction quality on the held-out + test loci using those fitted locals. + + Parameters + ---------- + dataset : GTensorDataset + The dataset to score. + test_chroms : sequence of str, default=("chr2",) + Chromosomes to hold out for testing. + threads : int, default=1 + Number of parallel threads. + + Returns + ------- + float + Pseudo-R² score (1 - deviance_fit / deviance_null) on test loci. + """ + self._check_corpus(dataset) + + train_loci, test_loci = train_test_split(dataset, *test_chroms) + + train_loci = self.setup_corpus(train_loci, threads=threads) + test_loci = self.setup_corpus(test_loci, threads=threads) + + # Fit local variables on train loci + sample_names = list(train_loci.list_samples()) + contributions = self.locals_model_.predict( + train_loci, self.factor_model_, threads=threads + ) + sample_idx = {name: i for i, name in enumerate(sample_names)} + + # Build exposures function from fitted contributions + def exposures_fn(dataset, sample_name): + return ( + contributions.isel(sample=sample_idx[sample_name]) + .transpose(..., "component") + .data.ravel() + ) + + # Score on test loci using train-fitted locals + with ParContext(threads) as par: + return self.locals_model_.score( + self.factor_model_, + [test_loci], + exposures_fn=exposures_fn, + par_context=par, + ) + def annot_components(self, dataset: GTensorDataset, normalization: str = "global") -> GTensorDataset: spectra = 
xr.concat( diff --git a/mutopia/model/model/model_components/theta_model.py b/mutopia/model/model/model_components/theta_model.py index 1c46a83..92f1b23 100644 --- a/mutopia/model/model/model_components/theta_model.py +++ b/mutopia/model/model/model_components/theta_model.py @@ -25,6 +25,8 @@ StratifiedTransformer, ) +def _get_corpus_dim(corpus): + return CS.get_dims(corpus)["locus"] class ThetaModel(RateModel, SparseDataBase, DenseDataBase): @@ -177,7 +179,7 @@ def _get_intercept_idx(self, *corpuses): return get_corpus_intercepts( corpuses, self.corpus_encoder_, - n_repeats=lambda corpus: CS.get_dims(corpus)["locus"], + n_repeats=_get_corpus_dim, ) def _get_intercept_design(self, *corpuses): @@ -250,12 +252,16 @@ def reduce_dense_sstats(self, sstats, corpus, *, weighted_posterior, **kw): def format_component(self, k, normalization="none"): return 0.0 + +def _feature_name_combiner(feature_name, category): + return f"{feature_name}:{category}" + class LinearThetaModel(ThetaModel): categorical_encoder = partial(OneHotEncoder, sparse_output=False, drop="first", - feature_name_combiner=lambda fname, cat: f"{fname}:{cat}") + feature_name_combiner=_feature_name_combiner) def __init__(self, corpuses, l2_regularization=0.1, **kw): super().__init__( diff --git a/mutopia/model/model/optim.py b/mutopia/model/model/optim.py index 8afe858..1572915 100644 --- a/mutopia/model/model/optim.py +++ b/mutopia/model/model/optim.py @@ -38,7 +38,7 @@ def VI_step( offsets, normalizers = factor_model.get_exp_offsets_dict(**args) """ - In the previous "offsets" step, new normalizers were calculated. + In the previous "offsets" step, new normalizers were calculated. Now we need to transer the normalizers to the full data set. 
""" factor_model.update_normalizers(datasets, normalizers) @@ -381,7 +381,9 @@ def fit_model( with ParContext(threads, verbose) as par: - # factor_model.init_normalizers(train_datasets, par_context=par) + # STABILITY: seed the normalizers from the full training data so early + # SVI epochs don't start from zero while theta predictions are non-trivial. + factor_model.init_normalizers(train_datasets, par_context=par) try: progress_bar = trange( diff --git a/mutopia/tuning.py b/mutopia/tuning.py index ec74ae6..ed145db 100644 --- a/mutopia/tuning.py +++ b/mutopia/tuning.py @@ -235,7 +235,7 @@ def load_study_data(study, lazy=False): return train, test -def _model_report_callback(trial, factor_model, epoch, test_scores,*, data): +def _model_report_callback(trial, factor_model, epoch, test_scores): """ Callback function for reporting trial progress to Optuna. @@ -265,13 +265,6 @@ def _model_report_callback(trial, factor_model, epoch, test_scores,*, data): if trial.should_prune(): raise optuna.TrialPruned() - #locals = factor_model.GT.fetch_locals(data) - #print("EPOCH ", epoch, ": VARIANCE=", var(locals.values)) - #path = os.path.join("generated/studies/healthy.length.full.03", f"trial-{trial.number}_locals", f"epoch-{epoch}.csv") - #os.makedirs(os.path.dirname(path), exist_ok=True) - #print(f"Saving locals to {path}") - #locals.to_dataframe().to_csv(path) - def _get_save_model_fn(study): """ @@ -378,7 +371,7 @@ def _objective( + "\n\t".join([f"{key}: {value}" for key, value in params.items()]) ) - callback = partial(_model_report_callback, trial, data=train[0]) + callback = partial(_model_report_callback, trial) model.set_params( **params, diff --git a/pyproject.toml b/pyproject.toml index 7fd26b9..eeea733 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,6 @@ [build-system] -requires = ["setuptools"] -build-backend = "setuptools.build_meta" \ No newline at end of file +requires = ["setuptools>=64", "setuptools_scm>=8"] +build-backend = "setuptools.build_meta" 
+ +[tool.setuptools_scm] +version_file = "mutopia/_version.py" diff --git a/setup.cfg b/setup.cfg index c2274e6..9c086cf 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,5 @@ [metadata] name = mutopia -version = attr: mutopia.__version__ author = Allen Lynch author_email = allenlynch@g.harvard.edu license = BSD-3-Clause @@ -24,6 +23,7 @@ install_requires = optuna>=3.4.0,<5 pandas>=2.1.0 sparse>=0.15.4 + joblib>=1.2 numba>=0.60.0,<1 netCDF4>=1.7.1.post2,<2 pyfaidx diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/build_chr22_fixture.py b/tests/build_chr22_fixture.py new file mode 100644 index 0000000..f802368 --- /dev/null +++ b/tests/build_chr22_fixture.py @@ -0,0 +1,96 @@ +""" +One-shot script to produce the small fixture artifacts used by the test suite. + +Slices the full Liver tutorial gtensor down to chr22 (train) + chr1 (test), +keeps a small sample subset, and copies the existing tutorial-trained model +as the inference baseline. Outputs land in tests/fixtures/ and are intended +to be uploaded to a GitHub release; CI downloads them via tests/conftest.py. + +Run from repo root: + + python tests/build_chr22_fixture.py + +Re-run only when the input gtensors or the schema changes. +""" + +from __future__ import annotations + +import argparse +import shutil +from pathlib import Path + +import mutopia.analysis as mu + +REPO = Path(__file__).resolve().parents[1] +TUTORIAL = REPO / "tutorials" / "tutorial_data" +OUT = REPO / "tests" / "fixtures" + +SOURCE_TRAIN = TUTORIAL / "Liver.train.nc" +SOURCE_TEST = TUTORIAL / "Liver.test.nc" +SOURCE_MODEL = TUTORIAL / "trained_model.pkl" + +TRAIN_REGION = "chr22" +# A 50 Mb slice of chr1 to keep the held-out fixture small (~5 MB instead of ~35 MB). 
+TEST_REGION = "chr1:0-50000000" +TRAIN_LABEL = "chr22" +TEST_LABEL = "chr1_50mb" +N_SAMPLES = 10 + + +def build(n_samples: int = N_SAMPLES) -> None: + OUT.mkdir(parents=True, exist_ok=True) + + for src in (SOURCE_TRAIN, SOURCE_TEST, SOURCE_MODEL): + if not src.exists(): + raise FileNotFoundError( + f"Missing source: {src}. Fetch tutorial_data first (see tutorials/)." + ) + + print(f"[build] loading {SOURCE_TRAIN.name}") + train_full = mu.gt.eager_load(str(SOURCE_TRAIN)) + + sample_names = list(train_full.list_samples())[:n_samples] + print(f"[build] keeping {len(sample_names)} of {len(list(train_full.list_samples()))} samples") + + print(f"[build] slicing train to {TRAIN_REGION}") + train_chr = mu.gt.slice_regions(train_full, TRAIN_REGION) + train_chr = mu.gt.slice_samples(train_chr, sample_names) + + train_out = OUT / f"liver.{TRAIN_LABEL}.train.nc" + print(f"[build] writing {train_out}") + mu.gt.write_dataset(train_chr, str(train_out), write_samples=True) + + print(f"[build] loading {SOURCE_TEST.name}") + test_full = mu.gt.eager_load(str(SOURCE_TEST)) + print(f"[build] slicing test to {TEST_REGION}") + test_chr = mu.gt.slice_regions(test_full, TEST_REGION) + test_chr = mu.gt.slice_samples(test_chr, sample_names) + + test_out = OUT / f"liver.{TEST_LABEL}.test.nc" + print(f"[build] writing {test_out}") + mu.gt.write_dataset(test_chr, str(test_out), write_samples=True) + + model_out = OUT / "liver.trained_model.pkl" + print(f"[build] copying {SOURCE_MODEL.name} -> {model_out.name}") + shutil.copy(SOURCE_MODEL, model_out) + + print() + print("[build] done. fixtures:") + for p in sorted(OUT.glob("*")): + print(f" {p.relative_to(REPO)} ({p.stat().st_size / 1e6:.2f} MB)") + print() + print("[build] upload these to a release on sigscape/MuTopia tagged") + print(" e.g. 
test-fixtures-v1, then update FIXTURE_RELEASE_TAG in") + print(" tests/conftest.py and the cache key in .github/workflows/tests.yml.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--n-samples", + type=int, + default=N_SAMPLES, + help=f"Number of samples to retain (default: {N_SAMPLES})", + ) + args = parser.parse_args() + build(n_samples=args.n_samples) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..c796554 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,158 @@ +""" +Shared pytest fixtures. + +Test data lives outside the repo (in a GitHub release) so the tree stays small. +On first use, the conftest downloads each fixture into tests/fixtures/ and caches +it. To regenerate the fixtures from scratch, see tests/build_chr22_fixture.py. +""" + +from __future__ import annotations + +import os +import urllib.request +from pathlib import Path + +import pytest + + +def pytest_addoption(parser): + parser.addoption( + "--runslow", + action="store_true", + default=False, + help="Run tests marked @pytest.mark.slow (training, convergence).", + ) + + +def pytest_configure(config): + config.addinivalue_line( + "markers", "slow: marks tests that take >30s (deselect with default pytest run)" + ) + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--runslow"): + return + skip_slow = pytest.mark.skip(reason="needs --runslow option to run") + for item in items: + if "slow" in item.keywords: + item.add_marker(skip_slow) + + +FIXTURE_DIR = Path(__file__).parent / "fixtures" + +# Update both the tag and the asset list when fixtures are rebuilt. 
+FIXTURE_RELEASE_TAG = "test-fixtures" +FIXTURE_BASE_URL = ( + f"https://github.com/sigscape/MuTopia/releases/download/{FIXTURE_RELEASE_TAG}" +) +FIXTURE_FILES = ( + "liver.chr22.train.nc", + "liver.chr1_50mb.test.nc", + "liver.trained_model.pkl", +) + + +def _ensure_fixture(name: str) -> Path: + path = FIXTURE_DIR / name + if path.exists(): + return path + + FIXTURE_DIR.mkdir(parents=True, exist_ok=True) + url = f"{FIXTURE_BASE_URL}/{name}" + print(f"[fixtures] downloading {name} from {url}") + try: + urllib.request.urlretrieve(url, path) + except Exception as e: + if path.exists(): + path.unlink() + pytest.skip( + f"Could not download fixture {name} from {url} ({e}). " + f"Either run tests/build_chr22_fixture.py locally or upload " + f"the fixtures to the {FIXTURE_RELEASE_TAG} release." + ) + return path + + +@pytest.fixture(scope="session") +def fixture_dir() -> Path: + return FIXTURE_DIR + + +@pytest.fixture(scope="session") +def train_nc_path() -> Path: + return _ensure_fixture("liver.chr22.train.nc") + + +@pytest.fixture(scope="session") +def test_nc_path() -> Path: + return _ensure_fixture("liver.chr1_50mb.test.nc") + + +@pytest.fixture(scope="session") +def model_pkl_path() -> Path: + return _ensure_fixture("liver.trained_model.pkl") + + +@pytest.fixture(scope="session") +def trained_model(model_pkl_path: Path): + import mutopia.analysis as mu + + return mu.load_model(str(model_pkl_path)) + + +@pytest.fixture(scope="session") +def train_dataset(train_nc_path: Path): + import mutopia.analysis as mu + + return mu.gt.eager_load(str(train_nc_path)) + + +@pytest.fixture(scope="session") +def test_dataset(test_nc_path: Path): + import mutopia.analysis as mu + + return mu.gt.eager_load(str(test_nc_path)) + + +# --------------------------------------------------------------------------- +# Full-scale (Zenodo) fixtures — only requested by slow-tier tests. 
+# --------------------------------------------------------------------------- + +ZENODO_RECORD = "18803136" +ZENODO_BASE_URL = f"https://zenodo.org/records/{ZENODO_RECORD}/files" +ZENODO_CACHE_DIR = FIXTURE_DIR / "zenodo" +ZENODO_TUMOR_TYPE = "Liver-HCC" + + +def _ensure_zenodo_asset(name: str) -> Path: + path = ZENODO_CACHE_DIR / name + if path.exists(): + return path + + ZENODO_CACHE_DIR.mkdir(parents=True, exist_ok=True) + url = f"{ZENODO_BASE_URL}/{name}" + print(f"[fixtures] downloading {name} from Zenodo (large; first run only)") + try: + urllib.request.urlretrieve(url, path) + except Exception as e: + if path.exists(): + path.unlink() + pytest.skip( + f"Could not download Zenodo asset {name} from {url} ({e}). " + f"Slow-tier tests require network access to zenodo.org." + ) + return path + + +@pytest.fixture(scope="session") +def zenodo_liver_path() -> Path: + return _ensure_zenodo_asset(f"{ZENODO_TUMOR_TYPE}.nc") + + +@pytest.fixture(scope="session") +def zenodo_liver_split(zenodo_liver_path: Path): + """Train/test split of the full Liver-HCC gtensor by holding out chr1.""" + from mutopia.gtensor import lazy_train_test_load + + return lazy_train_test_load(str(zenodo_liver_path), "chr1") diff --git a/tests/fixtures/.gitignore b/tests/fixtures/.gitignore new file mode 100644 index 0000000..2a2f0e4 --- /dev/null +++ b/tests/fixtures/.gitignore @@ -0,0 +1,3 @@ +*.nc +*.pkl +zenodo/ diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..bb3f397 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,48 @@ +""" +CLI smoke tests. + +These shell out to the installed entry points and check exit codes. +They verify packaging / command registration, not numerical behavior. 
+""" + +from __future__ import annotations + +import subprocess + + +def _run(cmd: list[str]) -> subprocess.CompletedProcess: + return subprocess.run(cmd, capture_output=True, text=True, timeout=120) + + +def test_gtensor_help(): + r = _run(["gtensor", "--help"]) + assert r.returncode == 0, r.stderr + assert "compose" in r.stdout + + +def test_topo_model_help(): + r = _run(["topo-model", "--help"]) + assert r.returncode == 0, r.stderr + assert "train" in r.stdout + assert "study" in r.stdout + + +def test_topo_model_score_help(): + r = _run(["topo-model", "score", "--help"]) + assert r.returncode == 0, r.stderr + + +def test_mutopia_sbs_help(): + r = _run(["mutopia-sbs", "--help"]) + assert r.returncode == 0, r.stderr + + +def test_gtensor_info(train_nc_path): + r = _run(["gtensor", "info", str(train_nc_path)]) + assert r.returncode == 0, r.stderr + + +def test_gtensor_feature_ls(train_nc_path): + r = _run(["gtensor", "feature", "ls", str(train_nc_path)]) + assert r.returncode == 0, r.stderr + assert "H3K27ac" in r.stdout diff --git a/tests/test_gtensor_ops.py b/tests/test_gtensor_ops.py new file mode 100644 index 0000000..8eefb91 --- /dev/null +++ b/tests/test_gtensor_ops.py @@ -0,0 +1,112 @@ +""" +Tests for gtensor data operations: slicing, splitting, feature fetching. +These exercise the API tutorials 1 and 2 rely on. 
+""" + +from __future__ import annotations + +import pytest + + +def test_lazy_load_no_samples(train_nc_path): + import mutopia.analysis as mu + + ds = mu.gt.lazy_load(str(train_nc_path)) + assert "Regions" in ds.sections.names + assert "Features" in ds.sections.names + + +def test_eager_load_has_samples(train_dataset): + samples = list(train_dataset.list_samples()) + assert len(samples) == 10 + + +def test_slice_regions_by_chrom(train_dataset): + import mutopia.analysis as mu + + sliced = mu.gt.slice_regions(train_dataset, "chr22") + chroms = set(sliced.sections["Regions"].chrom.values.tolist()) + assert chroms == {"chr22"} + + +def test_slice_regions_by_interval(train_dataset): + import mutopia.analysis as mu + + sliced = mu.gt.slice_regions(train_dataset, "chr22:0-25000000") + starts = sliced.sections["Regions"].start.values + ends = sliced.sections["Regions"].end.values + assert (starts >= 0).all() + assert (ends <= 25_000_000).all() + assert len(starts) > 0 + + +def test_slice_samples_subset(train_dataset): + import mutopia.analysis as mu + + samples = list(train_dataset.list_samples())[:3] + sliced = mu.gt.slice_samples(train_dataset, samples) + assert list(sliced.list_samples()) == samples + + +def test_slice_samples_unknown_raises(train_dataset): + import mutopia.analysis as mu + + with pytest.raises((KeyError, Exception)): + mu.gt.slice_samples(train_dataset, ["__not_a_sample__"]) + + +def test_fetch_features_glob(train_dataset): + import mutopia.analysis as mu + + feats = mu.gt.fetch_features(train_dataset, "H3K*") + # H3K27ac, H3K27me3, H3K36me3, H3K4me1, H3K4me3 in our fixture + assert "feature" in feats.dims + assert feats.sizes["feature"] >= 1 + + +def test_train_test_split_separates_chroms(train_dataset): + """train_test_split should partition loci by chromosome with no overlap.""" + from mutopia.gtensor import train_test_split + + # train fixture is chr22-only; split into chr22 (held out) vs nothing + # Use a region split instead by holding out 
chr22:0-25M from a combined region + # but our fixture is chr22 only. Make a synthetic split via region slicing first. + import mutopia.analysis as mu + + left = mu.gt.slice_regions(train_dataset, "chr22:0-25000000") + right = mu.gt.slice_regions(train_dataset, "chr22:25000001-100000000") + assert left.sizes["locus"] > 0 + assert right.sizes["locus"] > 0 + assert left.sizes["locus"] + right.sizes["locus"] <= train_dataset.sizes["locus"] + + +def test_write_then_load_roundtrip(train_dataset, tmp_path): + import mutopia.analysis as mu + + out = tmp_path / "roundtrip.nc" + mu.gt.write_dataset(train_dataset, str(out), write_samples=True) + reloaded = mu.gt.eager_load(str(out)) + assert dict(reloaded.sizes) == dict(train_dataset.sizes) + assert list(reloaded.list_samples()) == list(train_dataset.list_samples()) + + +def test_write_without_samples(train_dataset, tmp_path): + """write_samples=False should still produce a loadable gtensor with intact regions/features.""" + import mutopia.analysis as mu + + out = tmp_path / "no_samples.nc" + mu.gt.write_dataset(train_dataset, str(out), write_samples=False) + reloaded = mu.gt.lazy_load(str(out)) + assert reloaded.sizes["locus"] == train_dataset.sizes["locus"] + assert "Regions" in reloaded.sections.names + assert "Features" in reloaded.sections.names + + +def test_lazy_load_after_eager_write(train_dataset, tmp_path): + """A gtensor written with samples must be discoverable via lazy_load.""" + import mutopia.analysis as mu + + out = tmp_path / "rt.nc" + mu.gt.write_dataset(train_dataset, str(out), write_samples=True) + lazy = mu.gt.lazy_load(str(out)) + assert len(list(lazy.list_samples())) == len(list(train_dataset.list_samples())) diff --git a/tests/test_inference.py b/tests/test_inference.py new file mode 100644 index 0000000..932fc9a --- /dev/null +++ b/tests/test_inference.py @@ -0,0 +1,124 @@ +""" +Inference-path regression tests. 
+ +These pin the behavior of a frozen model + frozen fixture: outputs that change +in shape, dtype, sign, or magnitude flag a regression. Numerical tolerances are +loose so legitimate BLAS/threading drift doesn't flake the suite. +""" + +from __future__ import annotations + +import numpy as np +import pytest + + +EXPECTED_ANNOT_VARS = { + "contributions", + "Spectra/spectra", + "Spectra/interactions", + "Spectra/shared_effects", + "component_distributions", + "component_distributions_locus", + "predicted_marginal", + "predicted_marginal_locus", + "empirical_marginal", + "empirical_marginal_locus", +} + + +def test_model_loads(trained_model): + assert type(trained_model).__name__ == "SBSModel" + assert trained_model.num_components == 15 + + +def test_train_fixture_shape(train_dataset): + sizes = dict(train_dataset.sizes) + assert sizes["sample"] == 10 + assert sizes["context"] == 96 + assert sizes["configuration"] == 2 + assert sizes["locus"] > 0 + + +def test_test_fixture_shape(test_dataset): + sizes = dict(test_dataset.sizes) + assert sizes["sample"] == 10 + assert sizes["locus"] > 0 + + +def test_annot_data_produces_expected_vars(trained_model, train_dataset): + annotated = trained_model.annot_data(train_dataset, threads=1, calc_shap=False) + missing = EXPECTED_ANNOT_VARS - set(annotated.data_vars) + assert not missing, f"annot_data missing vars: {missing}" + + +def test_contributions_are_finite(trained_model, train_dataset): + annotated = trained_model.annot_data(train_dataset, threads=1, calc_shap=False) + contrib = annotated["contributions"].values + assert np.all(np.isfinite(contrib)), "non-finite values in contributions" + assert (contrib >= 0).all(), "contributions should be non-negative" + + +def test_contributions_shape_matches_components(trained_model, train_dataset): + annotated = trained_model.annot_data(train_dataset, threads=1, calc_shap=False) + contrib = annotated["contributions"] + assert "sample" in contrib.dims + assert "component" in 
contrib.dims
+    assert contrib.sizes["component"] == trained_model.num_components
+
+
+def test_predicted_marginal_finite(trained_model, train_dataset):
+    annotated = trained_model.annot_data(train_dataset, threads=1, calc_shap=False)
+    pred = annotated["predicted_marginal"].values
+    assert np.all(np.isfinite(pred))
+
+
+def test_component_distributions_finite_and_nonneg(trained_model, train_dataset):
+    annotated = trained_model.annot_data(train_dataset, threads=1, calc_shap=False)
+    cd = annotated["component_distributions"].values
+    finite = cd[np.isfinite(cd)]
+    assert finite.size > 0
+    assert (finite >= 0).all()
+
+
+def test_model_save_load_roundtrip(trained_model, tmp_path):
+    """Saving then loading should preserve the model's class and component count."""
+    import mutopia.analysis as mu
+
+    out = tmp_path / "roundtrip.pkl"
+    trained_model.save(str(out))
+    reloaded = mu.load_model(str(out))
+    assert reloaded.num_components == trained_model.num_components
+    assert type(reloaded).__name__ == type(trained_model).__name__
+
+
+def test_save_load_predictions_match(trained_model, train_dataset, tmp_path):
+    """A reloaded model must produce numerically equivalent predictions."""
+    import mutopia.analysis as mu
+
+    out = tmp_path / "rt.pkl"
+    trained_model.save(str(out))
+    reloaded = mu.load_model(str(out))
+
+    a = trained_model.annot_data(train_dataset, threads=1, calc_shap=False)
+    b = reloaded.annot_data(train_dataset, threads=1, calc_shap=False)
+    np.testing.assert_allclose(
+        a["contributions"].values, b["contributions"].values, rtol=1e-5, atol=1e-7
+    )
+    np.testing.assert_allclose(
+        a["predicted_marginal"].values,
+        b["predicted_marginal"].values,
+        rtol=1e-5,
+        atol=1e-7,
+    )
+
+
+def test_callback_dropped_from_pickle(trained_model, tmp_path):
+    """__getstate__ must drop the training callback so closures over training data
+    don't get pickled (regression for c73656e)."""
+    import mutopia.analysis as mu
+
+    trained_model.callback = lambda *a, **kw: None  # 
simulate a trained-with-callback model + out = tmp_path / "no_callback.pkl" + trained_model.save(str(out)) + reloaded = mu.load_model(str(out)) + assert reloaded.callback is None diff --git a/tests/test_plotting.py b/tests/test_plotting.py new file mode 100644 index 0000000..ec9d4ee --- /dev/null +++ b/tests/test_plotting.py @@ -0,0 +1,48 @@ +""" +Plotting smoke tests. + +We don't pixel-snapshot. We just check that plot calls return without error +and produce a Figure with sensible structure. Visual review stays manual. +""" + +from __future__ import annotations + +import pytest + +matplotlib = pytest.importorskip("matplotlib") +matplotlib.use("Agg") +import matplotlib.pyplot as plt # noqa: E402 + + +@pytest.fixture +def annotated_dataset(trained_model, train_dataset): + """Annotated dataset usable as input to plotting helpers.""" + return trained_model.annot_data(train_dataset, threads=1, calc_shap=False) + + +def teardown_function(_): + plt.close("all") + + +def test_plot_signature_panel(annotated_dataset): + import mutopia.analysis as mu + + fig = mu.pl.plot_signature_panel(annotated_dataset) + assert fig is not None or plt.gcf() is not None + + +def test_plot_component(annotated_dataset): + import mutopia.analysis as mu + + components = mu.gt.list_components(annotated_dataset) + assert components, "no components on annotated dataset" + mu.pl.plot_component(annotated_dataset, components[0]) + assert plt.gcf().axes, "expected at least one axes" + + +def test_plot_shap_summary_skips_without_shap(annotated_dataset): + """plot_shap_summary requires SHAP values; verify it errors clearly when absent.""" + import mutopia.analysis as mu + + with pytest.raises(Exception): + mu.pl.plot_shap_summary(annotated_dataset) diff --git a/tests/test_training.py b/tests/test_training.py new file mode 100644 index 0000000..69ca664 --- /dev/null +++ b/tests/test_training.py @@ -0,0 +1,130 @@ +""" +Training-path tests against the full-scale Liver-HCC gtensor from Zenodo. 
+ +These are slow (each fits a real model end-to-end) and gated behind +--runslow so they don't bloat the default suite or CI. To run: + + pytest tests/test_training.py --runslow + +The Zenodo asset (~hundreds of MB) is downloaded once into +tests/fixtures/zenodo/ and cached. Use these to sanity-check optimization +changes before tagging a release. +""" + +from __future__ import annotations + +import math + +import numpy as np +import pytest + +pytestmark = pytest.mark.slow + + +def _make_model(train_dataset, **overrides): + import mutopia.analysis as mu + + ModelCls = mu.make_model_cls(train_dataset) + kwargs = dict( + num_components=5, + seed=0, + threads=1, + eval_every=1, + ) + kwargs.update(overrides) + return ModelCls(**kwargs) + + +def test_batch_training_completes_without_nan(zenodo_liver_split): + """Full-batch training (no subsampling) on the production-scale gtensor.""" + train, test = zenodo_liver_split + + model = _make_model( + train, + locus_subsample=None, + batch_subsample=None, + num_epochs=20, + ) + model.fit(train, test) + + assert model.test_scores_, "test_scores_ should be populated" + assert all(math.isfinite(s) for s in model.test_scores_), ( + f"non-finite test scores: {model.test_scores_}" + ) + + +def test_batch_training_test_scores_nondecreasing(zenodo_liver_split): + """In full-batch mode on real data, the test score should be non-decreasing + modulo small numerical noise. A real decrease flags an optimizer regression.""" + train, test = zenodo_liver_split + + model = _make_model( + train, + locus_subsample=None, + batch_subsample=None, + num_epochs=20, + ) + model.fit(train, test) + + scores = np.asarray(model.test_scores_) + diffs = np.diff(scores) + # Allow tiny float noise; any meaningful decrease is a regression. 
+    assert (diffs >= -1e-3).all(), (
+        f"test scores decreased between epochs: {scores.tolist()}"
+    )
+
+
+def test_svi_training_completes_without_nan(zenodo_liver_split):
+    """SVI training (with subsampling) at production scale."""
+    train, test = zenodo_liver_split
+
+    model = _make_model(
+        train,
+        locus_subsample=1 / 8,
+        num_epochs=20,
+    )
+    model.fit(train, test)
+
+    assert model.test_scores_
+    assert all(math.isfinite(s) for s in model.test_scores_), (
+        f"non-finite test scores: {model.test_scores_}"
+    )
+
+
+def test_trained_model_save_load_roundtrip(zenodo_liver_split, tmp_path):
+    """A freshly trained model must save and reload to numerically equivalent
+    predictions (within assert_allclose tolerance)."""
+    import mutopia.analysis as mu
+
+    train, test = zenodo_liver_split
+
+    model = _make_model(
+        train,
+        locus_subsample=1 / 8,
+        num_epochs=5,
+    )
+    model.fit(train, test)
+
+    out = tmp_path / "fresh.pkl"
+    model.save(str(out))
+    reloaded = mu.load_model(str(out))
+
+    a = model.annot_data(train, threads=1, calc_shap=False)
+    b = reloaded.annot_data(train, threads=1, calc_shap=False)
+    np.testing.assert_allclose(
+        a["contributions"].values, b["contributions"].values, rtol=1e-5, atol=1e-7
+    )
+
+
+def test_init_components_with_cosmic_names(zenodo_liver_split):
+    """The tutorial-recommended init_components path should fit without crashing."""
+    train, test = zenodo_liver_split
+
+    model = _make_model(
+        train,
+        num_components=2,
+        init_components=["SBS1", "SBS3"],
+        locus_subsample=1 / 8,
+        num_epochs=5,
+    )
+    model.fit(train, test)
+    assert model.test_scores_