18 changes: 18 additions & 0 deletions .github/workflows/publish.yml
@@ -17,10 +17,28 @@ on:
workflow_dispatch:

jobs:
tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: pip
- run: pip install -e '.[testing]'
- uses: actions/cache@v4
with:
path: tests/fixtures
key: test-fixtures
- run: pytest tests/ --tb=short

build:
needs: tests
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # setuptools_scm needs full history + tags

- uses: actions/setup-python@v5
with:
32 changes: 32 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,32 @@
name: Tests

on:
push:
branches: [main, imports]
pull_request:
workflow_dispatch:

jobs:
pytest:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: pip

- name: Install package
run: |
python -m pip install --upgrade pip
pip install -e '.[testing]'

- name: Cache test fixtures
uses: actions/cache@v4
with:
path: tests/fixtures
key: test-fixtures

- name: Run pytest
run: pytest tests/ -v --tb=short
1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
__pycache__/
mutopia/_version.py
data/
*.db
*.ipynb
65 changes: 64 additions & 1 deletion encode_analysis/Makefile
@@ -140,7 +140,12 @@ studies/%/presets.02/: staged/%.train.nc staged/%.test.nc
topo-model study create $@ -ds $^ -lsub 0.2 -creg 0.00025 --save-model -ee -stop 100 $$PARAMS

studies/%/presets.04/: staged/%.train.nc staged/%.test.nc
@mkdir -p studies/$*/presets.02
@mkdir -p studies/$*/presets.04
PARAMS=$$(python bin/params_from_config.py $*); \
topo-model study create $@ -ds $^ -lsub 0.2 --save-model -ee -stop 50 $$PARAMS

studies/%/presets.05/: staged/%.train.nc staged/%.test.nc
@mkdir -p studies/$*/presets.05
PARAMS=$$(python bin/params_from_config.py $*); \
topo-model study create $@ -ds $^ -lsub 0.2 --save-model -ee -stop 50 $$PARAMS

@@ -331,9 +336,67 @@ analyses/%/base_annotation.nc:
--wait \
--wrap="python bin/base_annotation.py --model studies/$$STUDY_NAME/$$STUDY_ID/trial=$$MODEL_ID.pkl --base-gtensor gtensors/Lung-All.nc --output $@"

# Alternative base annotations: impute mutation rates onto other tumor types' epigenomes.
# Add new bases to ALT_BASES to extend coverage; each generates
# analyses/%/base_annotations/<BASE>.nc via the shared recipe below.
ALT_BASES := Breast-All Kidney-All

define ALT_BASE_RECIPE
@mkdir -p $(dir $@)
@FULL_PATH="$*"; \
STUDY_NAME=$$(echo $$FULL_PATH | cut -d'/' -f1); \
STUDY_ID=$$(echo $$FULL_PATH | cut -d'/' -f2); \
MODEL_ID=$$(echo $$FULL_PATH | cut -d'/' -f3); \
BASE=$(notdir $(basename $@)); \
sbatch \
--mem=5G \
--cpus-per-task=5 \
--ntasks=1 \
--time=30:00 \
--job-name=$@ \
--output=%x.log \
--partition=short,park \
--account=park \
--wait \
--wrap="python bin/base_annotation.py \
--model studies/$$STUDY_NAME/$$STUDY_ID/trial=$$MODEL_ID.pkl \
--base-gtensor gtensors/$$BASE.nc \
--output $@"
endef

analyses/%/base_annotations/Breast-All.nc:
$(ALT_BASE_RECIPE)

analyses/%/base_annotations/Kidney-All.nc:
$(ALT_BASE_RECIPE)
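For illustration only, extending coverage to a hypothetical Liver-All epigenome (not part of this change, and assuming a gtensors/Liver-All.nc input exists) would mean appending the name to ALT_BASES and declaring one more target that reuses the shared recipe:

```make
# Hypothetical extension: Liver-All is an assumed base name, not part of this PR.
ALT_BASES := Breast-All Kidney-All Liver-All

# Reuses the shared sbatch recipe; reads gtensors/Liver-All.nc, writes the target .nc.
analyses/%/base_annotations/Liver-All.nc:
	$(ALT_BASE_RECIPE)
```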

joint_summary.tar.gz:
tar -cvf $@ $$(for model in $$(cat models.txt); do echo "analyses/$$model/annotated.shap.nc" "analyses/$$model/base_annotation.nc analyses/$$model/analysis.ipynb"; done) meta_analysis.ipynb models.txt

joint_summary.doga.tar.gz:
tar -cvf $@ $$(for model in $$(cat models.txt); do echo "analyses/$$model/base_annotation.nc*.gz analyses/$$model/annotated.shap.nc"; done) analyses/joint_collection/*

annotated.tar.gz:
tar -cvf $@ $$(for model in $$(cat models.txt); do echo "analyses/$$model/annotated.nc"; done)

# Packages Lung-All base annotation (original), two alt-base annotations (Breast-All, Kidney-All),
# and annotated.nc (same-tumor imputation — already contains component_distributions via annot_data).
multi_base_annotations.tar.gz:
tar -cvf $@ $$(for model in $$(cat models.txt); do \
echo "analyses/$$model/base_annotation.nc"; \
echo "analyses/$$model/base_annotations/Breast-All.nc"; \
echo "analyses/$$model/base_annotations/Kidney-All.nc"; \
echo "analyses/$$model/annotated.nc"; \
done)

# Each model in models.txt has the form "tumor_type/presets.04/model_id"; for each one we add studies/tumor_type/presets.04/trial=model_id.pkl to the tarball,
# renamed to "tumor_type.model.pkl" to avoid a deep directory structure inside the archive.
models.tar.gz:
mkdir -p temp_models && \
for model in $$(cat models.txt); do \
STUDY_NAME=$$(echo $$model | cut -d'/' -f1); \
MODEL_ID=$$(echo $$model | cut -d'/' -f3); \
cp studies/$$STUDY_NAME/presets.04/trial=$$MODEL_ID.pkl temp_models/$$STUDY_NAME.model.pkl; \
done && \
tar -cvf $@ -C temp_models . && \
rm -rf temp_models
4 changes: 2 additions & 2 deletions encode_analysis/base_config.yaml
@@ -3,7 +3,7 @@ min_region_size: 100
region_size: 10000

genome:
fasta: /Users/allen/genomes/hg38/hg38.fa
fasta: /n/data1/hms/dbmi/park/SOFTWARE/REFERENCE/hg38/cgap_matches/Homo_sapiens_assembly38.fa
chromsizes: annotations/hg38.mainchroms.sizes
mutation_rate_file: annotations/mutation-rate.hg38.bedgraph.gz
blacklist: annotations/blacklist_method12_v1_comb_sort_merged.bed.gz
@@ -16,4 +16,4 @@ pipeline_params:
repeat_masker_fraction: annotations/hg38.repeat_masker_fraction.bed.gz

sample_params:
cluster: false
cluster: true
7 changes: 6 additions & 1 deletion mutopia/__init__.py
@@ -1 +1,6 @@
__version__ = "1.0.7"
from importlib.metadata import version, PackageNotFoundError

try:
__version__ = version("mutopia")
except PackageNotFoundError:
__version__ = "0.0.0+unknown"
52 changes: 51 additions & 1 deletion mutopia/cli/model_cli.py
@@ -841,7 +841,57 @@ def annot(
annot(model, dataset, output, region=region, threads=threads, calc_shap=calc_shap, celltype=celltype)


@model.command("predict", short_help="Estimate sample contributions via SVI")
@model.command("score", short_help="Score model on held-out loci")
@click.argument("model", type=click.Path(exists=True), metavar="MODEL_FILE")
@click.argument("dataset", type=click.Path(exists=True), metavar="DATASET_FILE")
@click.option(
"--test-chrom",
type=str,
multiple=True,
default=("chr2",),
help="Chromosome(s) to hold out for testing (default: chr2)",
)
@click.option(
"-@",
"--threads",
type=click.IntRange(1, 1000),
default=1,
help="Number of parallel threads",
)
def score(
model: str,
dataset: str,
test_chrom: tuple,
threads: int = 1,
):
"""
Score a trained model on a dataset using cross-validation by locus.

Fits per-sample local variables on non-held-out chromosomes, then
evaluates reconstruction quality (pseudo-R²) on the held-out chromosomes.

Examples:
# Score with default held-out chromosome (chr2)
model score trained_model.pkl data.nc

# Hold out multiple chromosomes
model score trained_model.pkl data.nc --test-chrom chr1 --test-chrom chr2

# Score with parallel threads
model score trained_model.pkl data.nc --threads 8
"""
from .model_core import score_model

result = score_model(
model_path=model,
dataset_path=dataset,
test_chroms=test_chrom,
threads=threads,
)
click.echo(f"{result:.6f}")


@model.command("add-model-state", short_help="Add model state to dataset")
@click.argument("model", type=click.Path(exists=True), metavar="MODEL_FILE")
@click.argument("dataset", type=click.Path(exists=True), metavar="DATASET_FILE")
@click.argument("output", type=click.Path(writable=True), metavar="OUTPUT_FILE")
14 changes: 14 additions & 0 deletions mutopia/cli/model_core.py
@@ -168,6 +168,20 @@ def add_model_state(model_path: str, dataset_path: str, output_path: str):
disk.write_dataset(dataset, output_path)


def score_model(
model_path: str,
dataset_path: str,
test_chroms: tuple = ("chr2",),
threads: int = 1,
) -> float:
import mutopia.analysis as mu

model = mu.load_model(model_path)
dataset = gt.eager_load(dataset_path)

return model.score(dataset, test_chroms=test_chroms, threads=threads)
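The same scoring is available programmatically; a minimal sketch, assuming the package is installed and using placeholder file names:

```python
from mutopia.cli.model_core import score_model

# Placeholder paths; test_chroms mirrors the CLI's --test-chrom option.
pseudo_r2 = score_model(
    model_path="trained_model.pkl",
    dataset_path="data.nc",
    test_chroms=("chr1", "chr2"),
    threads=4,
)
print(f"held-out pseudo-R2: {pseudo_r2:.6f}")
```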


def simulate_from_model(
model_path: str,
dataset_path: str,
15 changes: 13 additions & 2 deletions mutopia/cli/pipeline_tasks.py
@@ -15,7 +15,7 @@
list_samples,
)
from .pipeline_config import GTensorConfig, ProcessingConfig, FeatureConfig
from urllib.parse import urlparse
from urllib.parse import urlparse, parse_qs, unquote
import shutil
from mutopia.cli.gensor_core import (
create_gtensor,
@@ -94,6 +94,16 @@ def download_path(self):
"""Determine the local path to download the file to."""
parsed_url = urlparse(self.url)
filename = os.path.basename(parsed_url.path)
# For URLs with query-parameter filenames (e.g. GEO/NCBI), extract
# the filename from the 'file' query param if the path yields nothing.
if not filename or filename == "download" or filename == "download/":
query_file = parse_qs(parsed_url.query).get("file", [None])[0]
if query_file:
filename = unquote(query_file)
if not filename:
# Last resort: hash the URL for a stable unique name
import hashlib
filename = hashlib.md5(self.url.encode()).hexdigest()
return os.path.join("gtensor__tempfiles/downloads", filename)

def output(self):
@@ -406,7 +416,8 @@ def run(self):
params.pop("file", None) # Remove file as it's not needed here
logger.info(
f"Ingesting sample '{self.sample_id}' from file '{file_path}' "
f"into GTensor '{gtensor_path}'"
f"into GTensor '{gtensor_path}'. "
f"Clustering will be performed: {params['cluster']}. "
)
add_sample(**params)

8 changes: 4 additions & 4 deletions mutopia/gtensor/xarr_extensions.py
@@ -193,7 +193,7 @@ def __call__(self, *compress_dims: typing.List[str]):


@xr.register_dataarray_accessor("asdense")
class AsCSR(BaseAccessor):
class AsDense(BaseAccessor):
def __call__(self):
try:
self._xrds.data = self._xrds.data.todense()
@@ -209,13 +209,13 @@ def __call__(self):


@xr.register_dataset_accessor("list_samples")
class FetchSample(BaseAccessor):
class ListSamples(BaseAccessor):
def __call__(self):
return self._xrds.sample.values


@xr.register_dataset_accessor("mutate")
class FetchSample(BaseAccessor):
class Mutate(BaseAccessor):
def __call__(self, fn):
return fn(self._xrds)

@@ -228,7 +228,7 @@ def __call__(self, sample_name):


@xr.register_dataset_accessor("iter_samples")
class FetchSample(BaseAccessor):
class IterSamples(BaseAccessor):
def __call__(self, subset=None):
load_samples = subset or self._xrds.list_samples()
for sample_name in load_samples:
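With xarray accessors, the string passed to register_dataset_accessor is what becomes the attribute name; the class name plays no role in registration, so the duplicate FetchSample definitions only shadowed one another as classes, and the renames above make the module unambiguous without changing behavior. A minimal usage sketch for the two accessors whose bodies are fully shown (the import path is taken from this diff; the toy dataset is illustrative):

```python
import numpy as np
import xarray as xr

import mutopia.gtensor.xarr_extensions  # noqa: F401  (importing registers the accessors)

# Toy dataset with a 'sample' coordinate, just enough to exercise the accessors.
ds = xr.Dataset(
    {"counts": (("sample", "locus"), np.zeros((2, 3)))},
    coords={"sample": ["s1", "s2"], "locus": [0, 1, 2]},
)

print(ds.list_samples())               # array(['s1', 's2'], ...)
shifted = ds.mutate(lambda d: d + 1)   # mutate() simply applies the callable to the dataset
```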