18 changes: 18 additions & 0 deletions .github/workflows/publish.yml
@@ -17,10 +17,28 @@ on:
workflow_dispatch:

jobs:
tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: pip
- run: pip install -e '.[testing]'
- uses: actions/cache@v4
with:
path: tests/fixtures
key: test-fixtures
- run: pytest tests/ --tb=short

build:
needs: tests
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # setuptools_scm needs full history + tags

- uses: actions/setup-python@v5
with:
32 changes: 32 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,32 @@
name: Tests

on:
push:
branches: [main, imports]
pull_request:
workflow_dispatch:

jobs:
pytest:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: pip

- name: Install package
run: |
python -m pip install --upgrade pip
pip install -e '.[testing]'

- name: Cache test fixtures
uses: actions/cache@v4
with:
path: tests/fixtures
key: test-fixtures

- name: Run pytest
run: pytest tests/ -v --tb=short
1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
__pycache__/
mutopia/_version.py
data/
*.db
*.ipynb
65 changes: 64 additions & 1 deletion encode_analysis/Makefile
@@ -140,7 +140,12 @@ studies/%/presets.02/: staged/%.train.nc staged/%.test.nc
topo-model study create $@ -ds $^ -lsub 0.2 -creg 0.00025 --save-model -ee -stop 100 $$PARAMS

studies/%/presets.04/: staged/%.train.nc staged/%.test.nc
@mkdir -p studies/$*/presets.02
@mkdir -p studies/$*/presets.04
PARAMS=$$(python bin/params_from_config.py $*); \
topo-model study create $@ -ds $^ -lsub 0.2 --save-model -ee -stop 50 $$PARAMS

studies/%/presets.05/: staged/%.train.nc staged/%.test.nc
@mkdir -p studies/$*/presets.05
PARAMS=$$(python bin/params_from_config.py $*); \
topo-model study create $@ -ds $^ -lsub 0.2 --save-model -ee -stop 50 $$PARAMS

@@ -331,9 +336,67 @@ analyses/%/base_annotation.nc:
--wait \
--wrap="python bin/base_annotation.py --model studies/$$STUDY_NAME/$$STUDY_ID/trial=$$MODEL_ID.pkl --base-gtensor gtensors/Lung-All.nc --output $@"

# Alternative base annotations: impute mutation rates onto other tumor types' epigenomes.
# Add new bases to ALT_BASES to extend coverage; each generates
# analyses/%/base_annotations/<BASE>.nc via the shared recipe below.
ALT_BASES := Breast-All Kidney-All

define ALT_BASE_RECIPE
@mkdir -p $(dir $@)
@FULL_PATH="$*"; \
STUDY_NAME=$$(echo $$FULL_PATH | cut -d'/' -f1); \
STUDY_ID=$$(echo $$FULL_PATH | cut -d'/' -f2); \
MODEL_ID=$$(echo $$FULL_PATH | cut -d'/' -f3); \
BASE=$(notdir $(basename $@)); \
sbatch \
--mem=5G \
--cpus-per-task=5 \
--ntasks=1 \
--time=30:00 \
--job-name=$@ \
--output=%x.log \
--partition=short,park \
--account=park \
--wait \
--wrap="python bin/base_annotation.py \
--model studies/$$STUDY_NAME/$$STUDY_ID/trial=$$MODEL_ID.pkl \
--base-gtensor gtensors/$$BASE.nc \
--output $@"
endef

analyses/%/base_annotations/Breast-All.nc:
$(ALT_BASE_RECIPE)

analyses/%/base_annotations/Kidney-All.nc:
$(ALT_BASE_RECIPE)
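For illustration only, extending coverage to a hypothetical Liver-All epigenome (not part of this change, and assuming a gtensors/Liver-All.nc input exists) would mean appending the name to ALT_BASES and declaring one more target that reuses the shared recipe:

```make
# Hypothetical extension: Liver-All is an assumed base name, not part of this PR.
ALT_BASES := Breast-All Kidney-All Liver-All

# Reuses the shared sbatch recipe; reads gtensors/Liver-All.nc, writes the target .nc.
analyses/%/base_annotations/Liver-All.nc:
	$(ALT_BASE_RECIPE)
```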

joint_summary.tar.gz:
tar -cvf $@ $$(for model in $$(cat models.txt); do echo "analyses/$$model/annotated.shap.nc" "analyses/$$model/base_annotation.nc analyses/$$model/analysis.ipynb"; done) meta_analysis.ipynb models.txt

joint_summary.doga.tar.gz:
tar -cvf $@ $$(for model in $$(cat models.txt); do echo "analyses/$$model/base_annotation.nc*.gz analyses/$$model/annotated.shap.nc"; done) analyses/joint_collection/*

annotated.tar.gz:
tar -cvf $@ $$(for model in $$(cat models.txt); do echo "analyses/$$model/annotated.nc"; done)

# Packages Lung-All base annotation (original), two alt-base annotations (Breast-All, Kidney-All),
# and annotated.nc (same-tumor imputation — already contains component_distributions via annot_data).
multi_base_annotations.tar.gz:
tar -cvf $@ $$(for model in $$(cat models.txt); do \
echo "analyses/$$model/base_annotation.nc"; \
echo "analyses/$$model/base_annotations/Breast-All.nc"; \
echo "analyses/$$model/base_annotations/Kidney-All.nc"; \
echo "analyses/$$model/annotated.nc"; \
done)

# Each model in models.txt has the form "tumor_type/presets.04/model_id"; for each one we add studies/tumor_type/presets.04/trial=model_id.pkl to the tarball,
# renamed to "tumor_type.model.pkl" to avoid a deep directory structure inside the archive.
models.tar.gz:
mkdir -p temp_models && \
for model in $$(cat models.txt); do \
STUDY_NAME=$$(echo $$model | cut -d'/' -f1); \
MODEL_ID=$$(echo $$model | cut -d'/' -f3); \
cp studies/$$STUDY_NAME/presets.04/trial=$$MODEL_ID.pkl temp_models/$$STUDY_NAME.model.pkl; \
done && \
tar -cvf $@ -C temp_models . && \
rm -rf temp_models
4 changes: 2 additions & 2 deletions encode_analysis/base_config.yaml
@@ -3,7 +3,7 @@ min_region_size: 100
region_size: 10000

genome:
fasta: /Users/allen/genomes/hg38/hg38.fa
fasta: /n/data1/hms/dbmi/park/SOFTWARE/REFERENCE/hg38/cgap_matches/Homo_sapiens_assembly38.fa
chromsizes: annotations/hg38.mainchroms.sizes
mutation_rate_file: annotations/mutation-rate.hg38.bedgraph.gz
blacklist: annotations/blacklist_method12_v1_comb_sort_merged.bed.gz
@@ -16,4 +16,4 @@ pipeline_params:
repeat_masker_fraction: annotations/hg38.repeat_masker_fraction.bed.gz

sample_params:
cluster: false
cluster: true
7 changes: 6 additions & 1 deletion mutopia/__init__.py
@@ -1 +1,6 @@
__version__ = "1.0.7"
from importlib.metadata import version, PackageNotFoundError

try:
__version__ = version("mutopia")
except PackageNotFoundError:
__version__ = "0.0.0+unknown"
52 changes: 51 additions & 1 deletion mutopia/cli/model_cli.py
@@ -841,7 +841,57 @@ def annot(
annot(model, dataset, output, region=region, threads=threads, calc_shap=calc_shap, celltype=celltype)


@model.command("predict", short_help="Estimate sample contributions via SVI")
@model.command("score", short_help="Score model on held-out loci")
@click.argument("model", type=click.Path(exists=True), metavar="MODEL_FILE")
@click.argument("dataset", type=click.Path(exists=True), metavar="DATASET_FILE")
@click.option(
"--test-chrom",
type=str,
multiple=True,
default=("chr2",),
help="Chromosome(s) to hold out for testing (default: chr2)",
)
@click.option(
"-@",
"--threads",
type=click.IntRange(1, 1000),
default=1,
help="Number of parallel threads",
)
def score(
model: str,
dataset: str,
test_chrom: tuple,
threads: int = 1,
):
"""
Score a trained model on a dataset using cross-validation by locus.

Fits per-sample local variables on non-held-out chromosomes, then
evaluates reconstruction quality (pseudo-R²) on the held-out chromosomes.

Examples:
# Score with default held-out chromosome (chr2)
model score trained_model.pkl data.nc

# Hold out multiple chromosomes
model score trained_model.pkl data.nc --test-chrom chr1 --test-chrom chr2

# Score with parallel threads
model score trained_model.pkl data.nc --threads 8
"""
from .model_core import score_model

result = score_model(
model_path=model,
dataset_path=dataset,
test_chroms=test_chrom,
threads=threads,
)
click.echo(f"{result:.6f}")


@model.command("add-model-state", short_help="Add model state to dataset")
@click.argument("model", type=click.Path(exists=True), metavar="MODEL_FILE")
@click.argument("dataset", type=click.Path(exists=True), metavar="DATASET_FILE")
@click.argument("output", type=click.Path(writable=True), metavar="OUTPUT_FILE")
14 changes: 14 additions & 0 deletions mutopia/cli/model_core.py
@@ -168,6 +168,20 @@ def add_model_state(model_path: str, dataset_path: str, output_path: str):
disk.write_dataset(dataset, output_path)


def score_model(
model_path: str,
dataset_path: str,
test_chroms: tuple = ("chr2",),
threads: int = 1,
) -> float:
import mutopia.analysis as mu

model = mu.load_model(model_path)
dataset = gt.eager_load(dataset_path)

return model.score(dataset, test_chroms=test_chroms, threads=threads)
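The same scoring is available programmatically; a minimal sketch, assuming the package is installed and using placeholder file names:

```python
from mutopia.cli.model_core import score_model

# Placeholder paths; test_chroms mirrors the CLI's --test-chrom option.
pseudo_r2 = score_model(
    model_path="trained_model.pkl",
    dataset_path="data.nc",
    test_chroms=("chr1", "chr2"),
    threads=4,
)
print(f"held-out pseudo-R2: {pseudo_r2:.6f}")
```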


def simulate_from_model(
model_path: str,
dataset_path: str,
15 changes: 13 additions & 2 deletions mutopia/cli/pipeline_tasks.py
@@ -15,7 +15,7 @@
list_samples,
)
from .pipeline_config import GTensorConfig, ProcessingConfig, FeatureConfig
from urllib.parse import urlparse
from urllib.parse import urlparse, parse_qs, unquote
import shutil
from mutopia.cli.gensor_core import (
create_gtensor,
@@ -94,6 +94,16 @@ def download_path(self):
"""Determine the local path to download the file to."""
parsed_url = urlparse(self.url)
filename = os.path.basename(parsed_url.path)
# For URLs with query-parameter filenames (e.g. GEO/NCBI), extract
# the filename from the 'file' query param if the path yields nothing.
if not filename or filename == "download" or filename == "download/":
query_file = parse_qs(parsed_url.query).get("file", [None])[0]
if query_file:
filename = unquote(query_file)
if not filename:
# Last resort: hash the URL for a stable unique name
import hashlib
filename = hashlib.md5(self.url.encode()).hexdigest()
return os.path.join("gtensor__tempfiles/downloads", filename)

def output(self):
@@ -406,7 +416,8 @@ def run(self):
params.pop("file", None) # Remove file as it's not needed here
logger.info(
f"Ingesting sample '{self.sample_id}' from file '{file_path}' "
f"into GTensor '{gtensor_path}'"
f"into GTensor '{gtensor_path}'. "
f"Clustering will be performed: {params['cluster']}. "
)
add_sample(**params)

8 changes: 4 additions & 4 deletions mutopia/gtensor/xarr_extensions.py
@@ -193,7 +193,7 @@ def __call__(self, *compress_dims: typing.List[str]):


@xr.register_dataarray_accessor("asdense")
class AsCSR(BaseAccessor):
class AsDense(BaseAccessor):
def __call__(self):
try:
self._xrds.data = self._xrds.data.todense()
@@ -209,13 +209,13 @@ def __call__(self):


@xr.register_dataset_accessor("list_samples")
class FetchSample(BaseAccessor):
class ListSamples(BaseAccessor):
def __call__(self):
return self._xrds.sample.values


@xr.register_dataset_accessor("mutate")
class FetchSample(BaseAccessor):
class Mutate(BaseAccessor):
def __call__(self, fn):
return fn(self._xrds)

@@ -228,7 +228,7 @@ def __call__(self, sample_name):


@xr.register_dataset_accessor("iter_samples")
class FetchSample(BaseAccessor):
class IterSamples(BaseAccessor):
def __call__(self, subset=None):
load_samples = subset or self._xrds.list_samples()
for sample_name in load_samples:
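With xarray accessors, the string passed to register_dataset_accessor is what becomes the attribute name; the class name plays no role in registration, so the duplicate FetchSample definitions only shadowed one another as classes, and the renames above make the module unambiguous without changing behavior. A minimal usage sketch for the two accessors whose bodies are fully shown (the import path is taken from this diff; the toy dataset is illustrative):

```python
import numpy as np
import xarray as xr

import mutopia.gtensor.xarr_extensions  # noqa: F401  (importing registers the accessors)

# Toy dataset with a 'sample' coordinate, just enough to exercise the accessors.
ds = xr.Dataset(
    {"counts": (("sample", "locus"), np.zeros((2, 3)))},
    coords={"sample": ["s1", "s2"], "locus": [0, 1, 2]},
)

print(ds.list_samples())               # array(['s1', 's2'], ...)
shifted = ds.mutate(lambda d: d + 1)   # mutate() simply applies the callable to the dataset
```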