diff --git a/.flake8 b/.flake8
deleted file mode 100644
index 8dcf28a7..00000000
--- a/.flake8
+++ /dev/null
@@ -1,12 +0,0 @@
-[flake8]
-# See https://pycodestyle.pycqa.org/en/latest/intro.html#error-codes
-ignore =
- # E203: whitespace before ‘,’, ‘;’, or ‘:’
- E203,
- # E266: too many leading ‘#’ for block comment
- E266,
- # E501: line too long
- E501,
- # W503: line break before binary operator
- W503
-exclude = .git,docs,__init__.py
diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml
new file mode 100644
index 00000000..5cac6697
--- /dev/null
+++ b/.github/workflows/formatting.yml
@@ -0,0 +1,19 @@
+name: dimelo-formatting
+on: [ pull_request ]
+jobs:
+ ruff-check:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - uses: chartboost/ruff-action@v1
+ with:
+ version: 0.6.8
+ # TODO: Is it really necessary for these to be separate jobs? This seems redundant.
+ ruff-format-check:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - uses: chartboost/ruff-action@v1
+ with:
+ version: 0.6.8
+ args: 'format --check'
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a10d46a5..5c62aa79 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -1,65 +1,42 @@
-name: dimelo
+name: dimelo-test
-on: [push]
+on:
+ workflow_dispatch: # Allows manual trigger of the workflow
+ pull_request: # Trigger by default on PR opened, reopened, or synchronized (commits pushed to PR)
jobs:
- build-conda:
- runs-on: ubuntu-latest
+ platform_matrix: # Run the tests on each supported platform
strategy:
matrix:
- python-version: ["3.7"]
-
+ platform: # Define the platform tag and the file name of the appropriate miniconda install script
+ - os: ubuntu-latest
+ miniconda: Miniconda3-latest-Linux-x86_64.sh
+ - os: macos-latest
+ miniconda: Miniconda3-latest-MacOSX-x86_64.sh
+ runs-on: ${{ matrix.platform.os }}
steps:
- - uses: actions/checkout@v3
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v3
- with:
- python-version: ${{ matrix.python-version }}
- # TODO: If we change the name of the dimelo subdirectory, conda install as written here should break.
- # Installing format dependencies with pip for full compatibility
- - name: Install dependencies
- run: |
- $CONDA/bin/conda env update --file environment_linux.yml --name base
- pip install flake8 black isort
- $CONDA/bin/conda install pytest
- - name: Lint with flake8
- run: |
- flake8
- - name: Check format with black
- run: |
- black --config pyproject.toml --diff --check .
- - name: Clean up and sort imports
- run: |
- isort --check-only .
- - name: Test with pytest
- run: |
- $CONDA/bin/pytest
-
- build-pip:
- runs-on: ubuntu-latest
- strategy:
- matrix:
- python-version: ["3.7"]
- steps:
- - uses: actions/checkout@v3
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v3
- with:
- python-version: ${{ matrix.python-version }}
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install flake8 black isort pytest
+ - uses: actions/checkout@v4
+ - name: Setup conda and run pytest
+ run: |
+ # Install Miniconda if not available on the runner
+ wget https://repo.anaconda.com/miniconda/${{ matrix.platform.miniconda }} -O miniconda.sh
+ bash miniconda.sh -b -p $HOME/miniconda
+ source $HOME/miniconda/etc/profile.d/conda.sh
+
+ # Create conda environment from environment.yml
+ conda env create -f environment.yml
+
+ # Activate the environment
+ conda activate $(head -n 1 environment.yml | cut -d' ' -f2)
+
+ # Install pip dependencies and the package itself
pip install .
- - name: Lint with flake8
- run: |
- flake8
- - name: Check format with black
- run: |
- black --diff --check .
- - name: Clean up and sort imports
- run: |
- isort --check-only .
- - name: Test with pytest
- run: |
+
+ # Additional testing dependencies not covered in environment.yml
+ conda install pytest
+
+ # Make sure everything is installed correctly
+ conda list
+
+ # Run pytest
pytest
diff --git a/.gitignore b/.gitignore
index c8b25e67..f6236edc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,4 +53,12 @@ coverage.xml
.pytest_cache/
# Sphinx documentation
-docs/_build/
\ No newline at end of file
+docs/_build/
+
+
+# Ignore tutorial output files
+dimelo/test/output
+
+# Checkpoint files
+.ipynb_checkpoints/
+
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f4837ea6..5194a15d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,15 +1,16 @@
repos:
- - repo: https://github.com/psf/black
- rev: 22.3.0
- hooks:
- - id: black
- - repo: https://gitlab.com/pycqa/flake8
- rev: 3.9.2
- hooks:
- - id: flake8
- - repo: https://github.com/pycqa/isort
- rev: 5.9.3
- hooks:
- - id: isort
- name: isort (python)
- additional_dependencies: [toml]
+- repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.6.8
+ hooks:
+ # Run the linter.
+ - id: ruff
+ # Run the formatter.
+ - id: ruff-format
+# - repo: https://github.com/pre-commit/mirrors-mypy
+# rev: v1.9.0
+# hooks:
+# - id: mypy
+ # These are things I'm considering including in the ignore list, but should think about...
+ # Ignore missing import errors caused by pre-commit being run in an isolated environment
+ # Allow conflicting assignment to existing variables in reasonable circumstances; mainly for coersion of path strings
+ # args: [--ignore-missing-imports, --allow-redefinition]
diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index c29515a8..00000000
--- a/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2021 amaslan
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/Makefile b/Makefile
deleted file mode 100644
index c322dbee..00000000
--- a/Makefile
+++ /dev/null
@@ -1,57 +0,0 @@
-define help
-
-Supported targets: prepare, develop, sdist, clean, test, and pypi.
-
-The 'prepare' target installs this project's build requirements into the current virtualenv.
-
-The 'develop' target creates an editable install of this project and its runtime requirements in the
-current virtualenv. The install is called 'editable' because changes to the source code
-immediately affect the virtualenv.
-
-The 'clean' target undoes the effect of 'develop'.
-
-The 'test' target runs unit tests. Set the 'tests' variable to run a particular test, e.g.
-
- make test tests=PlotMAPQtest/countMAPQ_test.py
-
-The 'pypi' target publishes the current commit of this project to PyPI after enforcing that the working
-copy and the index are clean, and tagging it as an unstable .dev build.
-
-endef
-export help
-help:
- @printf "$$help"
-
-SHELL=bash
-python=python
-pip=pip
-tests=.
-version:=$(shell $(python) version.py)
-sdist_name:=dimelo-$(version).tar.gz
-
-develop:
- $(pip) install -e .
-
-clean_develop:
- - $(pip) uninstall -y dimelo
- - rm -rf *.egg-info
-
-clean_sdist:
- - rm -rf dist
-
-clean: clean_develop clean_pypi
-
-check_build_reqs:
- @$(python) -c 'import pytest' \
- || ( printf "$(redpip)Build requirements are missing. Run 'make prepare' to install them.$(normal)" ; false )
-
-test: check_build_reqs
- $(python) -m pytest -vv $(tests)
-
-pypi: clean clean_sdist
- set -x \
- && $(python) setup.py sdist bdist_wheel \
- && twine check dist/* \
- && twine upload --repository-url https://test.pypi.org/legacy/ dist/*
-clean_pypi:
- - rm -rf build/
\ No newline at end of file
diff --git a/README.md b/README.md
index 0b188e90..a5746096 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,371 @@
-[](https://github.com/streetslab/dimelo/actions)
-[](https://github.com/psf/black)
+# dimelo
+## Introduction
-# DEPRECATION NOTICE:
-This version of the package is currently unmaintained. Aside from a number of performance issues, it is becoming increasingly difficult to install across many platforms. We are actively working on a completely overhauled version. If you are interested in using our new software, please reach out to us! We would love to work with you to facilitate your DiMeLo-seq analysis.
+The `dimelo` package provides an integrated pipeline for the analysis of multimodal single molecule epigenetic measurements. It is designed with long-read sequencing measurements in mind, but is compatible with any sequencing processing pipeline that generates specification-compliant modbam files.
-# dimelo
-dimelo is a python package and command-line tool for analysis of DiMeLo-seq & nanopore modified base data.
+v0.2.0 is a major overhaul compared to v0.1.0. It supports the same core pileup and single read extraction operations as the original `dimelo v0.1.0` package, but focuses on a number of new objectives:
+
+
+
Support multicolor data / any base modification context (GpC, CpC, etc)
+
Vector extraction for all data types
+
Enhanced speed and reliability, enabling e.g. whole genome processing
+
Maintainability -> using a small number of standard dependencies, outsourcing as much as possible to well-maintained third-party packages (e.g. modkit, pysam, h5py, and a few others)
+
Modularity in both architecture and operation
+
Ease of use, especially for multiplatform installation
+
More powerful plotting e.g. bam files from different basecallers, single read sorting, rapid iteration
+
+
+This README document contains installation instructions and documentation for various use cases. There is a [tutorial](#basic-use) jupyter notebook that will take you through the core functionality of the package step-by-step. For Google Colab, the notebook already contains the necessary code to set up and run `dimelo`, whereas for local operation you will first need to follow the [local install instructions](#local-install-via-conda). Be sure to check that your system meets our [specifications](#system-requirements). The software is still in early release, and as such, user feedback and requests are greatly appreciated.
+
+## Contents
+[1.0 Install Instructions](#Install-instructions)
+
+-[1.1 Local Install via Conda](#Local-Install-via-Conda)
+
+-[1.2 Google Colab Installation](#Google-Colab-Installation)
+
+-[1.3 Alternative Installations](#Alternative-Installations)
+
+-[1.4 Developer Installation](#Developer-Installation)
+
+[2.0 Basic Use](#Basic-Use)
+
+-[2.1 Parameters and what they mean](#Parameters-and-what-they-mean)
+
+-[2.2 Parsing and processing](#Parsing-and-processing)
+
+-[2.3 Plotting](#Plotting)
+
+-[2.4 Load values from processed files](#load-values-from-processed-files)
+
+[3.0 Known Issues](#known-issues)
+
+-[3.1 No progress bars](#no-progress-bars)
+
+# Install Instructions
+## Local Install via Conda
+
+### System Requirements
+
+**System Specs:** You will need at least 10GB of disk space and 10GB of RAM for the tutorial (largely due to the reference genome). More disk space may be required if processing large datasets. Additionally, if you want to run on many cores, you should have at least 4GB of RAM per core. If you have less per-core memory than this, consider specifying a subset of cores when calling parsing methods. See the [parameters](#parameters-and-what-they-mean) section for more information. `cores=1` will use the least memory and thus is the least likely to be terminated by your OS.
+
+**Platforms:** Mac and Linux operating systems, ARM (e.g. M1/M2 mac) and x86 (e.g. Intel mac) architectures. The package has been tested on HPC clusters, but there may be additional complexities depending on how these systems are set up.
+
+*For Windows, we recommend using [Google Colab](https://colab.research.google.com/). We have not tested on [Windows Linux Subsystem](https://learn.microsoft.com/en-us/windows/wsl/install) but in principle that should work too. Windows support is possible in future, but blocked by [conda availability for modkit executables](https://anaconda.org/nanoporetech/modkit) and the [current implementation](dimelo/run_modkit.py) of live error/progress tracking during modkit execution, which relies on a unix-only library as of Python 3.11. The urgency of a Windows implementation will depend on user need, so please let us know if this is important for you.*
+
+**Conda and Python:** The default installation requires conda, or alternatives like mamba. See [here](https://www.anaconda.com/download) for conda installation. The installation instructions below will install Python 3.11 for you within a conda virtual environment, but depending on your system configuration you may need to ensure that you are not also loading a different version of Python on your path. If you encounter unexpected errors when importing `dimelo`, e.g. complaining about syntax, consider checking your Python version.
+
+### Load source code from the modkit_parsing_beta branch
+
+Open your terminal or command line and navigate to wherever you want to keep the `dimelo` source code (e.g. your Documents folder, `cd Documents`) and clone the repo
+
+```
+git clone https://github.com/streetslab/dimelo
+```
+
+### Set up virtual environment
+
+Navigate into the dimelo directory
+
+```
+cd dimelo
+```
+
+Create a conda environment using environment.yml. This will make a new conda environment with the name `dimelo`.
+
+```
+conda env create -f environment.yml
+```
+
+*If you want to handle environment creation yourself, see [the alternative installation instructions](#alternative-installations).*
+
+### Install pip dependencies and core dimelo package
+
+Activate your conda environment, which should now contain python 3.11 and a modkit executable on the path and executable on your system.
+
+```
+conda activate dimelo
+```
+
+Ensure that you are still in the top-level dimelo directory. Install the dimelo package and its dependencies from source.
+
+```
+pip install .
+```
+
+## Google Colab Installation
+
+Run the following code in the first cell of your notebook to grab `modkit v0.2.4` from conda and install the `dimelo modkit_parsing_beta` branch. This will have to be run whenever you make a new Colab instance, unless you have a better way of managing this, in which case please reach out. The tutorial notebook runs equivalent code blocks to set up your environment, so if you are trying to run the tutorial you can skip to [Basic Use](#basic-use).
+
+```
+from google.colab import drive
+drive.mount('/content/drive')
+!pip install -q condacolab
+import condacolab
+condacolab.install()
+!conda install nanoporetech::modkit==0.2.4
+!git clone https://github.com/streetslab/dimelo
+!cd dimelo && pip install ipywidgets==7.7.1 .
+import dimelo
+```
+
+## Alternative Installations
+
+Alternatively, you can install modkit into any conda environment you like. If you want to, you can install modkit some other way, and then add it to the path of your notebook or script. *NOTE: if you are creating the environment yourself, be sure to use python 3.10 or greater. Some dimelo package features require relatively new python releases.*
+
+```
+conda install nanoporetech::modkit==0.2.4
+```
+OR
+```
+# install modkit some other way
+# add to path in python before importing dimelo
+import sys
+sys.path.append('path_to_modkit_executable_directory')
+```
+
+## Developer Installation
+If you are planning on developing for the `dimelo` package, change the `pip install` command to install the package in "editable" mode, so that your code changes are reflected in your environment:
+```
+pip install . -e
+```
+
+Additionally, be aware that this package uses [ruff](https://docs.astral.sh/ruff/) to enforce code standards. To make it easy to check that your changes meet standards, we provide [pre-commit](https://pre-commit.com/) hooks that run the checks automatically when you commit.
+
+After installing [pre-commit](https://pre-commit.com/) on your system, run the following from the top level of the repository to set up the hooks:
+```
+pre-commit install
+```
+
+If you need to manually trigger a formatting check, the following command will forcibly run all checks on the entire repository:
+```
+pre-commit run --all-files
+```
+
+To run functionality tests on your machine, ensure that `pytest` is installed in your conda environment:
+```
+conda install pytest
+```
+
+The tests can be run from the top level of the repository using the following command:
+```
+pytest
+```
+
+# Basic Use
+
+See the [tutorial](tutorial.ipynb) as a starting point.
+
+For local operation on Mac or Linux, you will already have cloned the repo to disk in the installation step. Activate your conda environment, make sure you have jupyter installed, and then launch a jupyter notebook server and navigate to `tutorial.ipynb`. You can also use other tools to open the jupyter notebook or you can simply reference it as an example.
+
+```
+conda activate dimelo
+jupyter notebook
+```
+
+If you want to run the tutorial on Google Colab, you can download [tutorial.ipynb](tutorial.ipynb), upload it to [Google Colab](https://colab.research.google.com/), and follow the instructions in the cells.
+
+## Parsing and processing
+
+The general workflow of this package is as follows:
+```
+Parsing: aligned modbam file (latest .bam spec) --> processed file
+Loading: processed file --> python objects
+Plotting: python objects --> visualizations
+```
+
+Both pileup and extract are typically run with a .bed file of regions, which can then be also passed to the plotting functions. All regions are processed into a file called `regions.processed.bed` which follows the format required by `modkit`:
+```
+chr14 44123158 44123308 + . .
+```
+
+`parse_bam.pileup` creates a bedmethyl genome-position-wise pileup for profiles and enrichment plotting between regions/modifications or for pulling out a genomic track at one or more regions.
+
+```
+def pileup(
+ input_file: str | Path,
+ output_name: str,
+ ref_genome: str | Path,
+ output_directory: str | Path = None,
+ regions: str | Path | list[str | Path] = None,
+ motifs: list = ['A,0','CG,0'],
+ thresh: float = None,
+ window_size: int = None,
+ cores: int = None,
+ log: bool = False,
+ cleanup: bool = True,
+ quiet: bool = False,
+ override_checks: bool = False,) -> Path, Path:
+```
+
+`parse_bam.extract` creates an hdf5 file with datasets for different aspects of single read data, which can then be passed to plot single reads.
+```
+def extract(
+ input_file: str | Path,
+ output_name: str,
+ ref_genome: str | Path,
+ output_directory: str | Path = None,
+ regions: str | Path | list[str | Path] = None,
+ motifs: list = ['A,0','CG,0','GCH,1'],
+ thresh: float = None,
+ window_size: int = None,
+ cores: int = None,
+ log: bool = False,
+ cleanup: bool = True,
+ quiet: bool = False,
+ override_checks: bool = False,) -> Path, Path:
+```
+
+For human-readable pileups (bedmethyl files, .bed) and extracted reads (.txt tab-separated values), run with `cleanup=False`. `cleanup=True` will clear these outputs because they can take up a lot of space.
+
+### Parsing outputs
+You should expect to see some text outputs and a series of progress bars. Progress bars tell you an estimated time remaining (typically an overestimate by 2-3x at the beginning of contig/chromosome). If you do not see progress bars, go to the [known issues: no progress bars](#no-progress-bars) section for possible fixes.
+
+There should not be such issues for command line operation. See below an example of command line progress outputs: you should expect relatively fast pre-processing, 10-90 seconds, and then contig processing times depending heavily on the size of your `.bam` file and the extent of your `regions`.
+
+```
+(dimelo_modkit_parsing) oberondixon-luinenburg@Oberons-MacBook-Pro package_test_notebooks % python dimelo_cmd.py
+modkit found with expected version 0.2.4
+No output directory provided, using input directory /Users/oberondixon-luinenburg/Documents/Ioannidis-Streets/dimelo_test_data/20230702_jm_lmnb1_acessibility_redux
+No specified number of cores requested. 8 available on machine, allocating all.
+Modification threshold of 0.9 will be treated as coming from range 0-1.
+████████████████████| Preprocessing complete for motifs ['A,0'] in chm13.draft_v1.1.fasta: 100% | 00:30
+███████████████████| All regions complete in mod_mappings.01.retagged.ma.sorted.bam: 100% | 02:23<00:00
+████████████████| processed 218324 reads, 13323144 rows, skipped ~184465 reads, failed ~0 reads: 100%
+```
+
+You should see no outputs at all if `quiet=True`.
+
+## Plotting
+
+`plot_enrichment_profile` module for pileup line plot profiles across one or more region
+```
+def plot_enrichment_profile(mod_file_names: list[str | Path],
+ regions_list: list[str | Path | list[str | Path]],
+ motifs: list[str],
+ sample_names: list[str],
+ window_size: int,
+ smooth_window: int | None = None,
+ **kwargs) -> Axes:
+def by_modification(mod_file_name: str | Path,
+ regions: str | Path,
+ motifs: list[str],
+ *args,
+ **kwargs) -> Axes:
+def by_regions(mod_file_name: str | Path,
+ regions_list: list[str | Path | list[str | Path]],
+ motif: str,
+ sample_names: list[str] = None,
+ *args,
+ **kwargs) -> Axes:
+def by_dataset(mod_file_names: list[str | Path],
+ regions: str | Path | list[str | Path],
+ motif: str,
+ sample_names: list[str] = None,
+ *args,
+ **kwargs) -> Axes:
+```
+`plot_enrichment` module for enrichment (e.g. mA/A) bar plot comparisons
+```
+
+def plot_enrichment(mod_file_names: list[str | Path],
+ regions_list: list[str | Path | list[str | Path]],
+ motifs: list[str],
+ sample_names: list[str],
+ **kwargs) -> Axes:
+def by_modification(mod_file_name: str | Path,
+ regions: str | Path | list[str | Path],
+ motifs: list[str],
+ *args,
+ **kwargs) -> Axes:
+def by_regions(mod_file_name: str | Path,
+ regions_list: list[str | Path | list[str | Path]],
+ motif: str,
+ sample_names: list[str] = None,
+ *args,
+ **kwargs) -> Axes:
+def by_dataset(mod_file_names: list[str | Path],
+ regions: str | Path | list[str | Path],
+ motif: str,
+ sample_names: list[str] = None,
+ *args,
+ **kwargs) -> Axes:
+```
+`plot_reads` module for single read plots
+```
+def plot_reads(mod_file_name: str | Path,
+ regions: str | Path | list[str | Path],
+ motifs: list[str],
+ window_size: int = None,
+ sort_by: str | list[str] = 'shuffle',
+ thresh: float = None,
+ relative: bool = True,
+ **kwargs
+ ) -> Axes:
+```
+## Load values from processed files
+
+`load_processed.pileup_counts_from_bedmethyl` for valid/modified counts from a specified region or set of regions
+```
+def pileup_counts_from_bedmethyl(bedmethyl_file: Path,
+ motif: str,
+ regions: str | Path | list[str | Path] = None,
+ ) -> tuple[int, int]:
+```
+`load_processed.pileup_vectors_from_bedmethyl` for valid over modified fraction from a specified region or set of regions
+```
+def pileup_vectors_from_bedmethyl(bedmethyl_file: str | Path,
+ motif: str,
+ regions: str | Path | list[str | Path],
+ window_size: int = None) -> (np.ndarray,np.ndarray):
+```
+`load_processed.read_vectors_from_hdf5` for read-by-basemod lists of valid and modified positions
+```
+def read_vectors_from_hdf5(
+ file: str | Path,
+ motifs: list[str],
+ regions: str | Path | list[str | Path] = None,
+ window_size: int = None,
+ sort_by: str | list[str] = ['chromosome','region_start','read_start'],
+ calculate_mod_fractions: bool = True,
+) -> (list[tuple],list[str],dict):
+```
+
+## Parameters and what they mean
+
+Many of the parsing, loading, and plotting functions share parameters. The common ones and their meanings/defaults are listed below:
+
+`input_file` for parsing functions is a mandatory parameter providing the path to an aligned .bam file with modification calls, a .bam.bai index, and tags following the latest .bam specifications. parse_bam will check whether this .bam file meets the specifications and tell you what to do if it doesn't.
+
+`output_name` for parsing functions is a mandatory string parameter that will be given to the new folder containing the processed outputs.
+
+`ref_genome` for parsing functions is a mandatory parameter providing the path to the reference genome .fasta file to which the `input_file` .bam is aligned.
+
+`output_directory` for parsing functions is an optional parameter specifying a parent directory for your outputs. By default, this will simply be the directory in which you `input_file` resides.
+
+`mod_file_name` and `mod_file_names` for plotting functions are mandatory parameters providing a path to processed/parsed files that are ready for plotting. These paths are returns by the parsing functions but can also be provided manually by the user as a string or Path object. If providing manually, the path should point to a .bed.gz file with an accompanying .bed.gz.tbi index for profile and enrichment plots and to an .h5 file for read plots. `mod_file_name` points to a single file whereas `mod_files_names` is a list of files.
+
+`regions` and `regions_list` are used for specifying subsets of the genome to parse, load, or plot. A `region` is defined as a range of genomic coordinates, and `regions` can refer to any number of `region` specifications. Thus for the `regions` parameter one can pass a single region specified as a string, `chrX:XXX-XXX`, many regions defined in a .bed tab-separated-value file with each line containing at miniimum chromosome, start, and end coordinates (plus optionally a strand, + or -), or a list of strings specifiers or bed files. The entire list will be rolled into a single `regions` set to be passed down for subsequent processing. In the case of regions-wise comparisons in plotting functions, `regions_list` is a *list of regions objects*, where each element of the list is a string, Path, or list of strings or Paths.
+
+`motif` and `motifs` are used to specify what base modifications you are interested in and what their sequence context is for parse, load, and plot functions. A single `motif` is a string containing several canonical bases (using the [IUPAC nucleic acid notation](https://en.wikipedia.org/wiki/Nucleic_acid_notation), e.g. **H** refers to "not a G"), followed by a comma, and then an integer specifying which coordinate in the string is your modified base. For example, 6mA is denoted "A,0" and CpG is denoted "CG,0" whereas GpC *excluding CpGs* is denoted "GCH,1". `motifs` is a list of such strings for functions that can work on multiple base modifications at once.
+
+`thresh` for parsing and some loading/plotting functions refers to a base modification probability threshold, used to transform the the output of most basecalling pipelines into a binary call for any given read position. For parsing pileup calls, this defaults to `None` which allows `modkit` to pick its own threshold based on the data. For other calls, the parameter is mandatory. The normal use is specifying between 0 and 1, but 1-255 is also supported to make the inputs more backwards compatible with old `dimelo` package versions and with examination of the raw .bam file contents. A value between and 255 will simply be converted into a 0-1 probability before being handed down to subsequent processing.
+
+`window_size` for parsing and most loading and plotting functions is a *modification to your regions* that will redefine them to be all the same size, i.e. 2 x window_size, centered around the centers of your original regions. This is important for the parsing and plotting applications that show many genomic regions at once, but should be left blank if you don't want your regions modified. The default is `None` for functions where the parameter is optional.
+
+`cores`, `log`, `cleanup`, `quiet`, and `override_checks` can be ignored for most parsing applications. `cores` allows you to specify that `modkit` uses only a fraction of all the compute resources of your machine, rather than all; `log` will save the modkit logs for troubleshooting, and `cleanup` will keep the (often large) human-readable outputs that are inefficient for plotting and vector extraction but may be helpful for other use cases. `quiet` suppressed progress bars and other outputs and `override_checks` lets you run modkit even if the bam format checking and reference alignment checking are anomalous.
+
+`relative` is a boolean input that specifies whether loading and plotting operations adjust coordinates to be relative to some center point or simple plot in absolute genomic coordinates.
+
+`sort_by` for plot_reads will sort reads by any of `region_start`, `region_end`, `read_name`, `read_start`, `read_end`, `chromosome`, `strand`, and `MOTIF_mod_fraction` for any extracted motif. New sorting options are planned in the future. The default is `shuffle`, which will put the reads in a random order. `sort_by` can be passed as one string or as a list of strings. If a list is passed, the reads will be sorted hierarchically i.e. first by the first list element, then the second, and so on. The exception is that if any of the list elements are `shuffle`, the reads will *first* be shuffled and then sorted by the rest of the elements in order of priority.
+
+`**kwargs` for all plotting functions get passed down to the underlying matplotlib / seaborn plotting functions. See matplotlib and seaborn documentation for more details.
+
+# Known Issues
+## No progress bars
+The most common culprit for progress bar issues in notebooks (Jupyter or Colab) is an incompatibility between your notebooks interfaces and your `ipywidgets` version. The latest jupyter notebooks or jupyter lab install and the latest ipywidgets should work together, but on Google Colab, VS Code, Open On Demand, and other jupyter interfaces this may not be the case. [setup.py](setup.py) contains details on which versions you can try downgrading to for different platforms. The following code run in your activated conda environment will downgrade `ipywidgets` to your specified version. **Our Colab instructions in the [Colab Installation](#google-colab-installation) section and the [tutorial](tutorial.ipynb) already handle this for you.**
+
+```
+pip install ipywidgets==X.XX.X
+```
-Documentation: https://streetslab.github.io/dimelo/
diff --git a/bin/dimelo-parse-bam b/bin/dimelo-parse-bam
deleted file mode 100644
index db4d8fd1..00000000
--- a/bin/dimelo-parse-bam
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/env python
-
-from dimelo.parse_bam import main
-
-# call main(), which takes in command line arguments.
-main()
diff --git a/bin/dimelo-plot-browser b/bin/dimelo-plot-browser
deleted file mode 100644
index ee47afba..00000000
--- a/bin/dimelo-plot-browser
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/env python
-
-from dimelo.plot_browser import main
-
-# call main(), which takes in command line arguments.
-main()
diff --git a/bin/dimelo-plot-enrichment b/bin/dimelo-plot-enrichment
deleted file mode 100644
index 1ce07c13..00000000
--- a/bin/dimelo-plot-enrichment
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/env python
-
-from dimelo.plot_enrichment import main
-
-# call main(), which takes in command line arguments.
-main()
diff --git a/bin/dimelo-plot-enrichment-profile b/bin/dimelo-plot-enrichment-profile
deleted file mode 100644
index 64c7d651..00000000
--- a/bin/dimelo-plot-enrichment-profile
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/env python
-
-from dimelo.plot_enrichment_profile import main
-
-# call main(), which takes in command line arguments.
-main()
diff --git a/bin/dimelo-qc-report b/bin/dimelo-qc-report
deleted file mode 100644
index b7340e00..00000000
--- a/bin/dimelo-qc-report
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/env python
-
-from dimelo.qc import main
-
-# call main(), which takes in command line arguments.
-main()
diff --git a/dimelo/__init__.py b/dimelo/__init__.py
index 79dd5ea9..4906a851 100644
--- a/dimelo/__init__.py
+++ b/dimelo/__init__.py
@@ -1,20 +1,23 @@
-r"""
-===============
-dimelo module
-===============
-.. currentmodule:: dimelo
+from . import (
+ export,
+ load_processed,
+ parse_bam,
+ plot_depth_histogram,
+ plot_depth_profile,
+ plot_enrichment,
+ plot_enrichment_profile,
+ plot_read_browser,
+ plot_reads,
+)
-dimelo allows you to perform quality control and plot modified bases from bam files.
-
-.. automodule:: parse_bam
-.. automodule:: plot_browser
-.. automodule:: plot_enrichment
-.. automodule:: plot_enrichment_profile
-.. automodule:: qc_report
-
-"""
-from .parse_bam import parse_bam
-from .plot_browser import plot_browser
-from .plot_enrichment import plot_enrichment
-from .plot_enrichment_profile import plot_enrichment_profile
-from .qc import qc_report
+__all__ = [
+ "export",
+ "load_processed",
+ "parse_bam",
+ "plot_depth_histogram",
+ "plot_depth_profile",
+ "plot_enrichment",
+ "plot_enrichment_profile",
+ "plot_read_browser",
+ "plot_reads",
+]
diff --git a/dimelo/export.py b/dimelo/export.py
new file mode 100644
index 00000000..e2c973c9
--- /dev/null
+++ b/dimelo/export.py
@@ -0,0 +1,167 @@
+import os
+from collections import deque
+from pathlib import Path
+
+import pyBigWig
+import pysam
+from tqdm.auto import tqdm
+
+from . import load_processed, utils
+
+"""
+This module contains code to export indexed and compressed parse output files to other formats that may be helpful for downstream analysis.
+"""
+
+
+def tail(n, iterable):
+ """
+ Return an iterator over the last n items.
+ Copied from https://docs.python.org/3/library/itertools.html#itertools-recipes
+ """
+
+ # tail(3, 'ABCDEFG') → E F G
+ return iter(deque(iterable, maxlen=n))
+
+
+def pileup_to_bigwig(
+ bedmethyl_file: str | Path,
+ motif: str,
+ bigwig_file: str | Path | None = None,
+ ref_genome: str | Path | None = None,
+ strand: str = ".",
+ chunk_size: int = 1000,
+):
+ """
+ Extract a single motif from a pileup and write its mod fractions by position to a bigwig file.
+
+ This function will take the entire contents of the pileup bedmethyl file and create a bigwig header with all of the same contigs, with
+ contig lengths in the bigwig header set to the highest motif coordinate for each contig. If strand is specified as + or -, only that
+ strand will be written to the output bigwig - this can allow for strand bias analysis in a genome browser. If strand is specified as .,
+ as is the default, both strands will be included.
+
+ The operation can be quite slow for large pileups. The current design is that if you want to create a bigwig for a subset of the genome,
+ you can specify the regions at parsing time, rather than re-implementing the subset handling logic here.
+
+ Args:
+ bedmethyl_file: Path to the input tabix-indexed gzipped bedmethyl file
+ motif: type of modification to extract data for
+ bigwig_file: Path to the output bigwig destination. If unspecified, a pileup.bw file will be created in the bedmethyl file's directory
+ ref_genome: a reference genome to use for constructing the bigwig header, i.e. contig lengths. If None, the bedmethyl file will be used
+ to estimate contig lengths, which can take some time.
+ strand: the DNA strand to extra, + or - for forward or reverse and . for both
+ chunk_size: size for bigwig write chunks, in bedmethyl lines
+ """
+ bedmethyl_file, bigwig_file, ref_genome = utils.sanitize_path_args(
+ bedmethyl_file, bigwig_file, ref_genome
+ )
+
+ # Set up output directories if they don't exist; load up objects for bedmethyl tabix file and motif specifier
+ output_file_path = (
+ bigwig_file
+ if bigwig_file is not None
+ else bedmethyl_file.parent / "pileup.fractions.bigwig"
+ )
+ os.makedirs(output_file_path.parent, exist_ok=True)
+ tabix = pysam.TabixFile(str(bedmethyl_file))
+ parsed_motif = utils.ParsedMotif(motif)
+
+ # Because we need to set up the bigwig header before we start writing data to it, we need to pre-calculate the length of each contig
+ # The header essentially needs to contain a list of the contigs/chromosomes to which the data is aligned, and their sizes.
+ # There may be a way to adjust this as we write a bigwig file, but my testing with pyBigWig suggests that you must set it upfront
+
+ contig_lengths_tuples = []
+ lines_by_contig = {}
+
+ # If we only have a bedmethyl file, we need to go through it to get contig lengths
+ if ref_genome is None:
+ for contig in tqdm(
+ tabix.contigs,
+ desc=f"Step 1: Indexing contigs in {bedmethyl_file.name} to set up bigwig header for {output_file_path.name}",
+ ):
+ # count up the number of rows, for progress tracking, and pull out the last row so as to grab the length of the chromosome
+ # note: the tqdm progress bar slows things down by about 33%, which was deemed better at the time of writing this than
+ # 90 seconds without any status updates
+ rows_count, last_row = list(
+ tail(
+ n=1,
+ iterable=enumerate(
+ tqdm(
+ tabix.fetch(contig),
+ mininterval=1.0,
+ desc=f"Indexing {contig}.",
+ leave=False,
+ )
+ ),
+ )
+ )[0]
+ fields = last_row.split("\t")
+ max_coord = int(fields[2])
+ contig_lengths_tuples.append((contig, max_coord))
+ lines_by_contig[contig] = rows_count
+ # If we have a fasta file we can just reference that for contig lengths
+ else:
+ # Open the reference genome fasta file using pysam
+ with pysam.FastaFile(ref_genome) as fasta:
+ for contig in tqdm(
+ tabix.contigs, # if these are in the wrong order, e.g. the order from the fasta, it is an issue for pyBigWig somehow
+ desc=f"Step 1: Indexing contigs in {ref_genome.name} to set up bigwig header for {output_file_path.name}",
+ ):
+ # Get the length of the contig
+ try:
+ contig_length = fasta.get_reference_length(contig)
+ except Exception as err:
+ raise ValueError(
+ f"Error loading {contig} length from {ref_genome.name}. Are you certain that {bedmethyl_file.name} is aligned to this reference?"
+ ) from err
+ contig_lengths_tuples.append((contig, contig_length))
+ # if we used a fasta to calculate contig lengths we actually don't know the lines per contig
+ lines_by_contig[contig] = None
+
+ with pyBigWig.open(str(output_file_path), "w") as bw:
+ bw.addHeader(contig_lengths_tuples)
+ for contig in tqdm(
+ tabix.contigs,
+ desc=f"Step 2: Writing {bedmethyl_file.name} contents to {output_file_path.name}",
+ ):
+ contig_list = []
+ start_list = []
+ end_list = []
+ values_list = []
+
+ for row in tqdm(
+ tabix.fetch(contig),
+ desc=f"Writing {contig}.",
+ total=lines_by_contig[contig],
+ leave=False,
+ ):
+ keep_basemod, genomic_coord, modified_in_row, valid_in_row = (
+ load_processed.process_pileup_row(
+ row=row,
+ parsed_motif=parsed_motif,
+ region_strand=strand,
+ single_strand=(strand != "."),
+ )
+ )
+ if keep_basemod and valid_in_row > 0:
+ contig_list.append(contig)
+ start_list.append(genomic_coord)
+ end_list.append(genomic_coord + 1)
+ values_list.append(modified_in_row / valid_in_row)
+
+ if len(values_list) > chunk_size:
+ bw.addEntries(
+ contig_list, # Contig names
+ start_list, # Start positions
+ ends=end_list, # End positions
+ values=values_list, # Corresponding values
+ )
+ contig_list = []
+ start_list = []
+ end_list = []
+ values_list = []
+ bw.addEntries(
+ contig_list, # Contig names
+ start_list, # Start positions
+ ends=end_list, # End positions
+ values=values_list, # Corresponding values
+ )
diff --git a/dimelo/load_processed.py b/dimelo/load_processed.py
new file mode 100644
index 00000000..62a4556f
--- /dev/null
+++ b/dimelo/load_processed.py
@@ -0,0 +1,1165 @@
+import concurrent.futures
+import gzip
+import multiprocessing
+from collections import defaultdict
+from functools import partial
+from multiprocessing import shared_memory
+from pathlib import Path
+
+import h5py
+import numpy as np
+import pysam
+from tqdm.auto import tqdm
+
+from . import test_data, utils
+
+# the default chunk size is the number of bp to include per processing chunk in parallelization for loaders.
+# 1e6 was empirically determined to be a good default: smaller than 1e5 we see slowdowns due to increased
+# parallelization overhead, larger than 1e7 we see slowdowns due to worker utilization decreasing because even
+# for whole chromosome processing there aren't always enough chunks to go around. In the 1e5-1e7 range, speed
+# on 32 cores is fairly similar, but sitting in the middle of the range should support 10x more cores (beyond
+# the reasonable upper bound) and 10x fewer cores (which is about the reasonable lower bound).
+DEFAULT_CHUNK_SIZE = 1_000_000
+
+################################################################################################################
+#### Loader wrappers ####
+################################################################################################################
+
+
+def regions_to_list(
+ function_handle,
+ regions,
+ window_size: int | None = None,
+ quiet: bool = True,
+ cores: int | None = None,
+ split_large_regions: bool = False,
+ **kwargs,
+):
+ """
+ User-facing function.
+
+ Run any standard load_processed pileup or extract loader loading each region from the region
+ specifier into a new element of a list.
+
+ Args:
+ function_handle: the loader function you want to run.
+ regions: the region specifier. Typically we expect to get many regions for this function, in the form of a list
+ of strings or bed file paths. regions_to_list will run across all of these one-by-one returning a separate
+ function return for each independent region.
+ window_size: window around centers of regions, defaults to None
+ quiet: disables progress bars
+ cores: CPU cores across which to parallelize processing. Default to None, which means all available.
+ split_large_regions: if True, regions will be run sequentially in parallelized chunks. If False,
+ each individual region's chunks will be run sequentially but there will be parallelization across
+ regions, i.e. each core will be assigned one region at a time by the executor. Set to True if you
+ are running a small number of very large regions (e.g. one or two chromosomes), otherwise to to False (default).
+ **kwargs: all necessary keyword arguments to pass down to the loader
+
+ Returns:
+ List(function_handle return objects per region)
+ """
+ regions_dict = utils.regions_dict_from_input(
+ regions,
+ window_size,
+ )
+
+ # Flatten regions into a list of (chromosome, start, end, strand)
+ region_strings = [
+ f"{chromosome}:{start}-{end},{strand}"
+ for chromosome, region_list in regions_dict.items()
+ for start, end, strand in region_list
+ ]
+
+ cores_to_run = utils.cores_to_run(cores)
+ # quiet and cores logic below is driven by the following:
+ # If the parallelization is within regions:
+ # (1) progress bars should happen within regions if at all, because we assume regions are
+ # large if they make sense to parallelize
+ # (2) the cores_to_run will be allocated to within-region parallelization, and the top-level
+ # jobs sequence is run sequentially
+ with concurrent.futures.ProcessPoolExecutor(
+ max_workers=1 if split_large_regions else cores_to_run
+ ) as executor:
+ # Use functools.partial to pre-fill arguments
+ process_partial = partial(
+ apply_loader_function_to_region,
+ function_handle=function_handle,
+ quiet=quiet or not split_large_regions,
+ cores=cores_to_run
+ if split_large_regions
+ else 1, # if parallelization is within region
+ **kwargs,
+ )
+ results = list(
+ tqdm(
+ executor.map(process_partial, region_strings),
+ total=len(region_strings),
+ desc="Loading data",
+ disable=quiet or split_large_regions,
+ leave=False,
+ )
+ )
+
+ return results
+
+
+def apply_loader_function_to_region(region_string, function_handle, **kwargs):
+ """
+ apply_loader_function_to_region simply exists to convert position arguments into keyword arguments to make executor.map work
+
+ Args:
+ region_string: passed down with regions keyword
+ function_handle: function to call with regions and other kwargs
+ **kwargs: all keyword arguments passed to regions_to_list. These must be sufficient for whichever load_processed function
+ if being referenced by function_handle
+ Returns:
+ function_handle return value
+ """
+ return function_handle(regions=region_string, **kwargs)
+
+
+################################################################################################################
+#### Pileup loaders ####
+################################################################################################################
+
+
+def pileup_counts_from_bedmethyl(
+ bedmethyl_file: str | Path,
+ motif: str,
+ regions: str | Path | list[str | Path],
+ window_size: int | None = None,
+ single_strand: bool = False,
+ quiet: bool = False,
+ cores: int | None = None,
+ chunk_size: int = DEFAULT_CHUNK_SIZE,
+) -> tuple[int, int]:
+ """
+ User-facing function.
+
+ Extract number of modified bases and total number of bases from the given bedmethyl file.
+ Called by plotters or by the user.
+
+ This function loops through all the provided regions and pulls those regions up in the input
+ sorted and indexed bedmethyl file. For rows within those regions, checks that the motif
+ is correct (i.e. sequence context, modified base, mod code, and optionally strand). All
+ correct locations are included in the sum counts that get returned.
+
+ If no regions are specified, returns the sum total for the motif of interest across the
+ entire bedmethyl file.
+
+ TODO: Consider renaming this method, e.g. counts_from_pileup
+
+ Args:
+ bedmethyl_file: Path to bedmethyl file
+ regions: Path to bed file specifying regions
+ motif: type of modification to extract data for
+ window_size: (currently disabled) window around center of region, +-window_size
+ single_strand: True means we only grab counts from reads from the same strand as
+ the region of interest, False means we always grab both strands within the regions
+ quiet: disables progress bars
+ cores: CPU cores across which to parallelize processing. Default to None, which means all available.
+ chunk_size: size of genomic subregions to assign out to each process
+
+ Returns:
+ tuple containing counts of (modified_bases, total_bases)
+ """
+
+ parsed_motif = utils.ParsedMotif(motif)
+
+ regions_dict = utils.regions_dict_from_input(regions, window_size)
+ chunks_list = utils.process_chunks_from_regions_dict(
+ regions_dict, chunk_size=chunk_size
+ )
+
+ cores_to_run = utils.cores_to_run(cores)
+
+ # Initialize shared memory as length-one numpy arrays to make it easy to map to buffer in subprocesses
+ shm_valid = shared_memory.SharedMemory(
+ create=True, size=np.dtype(np.int32).itemsize
+ )
+ shm_modified = shared_memory.SharedMemory(
+ create=True, size=np.dtype(np.int32).itemsize
+ )
+
+ manager = multiprocessing.Manager()
+ lock = manager.Lock()
+
+ with concurrent.futures.ProcessPoolExecutor(max_workers=cores_to_run) as executor:
+ futures = [
+ executor.submit(
+ pileup_counts_process_chunk,
+ bedmethyl_file,
+ parsed_motif,
+ chunk,
+ shm_modified.name,
+ shm_valid.name,
+ lock,
+ single_strand,
+ )
+ for chunk in chunks_list
+ ]
+ for future in tqdm(
+ concurrent.futures.as_completed(futures),
+ total=len(futures),
+ disable=quiet,
+ desc="Loading data",
+ leave=False,
+ ):
+ try:
+ future.result()
+ except Exception as err:
+ raise RuntimeError("pileup_counts_process_chunk failed.") from err
+
+ # Directly convert shared memory buffers to integers
+ modified_base_count = int.from_bytes(
+ shm_modified.buf[:4], byteorder="little", signed=True
+ )
+ valid_base_count = int.from_bytes(
+ shm_valid.buf[:4], byteorder="little", signed=True
+ )
+ # Close and unlink shared memory - not fully handled by garbage collection otherwise
+ shm_modified.close()
+ shm_modified.unlink()
+ shm_valid.close()
+ shm_valid.unlink()
+
+ return modified_base_count, valid_base_count
+
+
+def pileup_vectors_from_bedmethyl(
+ bedmethyl_file: str | Path,
+ motif: str,
+ regions: str | Path | list[str | Path],
+ window_size: int | None = None,
+ single_strand: bool = False,
+ regions_5to3prime: bool = False,
+ quiet: bool = False,
+ cores: int | None = None,
+ chunk_size: int = DEFAULT_CHUNK_SIZE,
+) -> tuple[np.ndarray, np.ndarray]:
+ """
+ User-facing function.
+
+ Extract per-position pileup counts at valid motifs across one or more superimposed regions.
+ Called by profile plotters, can also be used by a user directly.
+
+ Returns two vectors:
+ * Total number of times a modified base in the motif was found at each position
+ * Total number of times the motif was found at each position
+
+ This function loops through all the provided regions and fetches those regions from the
+ bedmethyl file. For rows within those regions, it checks that the motif
+ is correct (i.e. sequence context, modified base, mod code, and optionally strand). It then adds
+ to two vectors (mod and valid). By default all regions are assumed to
+ be the same size (the size of the first region).
+
+ If regions_5to3prime is set to True, then negative strand regions are flipped to that all regions
+ are superimposed along the 5 prime to 3 prime direction, which can be helpful if there is
+ directionality to the signal (e.g. upstream v downstream relative to TSSs, TF binding sites, and so on).
+ A region must be provided because otherwise there is no way to know what vector to return.
+ However, a region can be a whole chromosome if desired.
+
+ TODO: Consider renaming this method, e.g. vectors_from_pileup
+
+ Args:
+ bedmethyl_file: Path to bedmethyl file
+ regions: Path to bed file specifying centered equal-length regions
+ motif: type of modification to extract data for
+ window_size: the extent in either direction for windows around the center of regions.
+ single_strand: True means we only grab counts from reads from the same strand as
+ the region of interest, False means we always grab both strands within the regions
+ regions_5to3prime: True means negative strand regions get flipped, False means no flipping
+ quiet: disables progress bars
+ cores: CPU cores across which to parallelize processing. Default to None, which means all available.
+ chunk_size: size of genomic subregions to assign out to each process
+
+ Returns:
+ tuple containing (modified_base_counts, valid_base_counts)
+ """
+
+ parsed_motif = utils.ParsedMotif(motif)
+
+ regions_dict = utils.regions_dict_from_input(regions, window_size)
+ chunks_list = utils.process_chunks_from_regions_dict(
+ regions_dict, chunk_size=chunk_size
+ )
+
+ cores_to_run = utils.cores_to_run(cores)
+
+ # Peek at a region to figure out what size the vectors should be
+ first_key = next(iter(regions_dict))
+ first_tuple = regions_dict[first_key][0]
+ region_len = first_tuple[1] - first_tuple[0]
+
+ # Initialize shared memory as numpy arrays to make it easy to map to buffer in subprocesses
+ shm_valid = shared_memory.SharedMemory(
+ create=True, size=(region_len) * np.dtype(np.int32).itemsize
+ )
+ shm_modified = shared_memory.SharedMemory(
+ create=True, size=(region_len) * np.dtype(np.int32).itemsize
+ )
+
+ manager = multiprocessing.Manager()
+ lock = manager.Lock()
+
+ with concurrent.futures.ProcessPoolExecutor(max_workers=cores_to_run) as executor:
+ futures = [
+ executor.submit(
+ pileup_vectors_process_chunk,
+ bedmethyl_file,
+ parsed_motif,
+ chunk,
+ region_len,
+ shm_modified.name,
+ shm_valid.name,
+ lock,
+ single_strand,
+ regions_5to3prime,
+ )
+ for chunk in chunks_list
+ ]
+ for future in tqdm(
+ concurrent.futures.as_completed(futures),
+ total=len(futures),
+ disable=quiet,
+ desc="Loading data",
+ leave=False,
+ ):
+ try:
+ future.result()
+ except Exception as err:
+ raise RuntimeError("pileup_vectors_process_chunk failed.") from err
+
+ # We need to convert these shared memory buffers to numpy arrays which
+ # we then copy, so that they no longer reference the shared memory which
+ # will soon be de-allocated
+ modified_base_counts = np.copy(
+ np.ndarray((region_len,), dtype=np.int32, buffer=shm_modified.buf)
+ )
+ valid_base_counts = np.copy(
+ np.ndarray((region_len,), dtype=np.int32, buffer=shm_valid.buf)
+ )
+ # Close and unlink shared memory - not fully handled by garbage collection otherwise
+ shm_modified.close()
+ shm_modified.unlink()
+ shm_valid.close()
+ shm_valid.unlink()
+
+ return modified_base_counts, valid_base_counts
+
+
+def counts_from_fake(*args, **kwargs) -> tuple[int, int]:
+ """
+ Test helper function.
+
+ Generates a fake set of enrichment counts. Ignores all arguments.
+
+ Returns:
+ tuple containing counts of (modified_bases, total_bases)
+ """
+ window_halfsize = 500
+ return test_data.fake_peak_enrichment(halfsize=window_halfsize, peak_height=0.15)
+
+
+def vector_from_fake(window_size: int, *args, **kwargs) -> np.ndarray:
+ """
+ Test helper function.
+
+ Generates a fake peak trace. Ignores all arguments except window_size.
+
+ Args:
+ window_size: halfsize of the window; how far the window stretches on either side of the center point
+
+ Returns:
+ vector of fraction modified bases calculated for each position; float values between 0 and 1
+ """
+ return test_data.fake_peak_enrichment_profile(
+ halfsize=window_size, peak_height=0.15
+ )
+
+
+def pileup_vectors_process_chunk(
+ bedmethyl_file,
+ parsed_motif,
+ chunk,
+ region_len,
+ shm_name_modified,
+ shm_name_valid,
+ lock,
+ single_strand,
+ regions_5to3prime,
+) -> None:
+ """
+ Helper function to allow pileup_vectors_from_bedmethyl to operate in a parallized fashion.
+
+ Sum up modified and valid counts for a subregion chunk in a bedmethyl file.
+
+ Args:
+ bedmethyl_file: Path to bedmethyl file
+ parsed_motif: ParsedMotif object
+ chunk: a dict containing subregion chunk information
+ shm_name_modified: the name string for the shared memory location containing the modified counts array
+ shm_name_valid: the name string for the shared memory location containing the valid counts array
+ lock: a manager.Lock object to allow synchronization in accessing shared memory
+ single_strand: True if only single-strand mods are desired
+ regions_5to3prime: True means negative strand regions get flipped, False means no flipping
+
+ Returns:
+ None. Counts are added to arrays in-place to shared memory.
+ """
+ source_tabix = pysam.TabixFile(str(bedmethyl_file))
+ existing_valid = shared_memory.SharedMemory(name=shm_name_valid)
+ existing_modified = shared_memory.SharedMemory(name=shm_name_modified)
+ valid_base_counts = np.ndarray(
+ (region_len,), dtype=np.int32, buffer=existing_valid.buf
+ )
+ modified_base_counts = np.ndarray(
+ (region_len,), dtype=np.int32, buffer=existing_modified.buf
+ )
+
+ chromosome = chunk["chromosome"]
+ region_start = chunk["region_start"]
+ region_end = chunk["region_end"]
+ subregion_start = chunk["subregion_start"]
+ subregion_end = chunk["subregion_end"]
+ strand = chunk["strand"]
+
+ flip_coords = regions_5to3prime and strand == "-"
+
+ if flip_coords:
+ subregion_offset = region_end - subregion_end
+ else:
+ subregion_offset = subregion_start - region_start
+
+ if region_end - region_start > region_len:
+ print(
+ f"WARNING: You have specified a region at {chromosome}:{region_start}-{region_end} that is longer than the first region; the end of the region will be skipped. To make a profile plot with differently-sized region, consider using the window_size parameter to make a profile across centered windows."
+ )
+
+ valid_base_subregion = np.zeros(subregion_end - subregion_start, dtype=int)
+ modified_base_subregion = np.zeros(subregion_end - subregion_start, dtype=int)
+
+ # tabix throws and error if the contig is not present
+ # by the current design, this should be silent
+ if chromosome in source_tabix.contigs:
+ for row in source_tabix.fetch(
+ chromosome, max(subregion_start, 0), subregion_end
+ ):
+ (
+ keep_basemod,
+ genomic_coord,
+ modified_in_row,
+ valid_in_row,
+ ) = process_pileup_row(
+ row=row,
+ parsed_motif=parsed_motif,
+ region_strand=strand,
+ single_strand=single_strand,
+ )
+ if keep_basemod:
+ if flip_coords:
+ # We want to flip the coordinates for this region so that it is recorded along the 5 prime to 3 prime direction
+ # This will enable analyses where the orientation of protein binding / transcriptional dynamics / etc is relevant for our pileup signal
+ pileup_coord_in_subregion = subregion_end - genomic_coord - 1
+ else:
+ # Normal coordinates are the default. This will be used both for the '+' case and the '.' (no strand specified) case
+ pileup_coord_in_subregion = genomic_coord - subregion_start
+ if pileup_coord_in_subregion < (subregion_end - subregion_start):
+ valid_base_subregion[pileup_coord_in_subregion] += valid_in_row
+ modified_base_subregion[pileup_coord_in_subregion] += (
+ modified_in_row
+ )
+
+ with lock:
+ valid_base_counts[
+ subregion_offset : subregion_offset + abs(subregion_end - subregion_start)
+ ] += valid_base_subregion
+ modified_base_counts[
+ subregion_offset : subregion_offset + abs(subregion_end - subregion_start)
+ ] += modified_base_subregion
+ # Close the file descriptor/handle to the shared memory
+ existing_modified.close()
+ existing_valid.close()
+
+
+def pileup_counts_process_chunk(
+ bedmethyl_file,
+ parsed_motif,
+ chunk,
+ shm_name_modified,
+ shm_name_valid,
+ lock,
+ single_strand,
+) -> None:
+ """
+ Helper function to allow pileup_counts_from_bedmethyl to operate in a parallized fashion.
+
+ Sum up modified and valid counts for a subregion chunk in a bedmethyl file.
+
+ Args:
+ bedmethyl_file: Path to bedmethyl file
+ parsed_motif: ParsedMotif object
+ chunk: a dict containing subregion chunk information
+ shm_name_modified: the name string for the shared memory location containing the modified counts sum
+ shm_name_valid: the name string for the shared memory location containing the valid counts sum
+ lock: a manager.Lock object to allow synchronization in accessing shared memory
+ single_strand: True if only single-strand mods are desired
+
+ Returns:
+ None. Counts are added in-place to shared memory.
+ """
+ source_tabix = pysam.TabixFile(str(bedmethyl_file))
+ existing_valid = shared_memory.SharedMemory(name=shm_name_valid)
+ existing_modified = shared_memory.SharedMemory(name=shm_name_modified)
+ valid_base_counts = np.ndarray((1,), dtype=np.int32, buffer=existing_valid.buf)
+ modified_base_counts = np.ndarray(
+ (1,), dtype=np.int32, buffer=existing_modified.buf
+ )
+
+ chromosome = chunk["chromosome"]
+ subregion_start = chunk["subregion_start"]
+ subregion_end = chunk["subregion_end"]
+ strand = chunk["strand"]
+
+ valid_base_subregion_counts = 0
+ modified_base_subregion_counts = 0
+
+ # tabix throws and error if the contig is not present
+ # by the current design, this should be silent
+ if chromosome in source_tabix.contigs:
+ for row in source_tabix.fetch(
+ chromosome, max(subregion_start, 0), subregion_end
+ ):
+ (
+ keep_basemod,
+ _,
+ modified_in_row,
+ valid_in_row,
+ ) = process_pileup_row(
+ row=row,
+ parsed_motif=parsed_motif,
+ region_strand=strand,
+ single_strand=single_strand,
+ )
+ if keep_basemod:
+ valid_base_subregion_counts += valid_in_row
+ modified_base_subregion_counts += modified_in_row
+
+ with lock:
+ valid_base_counts[0] += valid_base_subregion_counts
+ modified_base_counts[0] += modified_base_subregion_counts
+
+ # Close the file descriptor/handle to the shared memory
+ existing_valid.close()
+ existing_modified.close()
+
+
+def process_pileup_row(
+ row: str,
+ parsed_motif: utils.ParsedMotif,
+ region_strand: str,
+ single_strand: bool = False,
+) -> tuple[bool, int, int, int]:
+ """
+ Helper function designed for pileup_counts_from_bedmethyl via pileup_counts_process_chunk, pileup_vectors_from_bedmethyl
+ via pileup_vectors_process_chunk, and export.pileup_to_bigwig; changes to logic here may impact some or all of
+ these.
+
+ Process a row from a pileup, determining whether the basemod is relevant and passing back its coordinate,
+ modification count, and valid read count.
+
+ Args:
+ row: a string row from a bedmethyl file
+ parsed_motif: a ParsedMotif object
+ region_strand: the strand from the query region
+ single_strand: True if only mods on the region_strand are to be kept
+
+ Returns: keep_basemod, genomic_coord, modified_in_row, valid_in_row. Values are provided even if keep_basemod is False.
+ """
+ tabix_fields = row.split("\t")
+ pileup_basemod = tabix_fields[3]
+ pileup_strand = tabix_fields[5]
+
+ if single_strand and pileup_strand.strip() != region_strand:
+ # We are on the wrong strand, can't keep this position
+ keep_basemod = False
+ elif len(pileup_basemod.split(",")) == 3:
+ pileup_modname, pileup_motif, pileup_mod_coord = pileup_basemod.split(",")
+ if (
+ pileup_motif == parsed_motif.motif_seq
+ and int(pileup_mod_coord) == parsed_motif.modified_pos
+ and pileup_modname in parsed_motif.mod_codes
+ ):
+ keep_basemod = True
+ else:
+ keep_basemod = False
+ elif len(pileup_basemod.split(",")) == 1:
+ keep_basemod = pileup_basemod in parsed_motif.mod_codes
+ else:
+ raise ValueError(
+ f"Unexpected format in bedmethyl file: {row} contains {pileup_basemod} which cannot be parsed."
+ )
+
+ pileup_info = tabix_fields[9].split(" ")
+ genomic_coord = int(tabix_fields[1])
+ valid_in_row = int(pileup_info[0])
+ modified_in_row = int(pileup_info[2])
+
+ return (
+ keep_basemod,
+ genomic_coord,
+ modified_in_row,
+ valid_in_row,
+ )
+
+
+################################################################################################################
+#### Single read loaders ####
+################################################################################################################
+
+
+def read_vectors_from_hdf5(
+ file: str | Path,
+ motifs: list[str],
+ regions: str | Path | list[str | Path] | None = None,
+ window_size: int | None = None,
+ single_strand: bool = False,
+ sort_by: str | list[str] = ["chromosome", "region_start", "read_start"],
+ calculate_mod_fractions: bool = True,
+ quiet: bool = True, # currently unused; change to default False when pbars are implemented
+ cores: int | None = None, # currently unused
+ subset_parameters: dict | None = None,
+) -> tuple[list[tuple], list[str], dict | None]:
+ """
+ User-facing function.
+
+ Pulls a list of read data out of an .h5 file containing processed read vectors, formatted
+ for read-by-read vector processing downstream use cases.
+
+ The flow of operation here is we load up the h5 file then loop through our regions and pick
+ out reads corresponding to our criteria. Criteria include chromosome, read starts and ends
+ (compared to region starts and ends), motif, and strand (if single_strand is True). The indices
+ for the desired reads are identified region-by-region, then all the reads for the region (or
+ the whole h5, if no region is passed) are loaded using the process_data function and put into
+ a list. The bytes are then decoded for the array entries, which are manually compressed because
+ h5py wasn't behaving.
+
+ There's some adjustment for the raw probability (no thresh) to match modkit extract outputs.
+ Specifically, the 0-255 8bit int has 0.5 added before dividing by 256, giving mod qualities
+ between 0.001953 and 0.99805 for bases in valid motifs. (Invalid positions have zeros.)
+
+ After this processing, we calculate modification fractions, sort, and return.
+
+ TODO: Implement progress bars and parallelization as with pileup loaders
+
+ Args:
+ file: Path to an hdf5 (.h5) file containing modification data for single reads,
+ stored in datasets read_name, chromosome, read_start,
+ read_end, base modification motif, mod_vector, and val_vector.
+ regions: Single or list of Path objects or strings. Path objects must point to .bed
+ files, strings can be .bed paths or region string in the format chrX:XXX-XXX.
+ All should all be regions for which your original .bam file had reads extracted,
+ although by design this method will not raise an error if any region contains
+ zero reads, as this may simply be a matter of low read depth.
+ If no regions are specified, the entire .h5 file will be returned. This may cause
+ memory issues.
+ motifs: types of modification to extract data for. Motifs are specified as
+ {DNA_sequence},{position_of_modification}. For example, a methylated adenine is specified
+ as 'A,0' and CpG methylation is specified as 'CG,0'.
+ single_strand: True means we only grab counts from reads from the same strand as
+ the region of interest, False means we always grab both strands within the regions
+ window_size: An optional parameter for creating centered windows for the provided regions.
+ If provided, all regions will be adjusted to be the same size and centered. If not provided,
+ all regions should already be the same size, or there should be only one.
+ sort_by: Read properties by which to sort, either one string or a list of strings. Options
+ include chromosome, region_start, region_end, read_start, read_end, and motif. More to
+ be added in future.
+ quiet: silences progress bars (currently unused)
+ cores: cores across which to parallelize processes (currently unused)
+ subset_parameters: Parameters to pass to the utils.random_sample() method, to subset the
+ reads to be returned. If not None, at least one of n or frac must be provided. The array
+ parameter should not be provided here.
+
+ Returns:
+ a list of tuples, each tuple containing all datasets corresponding to an individual read that
+ was within the specified regions.
+ a list of strings, naming the datasets returned.
+ a regions_dict, containing lists of (region_start,region_end) coordinates by chromosome/contig.
+
+ TODO: The way the subsetting is implemented is confusing, in that you need to pass all but one of
+ the available parameters.
+ """
+ with h5py.File(file, "r") as h5:
+ datasets: list[str] = [
+ name for name, obj in h5.items() if isinstance(obj, h5py.Dataset)
+ ]
+ if "threshold" in h5:
+ # we are looking at an .h5 file with the new, much better compressed format that does
+ # not know the data type intrinsically for mod and val vectors, so we must check
+ readwise_datasets = [
+ dataset for dataset in datasets if dataset not in ["threshold"]
+ ]
+ compressed_binary_datasets = ["mod_vector", "val_vector"]
+ threshold_applied_to_h5 = h5["threshold"][()]
+ binarized = not np.isnan(threshold_applied_to_h5)
+ else:
+ # backwards compatible with the old h5 file structure
+ # If we remove backwards compatibility, beta test (Feb 2024) h5 extractions will not run
+ readwise_datasets = datasets
+ compressed_binary_datasets = []
+ binarized = True # in this case all this will do is make it so we don't apply a +1/512 correction to the mod_vector
+
+ # Pre-load metadata so we can identify reads to pull from file
+ read_chromosomes = np.array(h5["chromosome"], dtype=str)
+ read_starts = np.array(h5["read_start"])
+ read_ends = np.array(h5["read_end"])
+ read_motifs = np.array(h5["motif"], dtype=str)
+ ref_strands = np.array(h5["strand"], dtype=str)
+
+ # Identify reads to load, then load them
+ if regions is not None:
+ regions_dict = utils.regions_dict_from_input(
+ regions=regions,
+ window_size=window_size,
+ )
+ read_tuples_raw = []
+ for chrom, region_list in regions_dict.items():
+ for region_start, region_end, region_strand in region_list:
+ # Find the read indices that we want to load
+ # TODO: consider building this up and then loading all at the end, chunked
+ # TODO: consolidate logic into clear variables
+ relevant_read_indices = np.flatnonzero(
+ (read_ends > region_start)
+ & (read_starts < region_end)
+ & np.isin(read_motifs, motifs)
+ & (read_chromosomes == chrom)
+ & (
+ (not single_strand)
+ | (region_strand not in ["+", "-"])
+ | (ref_strands == region_strand)
+ )
+ )
+ if subset_parameters is not None:
+ relevant_read_indices = np.sort(
+ utils.random_sample(
+ relevant_read_indices, **subset_parameters
+ )
+ )
+ read_tuples_raw += list(
+ zip(
+ *(
+ retrieve_h5_data(
+ h5=h5,
+ dataset=dataset,
+ indices=relevant_read_indices,
+ compressed=dataset in compressed_binary_datasets,
+ dtype=np.uint8,
+ decompressor=gzip.decompress,
+ binarized=binarized,
+ )
+ for dataset in readwise_datasets
+ ),
+ [region_start for _ in relevant_read_indices],
+ [region_end for _ in relevant_read_indices],
+ [region_strand for _ in relevant_read_indices],
+ )
+ )
+ else:
+ regions_dict = None
+ relevant_read_indices = np.flatnonzero(np.isin(read_motifs, motifs))
+ if subset_parameters is not None:
+ relevant_read_indices = np.sort(
+ utils.random_sample(relevant_read_indices, **subset_parameters)
+ )
+ read_tuples_raw = list(
+ zip(
+ *(
+ retrieve_h5_data(
+ h5=h5,
+ dataset=dataset,
+ indices=relevant_read_indices,
+ compressed=dataset in compressed_binary_datasets,
+ dtype=np.uint8,
+ decompressor=gzip.decompress,
+ binarized=binarized,
+ )
+ for dataset in readwise_datasets
+ ),
+ [-1 for _ in relevant_read_indices],
+ [-1 for _ in relevant_read_indices],
+ ["." for _ in relevant_read_indices],
+ )
+ )
+ # We add region information (start, end, and strand; chromosome is already present!)
+ # so that it is possible to sort by and process based on these
+ readwise_datasets += ["region_start", "region_end", "region_strand"]
+
+ # This is sanitizing the dataset entries and adjusting prob values if needed
+ if binarized:
+ read_tuples_processed = [
+ convert_bytes_to_strings(tup) for tup in read_tuples_raw
+ ]
+ else:
+ read_tuples_processed = [
+ adjust_mod_probs_in_tuples(
+ convert_bytes_to_strings(tup),
+ readwise_datasets.index("mod_vector"),
+ readwise_datasets.index("val_vector"),
+ )
+ for tup in read_tuples_raw
+ ]
+
+ if calculate_mod_fractions:
+ # Add the MOTIF_mod_fraction entries to the readwise_datasets list for future reference in sorting
+ readwise_datasets += [f"{motif}_mod_fraction" for motif in motifs]
+ # dict[read_name][motif]=modified fraction of motif in read, float
+ mod_fractions_by_read_name_by_motif: defaultdict[
+ str, defaultdict[str, float]
+ ] = defaultdict(lambda: defaultdict(lambda: 0.0))
+ for motif in motifs:
+ for read_tuple in read_tuples_processed:
+ if read_tuple[readwise_datasets.index("motif")] == motif:
+ mod_sum = np.sum(read_tuple[readwise_datasets.index("mod_vector")])
+ val_sum = np.sum(read_tuple[readwise_datasets.index("val_vector")])
+ mod_fraction = mod_sum / val_sum if val_sum > 0 else 0
+ mod_fractions_by_read_name_by_motif[
+ read_tuple[readwise_datasets.index("read_name")]
+ ][motif] = mod_fraction
+
+ read_tuples_all = []
+ for read_tuple in read_tuples_processed:
+ read_tuples_all.append(
+ tuple(val for val in read_tuple)
+ + tuple(
+ mod_frac
+ for mod_frac in mod_fractions_by_read_name_by_motif[
+ read_tuple[readwise_datasets.index("read_name")]
+ ].values()
+ )
+ )
+ else:
+ read_tuples_all = read_tuples_processed
+
+ ## Sort the reads
+
+ # Enforce that sort_by is a list
+ if not isinstance(sort_by, list):
+ sort_by = [sort_by]
+
+ # If 'shuffle' appears anywhere in sort_by, we first shuffle the list
+ if "shuffle" in sort_by:
+ utils.rng.shuffle(read_tuples_all)
+
+ try:
+ sort_by_indices = [
+ readwise_datasets.index(sort_item)
+ for sort_item in sort_by
+ if sort_item != "shuffle"
+ ]
+ except ValueError as e:
+ raise ValueError(
+ f"Sorting error. {e}. Datasets include {readwise_datasets}. If you need mod fraction sorting make sure you are not setting calculate_read_fraction to False."
+ ) from e
+
+ if len(sort_by_indices) > 0:
+ sorted_read_tuples = sorted(
+ read_tuples_all, key=lambda x: tuple(x[index] for index in sort_by_indices)
+ )
+ else:
+ sorted_read_tuples = read_tuples_all
+
+ return sorted_read_tuples, readwise_datasets, regions_dict
+
+
+def readwise_binary_modification_arrays(
+ file: str | Path,
+ motifs: list[str],
+ regions: str | Path | list[str | Path],
+ window_size: int | None = None,
+ regions_5to3prime: bool = False,
+ single_strand: bool = False,
+ sort_by: str | list[str] = ["chromosome", "region_start", "read_start"],
+ thresh: float | None = None,
+ relative: bool = True,
+ quiet: bool = True, # currently unused; change to default False when pbars are implemented
+ cores: int | None = None, # currently unused
+ subset_parameters: dict | None = None,
+) -> tuple[list[np.ndarray], np.ndarray[int], np.ndarray[str], dict | None]:
+ """
+ Primarily designed as a helper function for single-read plotting, but can be used by a user.
+
+ Pulls a list of read data out of a file containing processed read vectors, formatted with
+ seaborn plotting in mind. Currently we only support .h5 files.
+
+ After running read_vectors_from_hdf5, this function takes the baton to convert the names of
+ the sorted reads into integer indices, then goes through the reads and strips down the mod
+ vectors to simply a list of modified positions (applying a threshold if one has not already
+ been applied). Mod positions are by default expressed relative to the region from which
+ the read was identified, allowing for nice plotting, but can also be expressed in absolute
+ coordinates. If positions are relative, regions_5to3prime can be used to show all regions
+ as upstream-to-downstream along their respective strands.
+
+ TODO: Implement progress bars and parallelization as with pileup loaders
+
+ Args:
+ file: Path to an hdf5 (.h5) file containing modification data for single reads,
+ stored in datasets read_name, chromosome, read_start,
+ read_end, base modification motif, mod_vector, and val_vector.
+ regions: Single or list of Path objects or strings. Path objects must point to .bed
+ files, strings can be .bed paths or region string in the format chrX:XXX-XXX.
+ All should all be regions for which your original .bam file had reads extracted,
+ although by design this method will not raise an error if any region contains
+ zero reads, as this may simply be a matter of low read depth.
+ motifs: types of modification to extract data for. Motifs are specified as
+ {DNA_sequence},{position_of_modification}. For example, a methylated adenine is specified
+ as 'A,0' and CpG methylation is specified as 'CG,0'.
+ window_size: An optional parameter for creating centered windows for the provided regions.
+ If provided, all regions will be adjusted to be the same size and centered. If not provided,
+ all regions should already be the same size, or there should be only one.
+ single_strand: True means we only grab counts from reads from the same strand as
+ the region of interest, False means we always grab both strands within the regions
+ regions_5to3prime: True means negative strand regions get flipped, False means no flipping
+ sort_by: Read properties by which to sort, either one string or a list of strings. Options
+ include chromosome, region_start, region_end, read_start, read_end, and motif. More to
+ be added in future.
+ thresh: A modification calling threshold. If the .h5 is already modification-called, this does
+ nothing. If the .h5 files is not modification-called, i.e. its modification data is saved
+ as floating point array, thresh must be provided to have valid binary outputs.
+ relative: If True, modification coordinates are specified relative to their respective regions
+ in the genomes, centered at the center of the region. If False, absolute coordinates are provided.
+ There is not currently a check for all reads being on the same chromosome if relative=False, but
+ this could create unexpected behaviour for a the standard visualizations.
+ quiet: silences progress bars (currently unused)
+ cores: cores across which to parallelize processes (currently unused)
+ subset_parameters: Parameters to pass to the utils.random_sample() method, to subset the
+ reads to be returned. If not None, at least one of n or frac must be provided. The array
+ parameter should not be provided here.
+
+ Returns:
+ Returns a tuple of three arrays, of length (N_READS * len(mod_names)), and a dict of regions.
+ The arrays contain the following:
+ * positions at which the specified modification was found in a read, after a binary call
+ * unique integer ID for the read for each modification position. These integers are ordered
+ based on the specified sorting.
+ * modification represented by the positions, in the motif format
+ The regions_dict contains the following:
+ * keys: chromosomes/contigs
+ * values: lists of tuples in the format (region_start,region_end)
+ For example, if called on a dataset with a single read and two modification types, each array would have two entries. The unique IDs would be the same, as both entries would represent the same single read. The mods and positions would be different, as they would extact different mods.
+ """
+ file = Path(file)
+ if file.suffix == ".h5" or file.suffix == ".hdf5":
+ sorted_read_data_converted, datasets, regions_dict = read_vectors_from_hdf5(
+ file=file,
+ motifs=motifs,
+ regions=regions,
+ window_size=window_size,
+ single_strand=single_strand,
+ sort_by=sort_by,
+ )
+ read_name_index = datasets.index("read_name")
+ mod_vector_index = datasets.index("mod_vector")
+ motif_index = datasets.index("motif")
+ region_start_index = datasets.index("region_start")
+ region_end_index = datasets.index("region_end")
+ read_start_index = datasets.index("read_start")
+ region_strand_index = datasets.index("region_strand")
+
+ # Check this .h5 file was created with a threshold, i.e. that the mod calls are binarized
+ if thresh is None:
+ if not (sorted_read_data_converted[0][mod_vector_index].dtype == np.bool_):
+ raise ValueError(
+ "No threshold has been applied to this .h5 single read data. You must provide a threshold using the thresh parameter in order to extract binarized modification arrays."
+ )
+ else:
+ thresh = utils.adjust_threshold(thresh)
+
+ read_ints_list = []
+ mod_coords_list = []
+ motifs_list = []
+
+ read_names = np.array(
+ [read_data[read_name_index] for read_data in sorted_read_data_converted]
+ )
+ # TODO: handle the case where a read shows up in more than one different region
+ _, unique_first_indices = np.unique(read_names, return_index=True)
+ unique_in_order = read_names[np.sort(unique_first_indices)]
+ string_to_int = {
+ read_name: index for index, read_name in enumerate(unique_in_order)
+ }
+ read_ints = np.array([string_to_int[read_name] for read_name in read_names])
+
+ for read_int, read_data in zip(read_ints, sorted_read_data_converted):
+ if thresh is None:
+ mod_pos_in_read = np.flatnonzero(read_data[mod_vector_index])
+ else:
+ mod_pos_in_read = np.flatnonzero(read_data[mod_vector_index] > thresh)
+
+ if relative:
+ if regions_5to3prime and read_data[region_strand_index] == "-":
+ # Here we want to show the regions each along their 5 prime to 3 prime direction
+ # This means that negative strand regions need to be flipped
+ mod_pos_record = -(
+ mod_pos_in_read
+ + read_data[read_start_index]
+ - (read_data[region_start_index] + read_data[region_end_index])
+ // 2
+ )
+ else:
+ # This is the default case: just make the coordinates relative using
+ # the reference genome coordinate system. Normal, easy, chill, nice.
+ mod_pos_record = (
+ mod_pos_in_read
+ + read_data[read_start_index]
+ - (read_data[region_start_index] + read_data[region_end_index])
+ // 2
+ )
+ else:
+ # If we aren't using relative coordinates, then I think the 5prime to 3prime argument
+ # can just be ignored, and I think it's nicer if that's silent - less clutter in the output
+ # Basically if you are keeping different regions separate using other metadata (such as
+ # just keeping their actual real genomic coordinates) it is superfluous to do the 5to3 flip.
+ mod_pos_record = mod_pos_in_read + read_data[read_start_index]
+
+ mod_coords_list += list(mod_pos_record)
+ read_ints_list += [read_int] * len(mod_pos_record)
+ motifs_list += [read_data[motif_index]] * len(mod_pos_record)
+
+ return (
+ np.array(mod_coords_list),
+ np.array(read_ints_list),
+ np.array(motifs_list),
+ regions_dict,
+ )
+
+ else:
+ raise ValueError(
+ f"File {file} does not have a recognized extension for single read data."
+ )
+
+
+""" TEMPORARY STUB VARS """
+STUB_HALFSIZE = 500
+STUB_N_READS = 500
+
+
+def reads_from_fake(
+ file: Path, regions: Path, motifs: list[str]
+) -> tuple[list[np.ndarray], np.ndarray[int], np.ndarray[str], dict]:
+ """
+ Helper function to support testing.
+
+ TODO: What does the bed file represent in this method? This one is breaking my brain a bit.
+ TODO: Variable names in this method stink.
+ TODO: Currently assumes mod calling (thresholding probabilities) was already performed elsewhere
+
+ Args:
+ file: Path to file containing modification data for single reads
+ bed_file: Path to bed file specifying regions (WHAT DO THESE REPRESENT???)
+ mod_names: types of modification to extract data for
+
+ Returns:
+ Returns three parallel arrays, of length (N_READS * len(mod_names)), containing the following for each index:
+ * array of positions at which the specified modification was found in a read
+ * unique integer ID for the read
+ * modification represented by the positions
+ For example, if called on a dataset with a single read and two modification types, each array would have two entries. The unique IDs would be the same, as both entries would represent the same single read. The mods and positions would be different, as they would extact different mods.
+ """
+ reads = []
+ read_names = []
+ mods = []
+ for mod_name in motifs:
+ match mod_name:
+ case "A,0":
+ mod_reads = [
+ test_data.fake_read_mod_positions(STUB_HALFSIZE, "peak", 0.7)
+ for _ in range(STUB_N_READS)
+ ]
+ case "CG,0":
+ mod_reads = [
+ test_data.fake_read_mod_positions(
+ STUB_HALFSIZE, "inverse_peak", 0.4
+ )
+ for _ in range(STUB_N_READS)
+ ]
+ case _:
+ raise ValueError(f"No stub settings for requested mod {mod_name}")
+ reads += mod_reads
+ read_names.append(np.arange(len(mod_reads)))
+ mods.append([mod_name] * len(mod_reads))
+
+ read_names = np.concatenate(read_names)
+ mods = np.concatenate(mods)
+ return reads, read_names, mods, {}
+
+
+# def convert_bytes(item):
+# """Convert bytes to string if item is bytes, otherwise return as is."""
+# if isinstance(item, bytes):
+# return item.decode()
+# return item
+
+
+def convert_bytes_to_strings(tup):
+ """
+ Helper function for single read loading.
+ Convert all bytes elements in a tuple to strings.
+ """
+ return tuple(item.decode() if isinstance(item, bytes) else item for item in tup)
+ # tuple(convert_bytes(item) for item in tup)
+
+
+def adjust_mod_probs_in_arrays(mod_array, val_array):
+ """
+ Helper function to correct for an idiosyncracy in modkit single-read parsing wherein 0-255
+ "mod quality" values are parsed as floating-point values from 1/512 to 511/512.
+ """
+ mod_array[np.flatnonzero(val_array)] += 1 / 512
+ return mod_array
+
+
+def adjust_mod_probs_in_tuples(tup, mod_idx, val_idx):
+ """
+ Helper function to apply mod prob adjustments
+ """
+ return tuple(
+ item if index != mod_idx else adjust_mod_probs_in_arrays(item, tup[val_idx])
+ for index, item in enumerate(tup)
+ )
+
+
+def binary_to_np_array(compressed_bytes, dtype, decompressor, binarized, int8tofloat):
+ """
+ Helper function to decompress binary data to boolean or floating point arrays
+ """
+ if binarized:
+ return np.frombuffer(decompressor(compressed_bytes), dtype=dtype).astype(bool)
+ elif int8tofloat:
+ return (
+ (np.frombuffer(decompressor(compressed_bytes), dtype=dtype).astype(float))
+ / 256
+ ).astype(np.float16)
+ else:
+ return np.frombuffer(decompressor(compressed_bytes), dtype=dtype).astype(int)
+
+
+def retrieve_h5_data(h5, dataset, indices, compressed, dtype, decompressor, binarized):
+ """
+ Load the requested dataset from the h5 file at the relevant indices.
+
+ For compressed vector data, decompress each dataset element to numpy array.
+ """
+ if compressed:
+ # Determine if int8tofloat should be applied
+ int8tofloat = "mod_vector" in dataset
+ # Logic for compressed data
+ loaded_uint8_list = h5[dataset][list(indices)]
+ return [
+ binary_to_np_array(
+ loaded_uint8.tobytes(), dtype, decompressor, binarized, int8tofloat
+ )
+ for loaded_uint8 in loaded_uint8_list
+ ]
+ else:
+ # Logic for non-compressed data
+ return h5[dataset][list(indices)]
diff --git a/dimelo/parse_bam.py b/dimelo/parse_bam.py
index a2f0b897..88a5310f 100644
--- a/dimelo/parse_bam.py
+++ b/dimelo/parse_bam.py
@@ -1,907 +1,1162 @@
-r"""
-=================
-parse_bam module
-=================
-.. currentmodule:: dimelo.parse_bam
-.. autosummary::
- parse_bam
+import gzip
+import multiprocessing
+import os
+import subprocess
+from collections import defaultdict
+from pathlib import Path
-parse_bam allows you to summarize modification calls in a sql database
+import h5py
+import numpy as np
+import pysam
+from tqdm.auto import tqdm
+from . import run_modkit, utils
+
+"""
+This module contains code to convert .bam files into both human-readable and
+indexed random-access pileup and read-wise processed outputs.
"""
+"""
+Global variables
+"""
-import argparse
-import multiprocessing
-import os
-import sqlite3
-from typing import List, Tuple, Union
+# Specifies how many reads to check for the base modifications of interest.
+NUM_READS_TO_CHECK = 100
-import numpy as np
-import pandas as pd
-import pysam
-from joblib import Parallel, delayed
-from tqdm import tqdm
-
-from dimelo.utils import clear_db, create_sql_table, execute_sql_command
-
-DEFAULT_BASEMOD = "A+CG"
-DEFAULT_THRESH_A = 129
-DEFAULT_THRESH_C = 129
-DEFAULT_WINDOW_SIZE = 1000
-
-
-class Region(object):
- def __init__(self, region: Union[str, pd.Series]):
- """Represents a region of genetic data.
- Attributes:
- - chromosome: string name of the chromosome to which the region applies
- - begin: integer start position of region
- - end: integer end position of region
- - size: length of region
- - string: string representation of region
- - strand: string specifying forward or reverse strand; either "+" or "-" (default +)
- """
- self.chromosome = None
- self.begin = None
- self.end = None
- self.size = None
- self.string = None
- self.strand = "+"
-
- if isinstance(region, str): # ":" in region:
- # String of format "{CHROMOSOME}:{START}-{END}"
- try:
- self.chromosome, interval = region.replace(",", "").split(":")
- try:
- # see if just integer chromosomes are used
- self.chromosome = int(self.chromosome)
- except ValueError:
- pass
- self.begin, self.end = [int(i) for i in interval.split("-")]
- except ValueError:
- raise TypeError(
- "Invalid region string. Example of accepted format: 'chr5:150200605-150423790'"
- )
- self.size = self.end - self.begin
- self.string = f"{self.chromosome}_{self.begin}_{self.end}"
- elif isinstance(region, pd.Series):
- # Ordered sequence containing [CHROMOSOME, START, END] and optionally [STRAND], where STRAND can be either "+" or "-"
- self.chromosome = region[0]
- self.begin = region[1]
- self.end = region[2]
- self.size = self.end - self.begin
- self.string = f"{self.chromosome}_{self.begin}_{self.end}"
- # strand of motif to orient single molecules
- # if not passed just keep as all +
- if len(region) >= 4:
- if (region[3] == "+") or (region[3] == "-"):
- self.strand = region[3]
- # handle case of bed file with additional field that isn't strand +/-
- else:
- self.strand = "+"
- else:
- self.strand = "+"
- else:
- raise TypeError(
- "Unknown datatype passed for Region initialization"
- )
+"""
+User-facing parse operations: pileup and extract
+"""
-def make_db(
- fileName: str,
- sampleName: str,
- outDir: str,
- testMode: bool = False,
- qc: bool = False,
-) -> Tuple[str, List[str]]:
- """Sets up the necessary database tables.
+def pileup(
+ input_file: str | Path,
+ output_name: str,
+ ref_genome: str | Path,
+ output_directory: str | Path | None = None,
+ regions: str | Path | list[str | Path] | None = None,
+ motifs: list = ["A,0", "CG,0"],
+ thresh: float | None = None,
+ window_size: int | None = None,
+ cores: int | None = None,
+ log: bool = False,
+ cleanup: bool = True,
+ quiet: bool = False,
+ override_checks: bool = False,
+) -> tuple[Path, Path]:
+ """
+ Takes a bam file containing long read sequencing data aligned
+ to a reference genome with modification calls for one or more base/context
+ and creates a pileup. A pileup contains genome-position-wise sums of both reads with
+ bases that could have the modification in question and of reads that are in
+ fact modified.
+
+ The current implementation of this method uses modkit, a tool built by
+ Nanopore Technologies, along with htslib tools compress and index the output
+ bedmethyl file. The modkit command for this function is `modkit pileup`.
+
+ https://github.com/nanoporetech/modkit/
+
+ The intermediate output file is a standard bedmethyl file containing all
+ specified motifs and mod codes. The compressed and indexed output file is
+ a .bed.gz file with an accompanying .bed.gz.tbi index.
Args:
- :param fileName: name of bam file with Mm and Ml tags
- :param sampleName: name of sample for output SQL table name labelling
- :param outDir: directory where SQL database is stored
- :param testMode: turns on test mode; note that this will clear the database if it exists
- :param qc: turns on qc mode
+ output_file: a string or Path object pointing to the location of a .bam file.
+ The file should follow at least v1.6 of the .bam file specifications,
+ found here: https://samtools.github.io/hts-specs/
+ https://samtools.github.io/hts-specs/SAMv1.pdf
+
+ The file needs to have modifications stored in the standard format,
+ with MM and ML tags (NOT mm and ml) and mod names m for 5mC and a
+ for 6mA.
+
+ Furthermore, the file must have a .bam.bai index file with the same name.
+ You can create an index if needed using samtools index.
+ output_name: a string that will be used to create an output folder
+ containing the intermediate and final outputs, along with any logs.
+ ref_genome: a string of Path objecting pointing to the .fasta file
+ for the reference genome to which the .bam file is aligned.
+ output_directory: optional str or Path pointing to an output directory.
+ If left as None, outputs will be stored in a new folder within the input
+ directory.
+ regions: TODO
+ motifs: a list of strings specifying which base modifications to look for.
+ The basemods are each specified as {sequence_motif},{position_of_modification}.
+ For example, a methylated adenine is specified as 'A,0' and CpG methylation
+ is specified as 'CG,0'.
+ thresh: float point number specifying the base modification probability threshold
+ used to delineate modificaton calls as True or False. When set to None, modkit
+ will select its own threshold automatically based on the data.
+ window_size: an integer specifying a window around the center of each bed_file
+ region. If set to None, the bed_file is used unmodified. If set to a non-zero
+ positive integer, the bed_file regions are replaced by new regions with that
+ window size in either direction of the center of the original bed_file regions.
+ This is used for e.g. extracting information from around known motifs or peaks.
+ cores: an integer specifying how many parallel cores modkit gets to use.
+ By default modkit will use all of the available cores on the machine.
+ log: a boolean specifying whether to output logs into the output folder.
+ cleanup: a boolean specifying whether to clean up to keep intermediate
+ outputs. The final processed files are not human-readable, whereas the intermediate
+ outputs are. However, intermediate outputs can also be quite large.
+ override_checks: convert errors from input checking into warnings if True
Returns:
- - path to the new database
- - list of newly-created table names
+ Path object pointing to the compressed and indexed .bed.gz bedmethyl file, ready
+ for plotting functions.
+ Path object pointing to 'regions.processed.bed', the `--include-bed` file used for `modkit pileup`
+
+ """
+ """
+ TODO: There are a lot of issues that are all related here:
+ dimelo/parse_bam.py:150: error: Incompatible types in assignment (expression has type "Path | None", variable has type "str | Path") [assignment]
+ dimelo/parse_bam.py:169: error: Argument "input_file" to "prep_outputs" has incompatible type "str | Path"; expected "Path" [arg-type]
+ dimelo/parse_bam.py:256: error: Argument "input_file" to "run_with_progress_bars" has incompatible type "str | Path"; expected "Path" [arg-type]
+ dimelo/parse_bam.py:257: error: Argument "ref_genome" to "run_with_progress_bars" has incompatible type "str | Path"; expected "Path" [arg-type]
+
+ I'm not sure of the most elegant way to fix it. Come back and address.
"""
- if not os.path.exists(outDir):
- os.mkdir(outDir)
- DATABASE_NAME = (
- outDir + "/" + fileName.split("/")[-1].replace(".bam", "") + ".db"
- )
+ ## Verify and prepare inputs and outputs
- if testMode:
- clear_db(DATABASE_NAME)
-
- tables = []
- # for qc report
- if qc:
- table_name = "reads_" + sampleName
- cols = [
- "name",
- "chr",
- "start",
- "end",
- "length",
- "strand",
- "mapq",
- "ave_baseq",
- "ave_alignq",
- ]
- dtypes = [
- "TEXT",
- "TEXT",
- "INT",
- "INT",
- "INT",
- "TEXT",
- "INT",
- "INT",
- "INT",
- ]
- create_sql_table(DATABASE_NAME, table_name, cols, dtypes)
- tables.append(table_name)
- # for browser and enrichment plots
- else:
- table_name = "methylationByBase_" + sampleName
- cols = ["id", "read_name", "chr", "pos", "prob", "mod"]
- dtypes = ["TEXT", "TEXT", "TEXT", "INT", "INT", "TEXT"]
- create_sql_table(DATABASE_NAME, table_name, cols, dtypes)
- tables.append(table_name)
-
- table_name = "methylationAggregate_" + sampleName
- cols = ["id", "pos", "mod", "methylated_bases", "total_bases"]
- dtypes = ["TEXT", "INT", "TEXT", "INT", "INT"]
- create_sql_table(DATABASE_NAME, table_name, cols, dtypes)
- tables.append(table_name)
-
- return DATABASE_NAME, tables
-
-
-def parse_bam(
- fileName: str,
- sampleName: str,
- outDir: str,
- bedFile: str = None,
- basemod: str = DEFAULT_BASEMOD,
- center: bool = False,
- windowSize: int = DEFAULT_WINDOW_SIZE,
- region: str = None,
- threshA: int = DEFAULT_THRESH_A,
- threshC: int = DEFAULT_THRESH_C,
- extractAllBases: bool = False,
- cores: int = None,
-) -> None:
- """
- fileName
- name of bam file with Mm and Ml tags
- sampleName
- name of sample for output SQL table name labelling. Valid names contain [``a-zA-Z0-9_``].
- outDir
- directory where SQL database is stored
- bedFile
- name of bed file that defines regions of interest over which to extract mod calls. The bed file either defines regions over which to extract mod calls OR defines regions (likely motifs) over which to center positions for mod calls and then parse_bam extracts mod calls over a window flanking that region defined in by ``windowSize``. Optional 4th column in bed file to specify strand of region of interest as ``+`` or ``-``. Default is to consider regions as all ``+``. NB. The ``bedFile`` and ``region`` parameters are mutually exclusive; specify one or the other.
- basemod
- One of the following:
-
- * ``'A'`` - extract mA only
- * ``'CG'`` - extract mCpG only
- * ``'A+CG'`` - extract mA and mCpG
- center
- One of the following:
-
- * ``'True'`` - report positions with respect to center of motif window (+/- windowSize); only valid with bed file input
- * ``'False'`` - report positions in original reference space
- windowSize
- window size around center point of feature of interest to plot (+/-); only mods within this window are stored; only used if center=True; still, only reads that span the regions defined in the bed file will be included; default is 1,000 bp
- region
- single region over which to extract base mods, rather than specifying many windows in bedFile; format is chr:start-end. NB. The ``bedFile`` and ``region`` parameters are mutually exclusive; specify one or the other.
- threshA
- threshold above which to call an A base methylated; default is 129
- threshC
- threshold above which to call a C base methylated; default is 129
- extractAllBases
- One of the following:
-
- * ``'True'`` - Store all base mod calls, regardles of methylation probability threshold. Bases stored are those that can have a modification call (A, CG, or both depending on ``basemod`` parameter) and are sequenced bases, not all bases in the reference.
- * ``'False'`` - Only modifications above specified threshold are stored
- cores
- number of cores over which to parallelize; default is all available
-
- Valid argument combinations for ``bedFile``, ``center``, and ``windowSize`` are below. Regions of interest generally fall into two categories: small motifs at which to center analysis (use ``center`` = True) or full windows of interest (do not specify ``center`` or ``windowSize``).
-
- * ``bedFile`` --> extract all modified bases in regions defined in bed file
- * ``bedfile`` + ``center`` --> extract all modified bases in regions defined in bed file, report positions relative to region centers and extract base modificiations within default windowSize of 1kb
- * ``bedfile`` + ``center`` + ``windowSize`` --> extract all modified bases in regions defined in bed file, report positions relative to region centers and extract base modifications within flanking +/- windowSize
- * ``region`` --> extract all modified bases in single region
-
- **Example**
-
- For regions defined by ``bedFile``:
-
- >>> dm.parse_bam("dimelo/test/data/mod_mappings_subset.bam", "test", "dimelo/dimelo_test", bedFile="dimelo/test/data/test.bed", basemod="A+CG", center=True, windowSize=500, threshA=190, threshC=190, extractAllBases=False, cores=8)
-
- For single region defined with ``region``:
-
- >>> dm.parse_bam("dimelo/test/data/mod_mappings_subset.bam", "test", "dimelo/dimelo_test", region="chr1:2907273-2909473", basemod="A+CG", threshA=190, threshC=190, cores=8)
-
- **Return**
-
- Returns a SQL database in the specified output directory. Database can be converted into pandas dataframe with:
-
- >>> fileName = "dimelo/test/data/mod_mappings_subset.bam"
- >>> sampleName = "test"
- >>> outDir = "dimelo/dimelo_test"
- >>> all_data = pd.read_sql("SELECT * from methylationByBase_" + sampleName, sqlite3.connect(outDir + "/" + fileName.split("/")[-1].replace(".bam", "") + ".db"))
- >>> aggregate_counts = pd.read_sql("SELECT * from methylationAggregate_" + sampleName, sqlite3.connect(outDir + "/" + fileName.split("/")[-1].replace(".bam", "") + ".db"))
-
- Each database contains these two tables with columns listed below:
-
- 1. methylationByBase_sampleName
- * id(read_name:pos)
- * read_name
- * chr
- * pos
- * prob
- * mod
- 2. methylationAggregate_sampleName
- * id(pos:mod)
- * pos
- * mod
- * methylated_bases
- * total_bases
-
- When running parse_bam with a region defined, a summary bed file is also produced to support visualizing aggregate data with any genome browser tool. The columns of this bed file are chr, start, end, methylated_bases, total_bases.
-
- For example, to take a summary output bed and create a file with fraction of modified bases with a window size of 100 bp for visualization with the WashU browser, you could run the below commands in terminal:
-
- * ``bedtools makewindows -g ref_genome.chromsizes.txt -w 100 > ref_genome_windows.100.bp.bed``
- * ``bedtools map -a ref_genome_windows.100.bp.bed -b outDir/fileName_sampleName_chr_start_end_A.bed -c 4,5 -o sum,sum -null 0 | awk -v "OFS=\\t" '{if($5>0){print $1,$2,$3,$4/$5}else{print $1,$2,$3,$5}}' > outDir/fileName_sampleName_chr_start_end_A.100.bed``
- * ``bgzip outDir/fileName_sampleName_chr_start_end_A.100.bed``
- * ``tabix -f -p bed outDir/fileName_sampleName_chr_start_end_A.100.bed.gz``
+ input_file, ref_genome, output_directory = utils.sanitize_path_args(
+ input_file, ref_genome, output_directory
+ )
- """
- # Ensure exactly one of bedFile and region are specified
- if sum([arg is None for arg in (bedFile, region)]) != 1:
- raise RuntimeError(
- "Exactly one of the mutually exclusive arguments 'bedFile' or 'region' must be specified."
+ try:
+ verify_inputs(input_file, motifs, ref_genome)
+ except Exception as e:
+ if override_checks:
+ if not quiet:
+ print(f"WARNING: {e}")
+ else:
+ raise Exception(
+ f'{e}\nIf you are confident that your inputs are ok, pass "override_checks=True" to convert to warning and proceed with processing.'
+ ) from e
+
+ output_path, (output_bedmethyl, output_bedmethyl_sorted, output_pileup_path, _) = (
+ prep_output_directory(
+ output_directory=output_directory,
+ output_name=output_name,
+ input_file=input_file,
+ output_file_names=[
+ "pileup.bed",
+ "pileup.sorted.bed",
+ "pileup.sorted.bed.gz",
+ "pileup.sorted.bed.gz.tbi",
+ ],
)
- # The argument center is incompatible with region
- if region is not None:
- if center:
- raise RuntimeError(
- "Argument 'center' cannot be given alongside 'region'."
- )
-
- if not os.path.isdir(outDir):
- os.makedirs(outDir)
+ )
- make_db(fileName, sampleName, outDir)
+ ## Build up the command list to be sent to modkit, then run modkit
- if bedFile is not None:
- # make a region object for each row of bedFile
- bed = pd.read_csv(bedFile, sep="\t", header=None)
- windows = []
- for _, row in bed.iterrows():
- windows.append(Region(row))
+ # TODO: This is mildly confusing. I get what it's doing, but it's hard to follow / names are bad. Also, why is it used in cleanup here, but not in extract?
+ region_command_list, processed_regions_path = create_region_command_list(
+ output_path,
+ regions,
+ window_size,
+ )
- if region is not None:
- windows = [Region(region)]
+ motif_command_list = []
+ if len(motifs) > 0:
+ for motif in motifs:
+ parsed_motif = utils.ParsedMotif(motif)
+ motif_command_present = False
+ for a, b in zip(motif_command_list, motif_command_list[1:]):
+ if a == parsed_motif.motif_seq and b == str(parsed_motif.modified_pos):
+ # This motif is already going to be processed; we want to skip adding it a second
+ # time because modkit does not like duplicate motifs.
+ # It's actually ok if it's a different mod code in the two cases because the pileup
+ # operation, under the hood, keeps all mod codes. Filtering is only done when loading.
+ motif_command_present = True
+ break
+ if not motif_command_present:
+ motif_command_list.append("--motif")
+ motif_command_list.append(parsed_motif.motif_seq)
+ motif_command_list.append(str(parsed_motif.modified_pos))
+ else:
+ raise ValueError("Error: no motifs specified. Nothing to process.")
- # Configure progress reporting
- if len(windows) == 1:
- show_read_progress = True
+ if log:
+ if not quiet:
+ print("Logging to ", Path(output_path) / "pileup-log")
+ log_command_list = ["--log-filepath", Path(output_path) / "pileup-log"]
else:
- show_read_progress = False
- # Enable top-level progress bar for multi-window processing
- windows = tqdm(windows, desc="Parsing windows", unit="windows")
+ log_command_list = []
- # default number of cores is max available
+ # TODO: This should be a method, like create_region_specifier, or just combined into a prep method for the start...
cores_avail = multiprocessing.cpu_count()
if cores is None:
- num_cores = cores_avail
- else:
- # if more than available cores is specified, process with available cores
- if cores > cores_avail:
- num_cores = cores_avail
- else:
- num_cores = cores
-
- batchSize = 100
-
- Parallel(n_jobs=num_cores)(
- delayed(parse_reads_window)(
- fileName,
- sampleName,
- basemod,
- windowSize,
- window,
- center,
- threshA,
- threshC,
- batchSize,
- outDir,
- extractAllBases,
- showReadProgress=show_read_progress,
- )
- for window in windows
- )
-
- # create summary bed files
- if region is not None:
- if "A" in basemod:
- make_bed_file_output(fileName, sampleName, outDir, region, "A")
- if "C" in basemod:
- make_bed_file_output(fileName, sampleName, outDir, region, "C")
-
- # output all files created to std out
- f = fileName.split("/")[-1].replace(".bam", "")
- out_path = f"{outDir}/{f}.db"
- if region is None:
- str_out = f"Outputs\n_______\nDB file: {out_path}"
+ if not quiet:
+ print(
+ f"No specified number of cores requested. {cores_avail} available on machine, allocating all."
+ )
+ cores_command_list = ["--threads", str(cores_avail)]
+ elif cores > cores_avail:
+ if not quiet:
+ print(
+ f"Warning: {cores} cores request, {cores_avail} available. Allocating {cores_avail}"
+ )
+ cores_command_list = ["--threads", str(cores_avail)]
else:
- bed_paths = []
- if "A" in basemod:
- bed_path = (
- f"{outDir}/{f}_{sampleName}_{Region(region).string}_A.bed"
+ if not quiet:
+ print(f"Allocating requested {cores} cores.")
+ cores_command_list = ["--threads", str(cores)]
+
+ # TODO: This is SO SO SO similar to extract; just the ValueError vs. printing. I think this can be resolved
+ mod_thresh_command_list: list[str] = []
+ if thresh is None:
+ if not quiet:
+ print(
+ "No base modification threshold provided. Using adaptive threshold selection via modkit."
)
- bed_paths.append(bed_path)
- if "C" in basemod:
- bed_path = (
- f"{outDir}/{f}_{sampleName}_{Region(region).string}_CG.bed"
+ else:
+ adjusted_threshold = utils.adjust_threshold(thresh, quiet=quiet)
+ if adjusted_threshold < 0.5 and not quiet:
+ print(
+ f"WARNING: thresh {thresh} is very low and may lead to unexpected behavior. Typical thresholds are at least 0.5 or 128."
)
- bed_paths.append(bed_path)
- str_out = (
- f"Outputs\n_______\nDB file: {out_path}\nBED file: {bed_paths}"
+ for motif in motifs:
+ parsed_motif = utils.ParsedMotif(motif)
+ for mod_code in parsed_motif.mod_codes:
+ mod_thresh_command_list = mod_thresh_command_list + [
+ "--mod-thresholds",
+ f"{mod_code}:{adjusted_threshold}",
+ ]
+
+ ref_genome_command_list = ["--ref", ref_genome]
+ filter_command_list = ["--filter-threshold", "0"]
+
+ pileup_command_list = (
+ ["modkit", "pileup", input_file, output_bedmethyl]
+ + region_command_list
+ + motif_command_list
+ + ref_genome_command_list
+ + filter_command_list
+ + mod_thresh_command_list
+ + cores_command_list
+ + log_command_list
+ )
+
+ # TODO: Do we need to store and use the output from this method? Previously was being printed immediately afterward.
+ _ = run_modkit.run_with_progress_bars(
+ command_list=pileup_command_list,
+ input_file=input_file,
+ ref_genome=ref_genome,
+ motifs=motifs,
+ load_fasta_regex=r"\s+\[.*?\]\s+(\d+)\s+Reading",
+ find_motifs_regex=r"\s+(\d+)/(\d+)\s+finding\s+([A-Za-z0-9,]+)\s+motifs",
+ contigs_progress_regex=r"\s+(\d+)/(\d+)\s+contigs",
+ single_contig_regex=r"\s+(\d+)/(\d+)\s+processing\s+([\w]+)[^\w]",
+ buffer_size=50,
+ progress_granularity=25,
+ done_str="Done",
+ err_str="Error",
+ expect_done=True,
+ quiet=quiet,
+ )
+ # print(done_string)
+
+ ## Sort, compress, and index the output bedmethyl file
+
+ with open(output_bedmethyl_sorted, "w") as sorted_file:
+ subprocess.run(
+ ["sort", "-k1,1", "-k2,2n", output_bedmethyl], stdout=sorted_file
)
- print(str_out)
+ pysam.tabix_compress(output_bedmethyl_sorted, output_pileup_path, force=True)
+ pysam.tabix_index(str(output_pileup_path), preset="bed", force=True)
+
+ # TODO: Can cleanup be consolidated?
+ if cleanup:
+ if output_bedmethyl.exists():
+ output_bedmethyl.unlink()
+ if output_bedmethyl_sorted.exists():
+ output_bedmethyl_sorted.unlink()
+
+ return output_pileup_path, processed_regions_path
+
+
+def extract(
+ input_file: str | Path,
+ output_name: str,
+ ref_genome: str | Path,
+ output_directory: str | Path | None = None,
+ regions: str | Path | list[str | Path] | None = None,
+ motifs: list = ["A,0", "CG,0", "GCH,1"],
+ thresh: float | None = None,
+ window_size: int | None = None,
+ cores: int | None = None,
+ log: bool = False,
+ cleanup: bool = True,
+ quiet: bool = False,
+ override_checks: bool = False,
+) -> tuple[Path, Path]:
+ """
+ Takes a bam file containing long read sequencing data aligned
+ to a reference genome with modification calls for one or more bases/contexts
+ and pulls out data from each individual read.
+
+ The current implementation of this method uses modkit, a tool built by
+ Nanopore Technologies, along with h5py to build the final output file. The
+ modkit command in this function is `modkit extract`.
+ https://github.com/nanoporetech/modkit/
+
+ The intermediate outputs are plain text files containing a list of all base modifications,
+ with a file for each motif. The compressed and indexed output contains vectors of valid
+ and modified positions within each read.
+
+ Args:
+ output_file: a string or Path object pointing to the location of a .bam file.
+ The file should follow at least v1.6 of the .bam file specifications,
+ found here: https://samtools.github.io/hts-specs/
+ https://samtools.github.io/hts-specs/SAMv1.pdf
+
+ The file needs to have modifications stored in the standard format,
+ with MM and ML tags (NOT mm and ml) and mod names m for 5mC and a
+ for 6mA.
+
+ Furthermore, the file must have a .bam.bai index file with the same name.
+ You can create an index if needed using samtools index.
+ output_name: a string that will be used to create an output folder
+ containing the intermediate and final outputs, along with any logs.
+ ref_genome: a string of Path objecting pointing to the .fasta file
+ for the reference genome to which the .bam file is aligned.
+ output_directory: optional str or Path pointing to an output directory.
+ If left as None, outputs will be stored in a new folder within the input
+ directory.
+ regions: TODO
+ motifs: a list of strings specifying which base modifications to look for.
+ The basemods are each specified as {sequence_motif},{position_of_modification}.
+ For example, a methylated adenine is specified as 'A,0' and CpG methylation
+ is specified as 'CG,0'.
+ thresh: float point number specifying the base modification probability threshold
+ used to delineate modificaton calls as True or False. When set to None, modkit
+ will select its own threshold automatically based on the data.
+ window_size: an integer specifying a window around the center of each bed_file
+ region. If set to None, the bed_file is used unmodified. If set to a non-zero
+ positive integer, the bed_file regions are replaced by new regions with that
+ window size in either direction of the center of the original bed_file regions.
+ This is used for e.g. extracting information from around known motifs or peaks.
+ cores: an integer specifying how many parallel cores modkit gets to use.
+ By default modkit will use all of the available cores on the machine.
+ log: a boolean specifying whether to output logs into the output folder.
+ cleanup: a boolean specifying whether to clean up to keep intermediate
+ outputs. The final processed files are not human-readable, whereas the intermediate
+ outputs are. However, intermediate outputs can also be quite large.
+ override_checks: convert errors from input checking into warnings if True
-def make_bed_file_output(fileName, sampleName, outDir, region, mod):
+ Returns:
+ Path object pointing to the compressed and indexed output .h5 file, ready for
+ plotting functions.
+ Path object pointing to 'regions.processed.bed', the `--include-bed` file used for `modkit extract`
+
+ """
"""
- Make output bed file that can be used to visualize aggregate with other genome browsers
+ TODO: There are a lot of issues that are all related here:
+ dimelo/parse_bam.py:374: error: Incompatible types in assignment (expression has type "Path | None", variable has type "str | Path") [assignment]
+ dimelo/parse_bam.py:393: error: Argument "input_file" to "prep_outputs" has incompatible type "str | Path"; expected "Path" [arg-type]
+ dimelo/parse_bam.py:480: error: Argument "input_file" to "run_with_progress_bars" has incompatible type "str | Path"; expected "Path" [arg-type]
+ dimelo/parse_bam.py:481: error: Argument "ref_genome" to "run_with_progress_bars" has incompatible type "str | Path"; expected "Path" [arg-type]
+
+ I'm not sure of the most elegant way to fix it. Come back and address.
"""
- r = Region(region)
- f = fileName.split("/")[-1].replace(".bam", "")
- out_path = f"{outDir}/{f}.db"
- aggregate_counts_all = pd.read_sql(
- "SELECT * from methylationAggregate_" + sampleName,
- sqlite3.connect(out_path),
+
+ ## Verify and prepare inputs and outputs
+
+ input_file, ref_genome, output_directory = utils.sanitize_path_args(
+ input_file, ref_genome, output_directory
)
- aggregate_counts_mod = aggregate_counts_all[
- aggregate_counts_all["mod"].str.contains(mod)
- ].copy()
- aggregate_counts_mod["chr"] = r.chromosome
- aggregate_counts_mod["end"] = aggregate_counts_mod["pos"] + 1
- # columns are: id(pos:mod), pos, mod, methylated_bases, total_bases
- dictionary_agg = {
- "chr": aggregate_counts_mod["chr"],
- "start": aggregate_counts_mod["pos"],
- "end": aggregate_counts_mod["end"],
- "methylated": aggregate_counts_mod["methylated_bases"],
- "total": aggregate_counts_mod["total_bases"],
- }
- bed_agg = pd.DataFrame(dictionary_agg)
- bed_agg.sort_values(by="start", ascending=True, inplace=True)
- if "A" in mod:
- mod_name = "A"
- if "C" in mod:
- mod_name = "CG"
- bed_agg.to_csv(
- f"{outDir}/{f}_{sampleName}_{r.string}_{mod_name}.bed",
- sep="\t",
- header=False,
- index=False,
+
+ try:
+ verify_inputs(input_file, motifs, ref_genome)
+ except Exception as e:
+ if override_checks:
+ if not quiet:
+ print(f"WARNING: {e}")
+ else:
+ raise Exception(
+ f'{e}\nIf you are confident that your inputs are ok, pass "override_checks=True" to convert to warning and proceed with processing.'
+ ) from e
+
+ # TODO: Add intermediate mod-specific .txt files?
+ output_path, (output_reads_path,) = prep_output_directory(
+ output_directory=output_directory,
+ output_name=output_name,
+ input_file=input_file,
+ output_file_names=["reads.combined_basemods.h5"],
)
+ ## Build up the command lists shared across motifs to be sent to modkit
-def parse_reads_window(
- fileName: str,
- sampleName: str,
- basemod: str,
- windowSize: int,
- window: Region,
- center: bool,
- threshA: int,
- threshC: int,
- batchSize: int,
- outDir: str,
- extractAllBases: bool,
- showReadProgress: bool = False,
-) -> None:
- """Parse all reads in window and put data into methylationByBase table.
+ region_command_list, processed_regions_path = create_region_command_list(
+ output_path,
+ regions,
+ window_size,
+ )
- Args:
- :param bam: read in bam file with Mm and Ml tags
- :param fileName: name of bam file
- :param sampleName: name of sample for output file name labelling
- :param basemod: which basemods, currently supported options are 'A', 'CG', 'A+CG'
- :param windowSize: window size around center point of feature of interest to plot (+/-); only mods within this window are stored; only applicable for center=True
- :param window: single window
- :param center: report positions with respect to reference center (+/- window size) if True or in original reference space if False
- :param threshA: threshold above which to call an A base methylated
- :param threshC: threshold above which to call a C base methylated
- :param showReadProgress: when true, display progress for read processing
- """
- bam = pysam.AlignmentFile(fileName, "rb")
- data = []
- if showReadProgress:
- total_reads = bam.count(
- reference=window.chromosome, start=window.begin, end=window.end
+ cores_avail = multiprocessing.cpu_count()
+ if cores is None:
+ if not quiet:
+ print(
+ f"No specified number of cores requested. {cores_avail} available on machine, allocating all."
+ )
+ cores_command_list = ["--threads", str(cores_avail)]
+ elif cores > cores_avail:
+ if not quiet:
+ print(
+ f"Warning: {cores} cores request, {cores_avail} available. Allocating {cores_avail}"
+ )
+ cores_command_list = ["--threads", str(cores_avail)]
+ else:
+ if not quiet:
+ print(f"Allocating requested {cores} cores.")
+ cores_command_list = ["--threads", str(cores)]
+
+ mod_thresh_command_list: list[str] = []
+ if thresh is None:
+ if not quiet:
+ print(
+ "No valid base modification threshold provided. Raw probs will be saved."
+ )
+ adjusted_threshold = None
+ else:
+ adjusted_threshold = utils.adjust_threshold(thresh, quiet=quiet)
+ if adjusted_threshold < 0.5 and not quiet:
+ print(
+ f"WARNING: thresh {thresh} is very low and may lead to unexpected behavior. Typical thresholds are at least 0.5 or 128."
+ )
+ for motif in motifs:
+ parsed_motif = utils.ParsedMotif(motif)
+ for mod_code in parsed_motif.mod_codes:
+ mod_thresh_command_list = mod_thresh_command_list + [
+ "--mod-thresholds",
+ f"{mod_code}:{adjusted_threshold}",
+ ]
+
+ if log:
+ if not quiet:
+ print("logging to ", Path(output_path) / "extract-log")
+ log_command_list = ["--log-filepath", Path(output_path) / "extract-log"]
+ else:
+ log_command_list = []
+
+ ref_genome_command_list = ["--ref", ref_genome]
+ filter_command_list = ["--filter-threshold", "0"]
+
+ # Run modkit once for each motif, because the output .txt can be ambiguous otherwise
+ # There is no column currently to specify the motif (e.g. CG,0 vs GCH,1), only canonical
+ # base (e.g. C) and mod code (e.g. m)
+ # There is a 5mer context so we could technically manually motif check if we want to.
+ # Our current design paradigm is to leave all such operations to modkit, hence the loop below.
+ for motif in motifs:
+ # Here we prepare the motif-specific commands and delete any old .txt file because
+ # modkit will crash otherwise
+ motif_command_list = []
+ parsed_motif = utils.ParsedMotif(motif)
+ motif_command_list.append("--motif")
+ motif_command_list.append(parsed_motif.motif_seq)
+ motif_command_list.append(str(parsed_motif.modified_pos))
+
+ output_txt = Path(output_path) / (f"reads.{motif}.txt")
+
+ if os.path.exists(output_txt):
+ os.remove(output_txt)
+
+ extract_command_list = (
+ ["modkit", "extract", input_file, output_txt]
+ + region_command_list
+ + motif_command_list
+ + cores_command_list
+ + log_command_list
+ + ref_genome_command_list
+ + filter_command_list
)
- bam.reset()
- reads = bam.fetch(
- reference=window.chromosome, start=window.begin, end=window.end
- )
- if showReadProgress:
- reads = tqdm(
- reads, desc="Processing reads", unit="reads", total=total_reads
+
+ # TODO: Do we need to store and use the output from this method? Previously was being printed immediately afterward.
+ # This is something the user might want to see - it's the end-of-process message for modkit, says e.g. how many reads were processed and stuff
+ _ = run_modkit.run_with_progress_bars(
+ command_list=extract_command_list,
+ input_file=input_file,
+ ref_genome=ref_genome,
+ motifs=[motif],
+ load_fasta_regex=r"\s+\[.*?\]\s+(\d+)\s+parsing FASTA",
+ find_motifs_regex=r"\s+(\d+)/(\d+)\s+([\w]+)\s+searched",
+ contigs_progress_regex=r"\s+(\d+)/(\d+)\s+contigs\s+[^s]",
+ single_contig_regex=r"\s+(\d+)/(\d+)\s+processing\s+([\w]+)[^\w]",
+ buffer_size=100,
+ progress_granularity=50,
+ done_str="Done",
+ err_str="Error",
+ expect_done=False,
+ quiet=quiet,
)
- for read in reads:
- [
- (mod, positions, probs),
- (mod2, positions2, probs2),
- ] = get_modified_reference_positions(
- read,
- basemod,
- window,
- center,
- threshA,
- threshC,
- windowSize,
- fileName,
- sampleName,
- outDir,
- extractAllBases,
+ # print(done_string)
+
+ # Create the compressed and indexed output
+ read_by_base_txt_to_hdf5(
+ output_txt,
+ output_reads_path,
+ motif,
+ adjusted_threshold,
+ quiet=quiet,
)
- # Generate rows for methylationByBase database update
- for pos, prob in zip(positions, probs):
- if pos is not None:
- if (center is True and abs(pos) <= windowSize) or (
- center is False and pos > window.begin and pos < window.end
- ): # to decrease memory, only store bases within the window
- d = (
- read.query_name + ":" + str(pos),
- read.query_name,
- window.chromosome,
- int(pos),
- int(prob),
- mod,
- )
- data.append(d)
- for pos, prob in zip(positions2, probs2):
- if pos is not None:
- if (center is True and abs(pos) <= windowSize) or (
- center is False and pos > window.begin and pos < window.end
- ): # to decrease memory, only store bases within the window
- d = (
- read.query_name + ":" + str(pos),
- read.query_name,
- window.chromosome,
- int(pos),
- int(prob),
- mod2,
- )
- data.append(d)
- if data:
- # data is list of tuples associated with given read
- # or ignore because a read may overlap multiple windows
- DATABASE_NAME = (
- outDir + "/" + fileName.split("/")[-1].replace(".bam", "") + ".db"
+ # Delete intermediate file
+ if cleanup:
+ os.remove(output_txt)
+
+ return output_reads_path, processed_regions_path
+
+
+"""
+Helper functions to facilitate bam parse operations
+"""
+
+
+def verify_inputs(
+ input_file,
+ motifs,
+ ref_genome,
+):
+ """
+ Checks .bam format and alignment quality (to verify that you are using the right reference genome)
+
+ The correct-bases-called fraction, if under 35%, means the user almost definitely passed the wrong reference genome.
+ """
+ check_bam_format(input_file, motifs)
+ correct_bases, total_bases = get_alignment_quality(input_file, ref_genome)
+ if total_bases == 0:
+ raise ValueError(
+ f"First {NUM_READS_TO_CHECK} reads are empty. Please verify your {input_file.name} contents."
)
- table_name = "methylationByBase_" + sampleName
- command = (
- """INSERT OR IGNORE INTO """
- + table_name
- + """ VALUES(?,?,?,?,?,?);"""
+ elif correct_bases / total_bases < 0.35:
+ raise ValueError(
+ f"First {NUM_READS_TO_CHECK} reads have anomalously low alignment quality: only {100 * correct_bases / total_bases}% of bases align.\nPlease verify that {input_file.name} is actually aligned to {ref_genome.name}."
)
- connection = sqlite3.connect(DATABASE_NAME, timeout=60.0)
- execute_sql_command(command, DATABASE_NAME, data, connection)
- connection.close()
-
-
-def get_modified_reference_positions(
- read: pysam.AlignedSegment,
- basemod: str,
- window: Region,
- center: bool,
- threshA: int,
- threshC: int,
- windowSize: int,
- fileName: str,
- sampleName: str,
- outDir: str,
- extractAllBases: bool,
+ return
+
+
+def check_bam_format(
+ bam_file: str | Path,
+ motifs: list = ["A,0", "CG,0"],
):
- """Extract mA and mC pos & prob information for the read
- Args:
- :param read: single read from bam file
- :param basemod: which basemods, currently supported options are 'A', 'CG', 'A+CG'
- :param window: window from bed file
- :param center: report positions with respect to reference center (+/- window size) if True or in original reference space if False
- :param threshA: threshold above which to call an A base methylated
- :param threshC: threshold above which to call a C base methylated
- :param windowSize: window size around center point of feature of interest to plot (+/-); only mods within this window are stored; only applicable for center=True
- Return:
- For each mod, you get the positions where those mods are and the probabilities for those mods (parallel vectors)
"""
- if (read.has_tag("Mm")) & (";" in read.get_tag("Mm")):
- mod1 = read.get_tag("Mm").split(";")[0].split(",", 1)[0]
- mod2 = read.get_tag("Mm").split(";")[1].split(",", 1)[0]
- base = basemod[0] # this will be A, C, or A
- if basemod == "A+CG":
- base2 = basemod[2] # this will be C for A+CG case
- else: # in the case of a single mod will just be checking that single base
- base2 = base
- if base in mod1 or base2 in mod1:
- mod1_return = get_mod_reference_positions_by_mod(
- read,
- mod1,
- 0,
- window,
- center,
- threshA,
- threshC,
- windowSize,
- fileName,
- sampleName,
- outDir,
- extractAllBases,
- )
- else:
- mod1_return = (None, [None], [None])
- if base in mod2 or base2 in mod2:
- mod2_return = get_mod_reference_positions_by_mod(
- read,
- mod2,
- 1,
- window,
- center,
- threshA,
- threshC,
- windowSize,
- fileName,
- sampleName,
- outDir,
- extractAllBases,
- )
- return (mod1_return, mod2_return)
- else:
- return (mod1_return, (None, [None], [None]))
- else:
- return ((None, [None], [None]), (None, [None], [None]))
-
-
-def get_mod_reference_positions_by_mod(
- read: pysam.AlignedSegment,
- basemod: str,
- index: int,
- window: Region,
- center: bool,
- threshA: int,
- threshC: int,
- windowSize: int,
- fileName: str,
- sampleName: str,
- outDir: str,
- extractAllBases: bool,
-):
- """Get positions and probabilities of modified bases for a single read
+ Check whether a .bam file is formatted appropriately for modkit.
+ * bam file has a .bai index
+ * modification tags named MM/ML (NOT Mm/Ml)
+ * tags contain ambiguity specification (? vs.)
+ * bam file contains the expected modifications (motif, mod code)
+
Args:
- :param read: one read in bam file
- :param mod: which basemod, reported as base+x/y/m
- :param window: window from bed file
- :param center: report positions with respect to reference center (+/- window size) if True or in original reference space if False
- :param threshA: threshold above which to call an A base methylated
- :param threshC: threshold above which to call a C base methylated
- :param windowSize: window size around center point of feature of interest to plot (+/-); only mods within this window are stored; only applicable for center=True
- :param index: 0 or 1
+ bam_file: a formatted .bam file with a .bai index
+ motifs: a list of base modification motifs
+
+ Returns:
+ None. If the function returns, you are ok.
+
"""
- modsPresent = True
- base, mod = basemod.split("+")
- num_base = len(read.get_tag("Mm").split(";")[index].split(",")) - 1
- # get base_index
- base_index = np.array(
- [
- i
- for i, letter in enumerate(read.get_forward_sequence())
- if letter == base
- ]
- )
- # get reference positons
- refpos = np.array(read.get_reference_positions(full_length=True))
- if read.is_reverse:
- refpos = np.flipud(refpos)
- modified_bases = []
- if num_base == 0:
- modsPresent = False
- if modsPresent:
- deltas = [
- int(i) for i in read.get_tag("Mm").split(";")[index].split(",")[1:]
- ]
- Ml = read.get_tag("Ml")
- if index == 0:
- probabilities = np.array(Ml[0:num_base], dtype=int)
- if index == 1:
- probabilities = np.array(Ml[0 - num_base :], dtype=int)
- # determine locations of the modified bases, where index_adj is the adjustment of the base_index
- # based on the cumulative sum of the deltas
- locations = np.cumsum(deltas)
- # loop through locations and increment index_adj by the difference between the next location and current one + 1
- # if the difference is zero, therefore, the index adjustment just gets incremented by one because no base should be skipped
- index_adj = []
- index_adj.append(locations[0])
- i = 0
- for i in range(len(locations) - 1):
- diff = locations[i + 1] - locations[i]
- index_adj.append(index_adj[i] + diff + 1)
- # get the indices of the modified bases
- modified_bases = base_index[index_adj]
-
- # extract CpG sites only rather than all mC
- keep = []
- prob_keep = []
- all_bases_index = []
- probs = []
- i = 0
- seq = read.get_forward_sequence()
- # deal with None for refpos from soft clipped / unaligned bases
- if "C" in basemod:
- for b in base_index:
- if (
- b < len(seq) - 1
- ): # if modified C is not the last base in the read
- if (refpos[b] is not None) & (refpos[b + 1] is not None):
- if seq[b + 1] == "G":
+ basemods_found_dict = {}
+ mod_codes_dict = {}
+ mod_codes_found_dict = defaultdict(set)
+ for motif in motifs:
+ parsed_motif = utils.ParsedMotif(motif)
+ mod_codes_dict[parsed_motif.modified_base] = parsed_motif.mod_codes
+ basemods_found_dict[parsed_motif.modified_base] = False
+
+ input_bam = pysam.AlignmentFile(bam_file)
+
+ try:
+ for counter, read in enumerate(input_bam.fetch()):
+ read_dict = read.to_dict()
+ for tag_string in read_dict["tags"]:
+ tag = tag_string.split(",")[0].split(":")[0]
+ if tag == "Mm" or tag == "Ml":
+ raise ValueError(
+ f'Base modification tags are out of spec (Mm and Ml instead of MM and ML). \n\nConsider using "modkit update-tags {str(bam_file)} new_file.bam" in the command line with your conda environment active and then trying with the new file. For megalodon basecalling/modcalling, you may also need to pass "--mode ambiguous.\nBe sure to index the resulting .bam file."'
+ )
+ elif tag == "MM":
+ for tag_substring in tag_string.split(";"):
+ tag_fields = tag_substring.split(",")[0].split(":")
+ if len(tag_fields) >= 3:
+ tag_value = tag_fields[2]
+ else:
+ tag_value = tag_fields[0]
if (
- abs(refpos[b + 1] - refpos[b]) == 1
- ): # ensure there isn't a gap
- all_bases_index.append(
- b
- ) # add to all_bases_index whether or not modified
- if b in modified_bases:
- if probabilities[i] >= threshC:
- keep.append(b)
- prob_keep.append(i)
- if extractAllBases:
- if b in modified_bases:
- probs.append(probabilities[i])
+ len(tag_value) > 0
+ and tag_value[-1] != "?"
+ and tag_value[-1] != "."
+ ):
+ raise ValueError(
+ f'Base modification tags are out of spec. Need ? or . in TAG:TYPE:VALUE for MM tag, else modified probability is considered to be implicit. \n\nConsider using "modkit update-tags {str(bam_file)} new_file.bam --mode ambiguous" in the command line with your conda environment active and then trying with the new file.'
+ )
+ else:
+ if (
+ len(tag_value) > 0
+ and tag_value[0] in basemods_found_dict
+ ):
+ correct_mod_codes = mod_codes_dict[tag_value[0]]
+ # valid_mod_codes = mod_codes_dict[tag_value[0]].union(
+ # utils.BASEMOD_NAMES_DICT[tag_value[0]]
+ # )
+ if tag_value[2] in correct_mod_codes:
+ basemods_found_dict[tag_value[0]] = True
else:
- probs.append(0)
- # increment for each instance of modified base
- if b in modified_bases:
- i = i + 1
- else: # for m6A no need to look at neighboring base; do need to remove refpos that are None
- for b in base_index:
- if refpos[b] is not None:
- all_bases_index.append(
- b
- ) # add to all_bases_index whether or not modified
- if b in modified_bases:
- if probabilities[i] >= threshA:
- keep.append(b)
- prob_keep.append(i)
- if extractAllBases:
- if b in modified_bases:
- probs.append(probabilities[i])
- else:
- probs.append(0)
- # increment for each instance of modified base
- if b in modified_bases:
- i = i + 1
- # adjust position to be centered at 0 at the center of the motif; round in case is at 0.5
- # add returning base_index for plotting mod/base_abundance
- if center is True:
- if window.strand == "+":
- refpos_mod_adjusted = np.array(refpos[keep]) - round(
- ((window.end - window.begin) / 2 + window.begin)
- )
- refpos_total_adjusted = np.array(refpos[all_bases_index]) - round(
- ((window.end - window.begin) / 2 + window.begin)
- )
- if window.strand == "-":
- refpos_mod_adjusted = -1 * (
- np.array(refpos[keep])
- - round(((window.end - window.begin) / 2 + window.begin))
- )
- refpos_total_adjusted = -1 * (
- np.array(refpos[all_bases_index])
- - round(((window.end - window.begin) / 2 + window.begin))
- )
- update_methylation_aggregate_db(
- refpos_mod_adjusted,
- refpos_total_adjusted,
- basemod,
- center,
- windowSize,
- window,
- fileName,
- sampleName,
- outDir,
- )
- if extractAllBases:
- return (basemod, refpos_total_adjusted, probs)
- elif not modsPresent:
- return (None, [None], [None])
+ mod_codes_found_dict[tag_value[0]].add(tag_value[2])
+ # With the mode-code-aware motifs, it no longer makes sense to throw this error
+ # This is because the warning the user gets if their mod code isn't found, or (if none is specified)
+ # the default mod codes aren't found, can tell them what mod codes *were* found and they can add them
+ # to their motif or use adjust_mods according to what makes sense. Thus, unexpected codes are not a
+ # problem (in part this is because parse_bam will now set thresholds for the motif-specified OR default mod codes)
+ # elif tag_value[2] not in valid_mod_codes:
+ # raise ValueError(
+ # f'Base modification name unexpected: {tag_value[2]} to modify {tag_value[0]}, should be in set {valid_mod_codes}. \n\nIf you know what your mod names correspond to in terms of the latest .bam standard, consider using "modkit adjust-mods {str(bam_file)} new_file.bam --convert 5mC_name m --convert N6mA_name a --convert other_basemod_name correct_label" and then trying with the new file. Note: currently supported mod names are {utils.BASEMOD_NAMES_DICT}'
+ # )
+ if all(basemods_found_dict.values()):
+ return
+ if counter >= NUM_READS_TO_CHECK:
+ missing_bases = []
+ for base, found in basemods_found_dict.items():
+ if not found:
+ missing_bases.append(base)
+ print(
+ f"""
+WARNING: no modified appropriately-coded values found for {missing_bases} in the first {counter} reads.
+Do you expect this file to contain these modifications? parse_bam is looking for {motifs} but for {missing_bases} found only found {[f"{base}+{mod_codes}" for base, mod_codes in mod_codes_found_dict.items()]}.
+
+Consider passing only the motifs and mod codes (e.g. m,h,a) that you expect to be present in your file.
+You can use modkit adjust-mods --convert [OPTIONS] to update or consolidate mod codes.
+See https://github.com/nanoporetech/modkit/blob/master/book/src/advanced_usage.md
+ """
+ )
+ return
+ except ValueError as e:
+ if "fetch called on bamfile without index" in str(e):
+ raise ValueError(
+ f'{e}. Consider using "samtools index {str(bam_file)}" to create an index if your .bam is already sorted.'
+ ) from e
else:
- return (basemod, refpos_mod_adjusted, probabilities[prob_keep])
- else:
- update_methylation_aggregate_db(
- refpos[keep],
- refpos[all_bases_index],
- basemod,
- center,
- windowSize,
- window,
- fileName,
- sampleName,
- outDir,
+ raise
+ except:
+ raise
+
+
+def get_alignment_quality(
+ bam_file,
+ ref_genome,
+) -> tuple[int, int]:
+ """
+ Determine fraction of read bases that line up with reference genome in first NUM_READS_TO_CHECK reads in bam file
+ """
+ ref_genome_index = ref_genome.parent / (ref_genome.name + ".fai")
+ if not ref_genome_index.exists():
+ print(f"Indexing {ref_genome.name}. This only needs to be done once.")
+ pysam.faidx(str(ref_genome))
+ input_bam = pysam.AlignmentFile(bam_file, "rb")
+ genome_fasta = pysam.FastaFile(str(ref_genome))
+ total_bases = 0
+ correct_bases = 0
+ # For NUM_READS_TO_CHECK=100 this is <1s on most machines
+ for index, read in enumerate(input_bam.fetch()):
+ if index >= NUM_READS_TO_CHECK:
+ return correct_bases, total_bases
+
+ # The query sequence is the entire sequence as stored in the .bam file
+ # So it is reverse complemented if it was a reverse read
+ # Meaning we can compare it directly against the reference genome
+ read_sequence = read.query_sequence
+
+ # print(read.mapping_quality)
+
+ # get_aligned_pairs returns a list of (read_coord,ref_coord) pairs with None values when not aligned
+ # So if we just skip Nones and compare the remainder it'll tell us the accuracy
+ # (in Dorado-basecalled r10 files, as of July 2024, we observe some fraction of reads
+ # with empty read_sequence, despite having intact tags and alignment info. the reason
+ # for this isn't currentyl known, but with this None check we avoid errors in this alignment
+ # checking stage.)
+
+ if read_sequence is not None:
+ for pos_in_read, pos_in_ref in read.get_aligned_pairs():
+ if pos_in_read is not None and pos_in_ref is not None:
+ total_bases += 1
+ if read_sequence[pos_in_read] == str(
+ genome_fasta.fetch(
+ read.reference_name, pos_in_ref, pos_in_ref + 1
+ )
+ ):
+ correct_bases += 1
+
+ return correct_bases, total_bases
+
+
+def create_region_command_list(
+ output_path,
+ regions,
+ window_size,
+):
+ """
+ Creates commands to pass to modkit for specifying genomic regions.
+
+ TODO: Split into two function? Convert to bed, then construct commands
+ """
+
+ if regions is not None:
+ bed_filepath_processed = output_path / "regions.processed.bed"
+ regions_dict = utils.regions_dict_from_input(
+ regions,
+ window_size,
)
- if extractAllBases:
- return (basemod, np.array(refpos[all_bases_index]), probs)
- elif not modsPresent:
- return (None, [None], [None])
- else:
- return (basemod, np.array(refpos[keep]), probabilities[prob_keep])
-
-
-def update_methylation_aggregate_db(
- refpos_mod: np.ndarray,
- refpos_total: np.ndarray,
- basemod: str,
- center: bool,
- windowSize: int,
- window: Region,
- fileName: str,
- sampleName: str,
- outDir: str,
+ utils.bed_from_regions_dict(regions_dict, bed_filepath_processed)
+ region_specifier = ["--include-bed", str(bed_filepath_processed)]
+
+ else:
+ bed_filepath_processed = None
+ region_specifier = []
+
+ return region_specifier, bed_filepath_processed
+
+
+def read_by_base_txt_to_hdf5(
+ input_txt: str | Path,
+ output_h5: str | Path,
+ motif: str,
+ thresh: float | None = None,
+ quiet: bool = False,
+ compress_level: int = 1,
+ chunk_size: int = 1000,
) -> None:
- """Updates the aggregate methylation table with all of the methylation information from a single read.
+ """
+ Takes in a txt file generated by modkit extract and appends
+ all the data from a specified motif into an hdf5 file. If a thresh is specified, it
+ also binarizes the mod calls.
+
+ If the h5 file does not exist it will be created and datasets will be added for read_name,
+ chromosome, read_start, read_end, strand, motif, mod_vector, and val_vector.
+
+ All the datasets (exception threshold) are parallel arrays of length num_reads
+
+ Each read's position data is defined in genomic reference coordinates on the positive strand
+ (i.e. the read_start is the leftmost aligned position, read_end is the rightmost, vectors
+ are left to right along genomic coordinates)
+
+ TODO: Make a nice key:value map of the h5 file structure, make sure start and end are documented
+ as reconstructions NOT original cigarstring alignment info. mention pysam
+
Args:
- :param refpos_mod: list of modified reference positions
- :param refpos_total: list of all reference positions for the base in question
- df with columns pos:modification, pos, mod, methylated_bases, total_bases
+ input_txt: a string or Path pointing to a modkit extracted base-by-base modifications
+ file. This file is assumed to have been created by modkit v0.2.4, other versions may
+ have a different format and may not function normally.
+ output_h5: a string or Path pointing to a valid place to save an .h5 file. If this
+ file already exists, it will not be cleared and will simply be appended to.
+ motif: a string specifying a single base modification. Basemods are specified as
+ {sequence_motif},{position_of_modification},{optional mod_code}. For example,
+ a methylated adenine is specified as 'A,0' or 'A,0,a' and CpG methylation is
+ specified as 'CG,0' or 'CG,0,m'.
+ thresh: a floating point threshold for base modification calling, between zero and one.
+ If specified as None, raw probabilities will be saved in the .h5 output.
+ quiet: if True, this suppresses outputs
+ compress_level: gzip compression level for datasets, specifically for vectors for now
+ chunk_size: size of write chunks in reads
+
+ Returns:
+ None
+
"""
- # store list of entries for a given read
- data = []
- for pos in refpos_total:
- # only store positions within window
- if (center is True and abs(pos) <= windowSize) or (
- center is False and pos > window.begin and pos < window.end
- ):
- # key is pos:mod
- id = str(pos) + ":" + basemod
- if pos in refpos_mod:
- data.append((id, int(pos), basemod, 1, 1))
+ """
+ TODO: There are some issues that are all related here:
+ dimelo/parse_bam.py:718: error: Incompatible types in assignment (expression has type "Path | None", variable has type "str | Path") [assignment]
+ dimelo/parse_bam.py:725: error: Item "str" of "str | Path" has no attribute "open" [union-attr]
+ dimelo/parse_bam.py:890: error: Item "str" of "str | Path" has no attribute "name" [union-attr]
+
+ I'm not sure of the most elegant way to fix it. Come back and address.
+ """
+ input_txt, output_h5 = utils.sanitize_path_args(input_txt, output_h5)
+
+ parsed_motif = utils.ParsedMotif(motif)
+
+ read_name = ""
+ num_reads = 0
+ # TODO: I think the function calls can be consolidated; lots of repetition
+ # TODO: Consider opening both files at once
+ with input_txt.open() as txt:
+ # Check file length
+ for line_index, line in enumerate(txt):
+ fields = line.split("\t")
+ if line_index > 0 and read_name != fields[0]:
+ read_name = fields[0]
+ num_reads += 1
+ num_lines = line_index
+ txt.seek(0)
+
+ with h5py.File(output_h5, "a") as h5:
+ ## Define hdf5 dataset types for later
+ dt_str = h5py.string_dtype(encoding="utf-8")
+ # mod and val vectors -> uint8 allows us to just write whatever bytes we want
+ # h5py does not appear to otherwise support vlen binary
+ dt_vlen = h5py.vlen_dtype(np.dtype("uint8"))
+
+ ## Format threshold value and create dataset to store whether this data is thresholded (binary) or raw (float16)
+ # TODO: should this method thresholding without binarization
+ # None becomes NaN
+ threshold_to_store = np.nan if thresh is None else thresh
+ if "threshold" in h5:
+ threshold_from_existing = h5["threshold"][()]
+ if threshold_from_existing != threshold_to_store and not (
+ np.isnan(threshold_from_existing) and np.isnan(threshold_to_store)
+ ):
+ raise ValueError(
+ "existing threshold in output_h5 does not match provided threshold for read_by_base_txt_to_hdf5."
+ )
else:
- data.append((id, int(pos), basemod, 0, 1))
+ h5.create_dataset("threshold", data=threshold_to_store)
- if data: # if data to append is not empty
- DATABASE_NAME = (
- outDir + "/" + fileName.split("/")[-1].replace(".bam", "") + ".db"
- )
- # set variables for sqlite entry
- table_name = "methylationAggregate_" + sampleName
-
- # create or ignore if key already exists
- # need to add 0 filler here so later is not incremented during update command
- command = (
- """INSERT OR IGNORE INTO """
- + table_name
- + """ VALUES(?,?,?,?,?);"""
- )
+ ## Create read metadata datasets
+ # TODO: loop through dict instead?
+ if "read_name" in h5:
+ old_size = h5["read_name"].shape[0]
+ h5["read_name"].resize((old_size + num_reads,))
+ else:
+ old_size = 0
+ h5.create_dataset(
+ "read_name",
+ (num_reads,),
+ maxshape=(None,),
+ dtype=dt_str,
+ compression="gzip",
+ compression_opts=9,
+ )
+ if "chromosome" in h5:
+ if old_size != h5["chromosome"].shape[0]:
+ print("size mismatch: read_name:chromosome")
+ else:
+ h5["chromosome"].resize((old_size + num_reads,))
+ else:
+ h5.create_dataset(
+ "chromosome",
+ (num_reads,),
+ maxshape=(None,),
+ dtype=dt_str,
+ compression="gzip",
+ compression_opts=9,
+ )
+ if "read_start" in h5:
+ if old_size != h5["read_start"].shape[0]:
+ print("size mismatch", "read_name", "read_start")
+ else:
+ h5["read_start"].resize((old_size + num_reads,))
+ else:
+ h5.create_dataset(
+ "read_start",
+ (num_reads,),
+ maxshape=(None,),
+ dtype="i",
+ compression="gzip",
+ compression_opts=9,
+ )
+ if "read_end" in h5:
+ if old_size != h5["read_end"].shape[0]:
+ print("size mismatch", "read_name", "read_end")
+ else:
+ h5["read_end"].resize((old_size + num_reads,))
+ else:
+ h5.create_dataset(
+ "read_end",
+ (num_reads,),
+ maxshape=(None,),
+ dtype="i",
+ compression="gzip",
+ compression_opts=9,
+ )
+ if "strand" in h5:
+ if old_size != h5["strand"].shape[0]:
+ print("size mismatch", "read_name", "strand")
+ else:
+ h5["strand"].resize((old_size + num_reads,))
+ else:
+ h5.create_dataset(
+ "strand",
+ (num_reads,),
+ maxshape=(None,),
+ dtype=dt_str,
+ compression="gzip",
+ compression_opts=9,
+ )
+ if "motif" in h5:
+ if old_size != h5["motif"].shape[0]:
+ print("size mismatch", "read_name", "motif")
+ else:
+ h5["motif"].resize((old_size + num_reads,))
+ else:
+ h5.create_dataset(
+ "motif",
+ (num_reads,),
+ maxshape=(None,),
+ dtype=dt_str,
+ compression="gzip",
+ compression_opts=9,
+ )
- data_fill = [(x[0], x[1], x[2], 0, 0) for x in data]
- connection = sqlite3.connect(DATABASE_NAME, timeout=60.0)
- execute_sql_command(command, DATABASE_NAME, data_fill, connection)
- connection.close()
-
- # update table for all entries
- # values: methylated_bases, total_bases, id
- # these are entries 3, 4, 0 in list of tuples
- values_subset = [(x[3], x[4], x[0]) for x in data]
- command = (
- """UPDATE """
- + table_name
- + """ SET methylated_bases = methylated_bases + ?, total_bases = total_bases + ? WHERE id = ?"""
- )
- connection = sqlite3.connect(DATABASE_NAME, timeout=60.0)
- execute_sql_command(command, DATABASE_NAME, values_subset, connection)
- connection.close()
+ ## Create the vector datasets. These will contain raw bytes formatted into a uint8 array
+ # TODO: loop through dict instead
+ if "mod_vector" in h5:
+ if old_size != h5["mod_vector"].shape[0]:
+ print("size mismatch read_name:mod_vector")
+ else:
+ h5["mod_vector"].resize((old_size + num_reads,))
+ else:
+ h5.create_dataset(
+ "mod_vector",
+ (num_reads,),
+ maxshape=(None,),
+ dtype=dt_vlen,
+ # compression='gzip', # we are handling compression ourselves because hdf5 is bad at it
+ # compression_opts=9,
+ )
+ if "val_vector" in h5:
+ if old_size != h5["val_vector"].shape[0]:
+ print("size mismatch read_name:val_vector")
+ else:
+ h5["val_vector"].resize((old_size + num_reads,))
+ else:
+ h5.create_dataset(
+ "val_vector",
+ (num_reads,),
+ maxshape=(None,),
+ dtype=dt_vlen,
+ # compression='gzip', # we are handling compression ourselves because hdf5 is bad at it
+ # compression_opts=9,
+ )
+ ## Add data to datasets from txt file
+ # Initialize loop vars - these will go into datasets
+ # TODO: initialize read name to actual first read so we can get rid of the logic in the loop
+ read_name = ""
+ read_chrom = ""
+ read_len = 0
+ ref_strand = ""
+ read_start = 0
+ read_end = 0
+ valid_coordinates_list: list[int] = []
+ mod_values_list: list[float] = []
+
+ # Count reads for batched write
+ read_counter = 0
+ # Keys (strings): dataset names, values: lists of dataset values by read; string or ints or arrays
+ # Contents reset at the end of each chunk, after writing to h5
+ chunk_datasets_contents: defaultdict[str, list[str | int | np.ndarray]] = (
+ defaultdict(list)
+ )
+ # TODO: replace in loop with read_counter%chunk_size as appropriate
+ reads_in_chunk = 0
+
+ # Setting up progress bars if not in quiet mode
+ # Skip header
+ iterator = enumerate(txt)
+ next(iterator)
+ if not quiet:
+ iterator = tqdm(
+ iterator,
+ total=num_lines,
+ desc=f"Transferring {num_reads} from {input_txt.name} into {output_h5.name}, new size {old_size + num_reads}",
+ bar_format="{bar}| {desc} {percentage:3.0f}% | {elapsed}<{remaining}",
+ )
-def main():
- parser = argparse.ArgumentParser(
- description="Parse a bam file into DiMeLo database tables"
- )
+ # Loop through txt file
+ for line_index, line in iterator:
+ # TODO: use csv module
+ fields = line.split("\t")
+ pos_in_genome = int(fields[2])
+ canonical_base = fields[15]
+ prob = float(fields[10])
+ mod_code = fields[11]
+
+ if read_name != fields[0]:
+ # Record the previous read details unless this is the first line
+ if line_index > 1:
+ # TODO: Replace this with read_end-read_start; this will pad vectors and require
+ # regenerating test reference data
+ if len(valid_coordinates_list) > 0:
+ read_len_along_ref = max(valid_coordinates_list) + 1
+ else:
+ read_len_along_ref = read_len
+
+ # Populate mod vector array appropriately based on thresh settings
+ mod_vector = np.zeros(read_len_along_ref, dtype=np.uint8)
+ if thresh is None:
+ # We subtract 0.25 because in modkit they add 0.5, but our elements are zero when the
+ # base motif isn't present, so to get things to round to the right integers to match the
+ # original .bam file, subtracting 0.25 is good. Anything from 0.001 to 0.4999 would work I think
+ mod_vector[valid_coordinates_list] = np.rint(
+ np.array(mod_values_list) * 256 - 0.25
+ ).astype(np.uint8)
+ else:
+ mod_vector[valid_coordinates_list] = np.array(
+ mod_values_list
+ ).astype(np.uint8)
+ # TODO: consolidate compression into a function shared across
+ mod_vector_compressed = np.frombuffer(
+ gzip.compress(
+ mod_vector.tobytes(), compresslevel=compress_level
+ ),
+ dtype=np.uint8,
+ )
+
+ # Populate valid vector array
+ valid_vector = np.zeros(read_len_along_ref, dtype=np.uint8)
+ valid_vector[valid_coordinates_list] = 1
+ valid_vector_compressed = np.frombuffer(
+ gzip.compress(
+ valid_vector.tobytes(), compresslevel=compress_level
+ ),
+ dtype=np.uint8,
+ )
+
+ chunk_datasets_contents["read_name"].append(read_name)
+ chunk_datasets_contents["chromosome"].append(read_chrom)
+ chunk_datasets_contents["read_start"].append(read_start)
+ chunk_datasets_contents["read_end"].append(read_end)
+ chunk_datasets_contents["strand"].append(ref_strand)
+ chunk_datasets_contents["motif"].append(motif)
+ chunk_datasets_contents["mod_vector"].append(
+ mod_vector_compressed
+ )
+ chunk_datasets_contents["val_vector"].append(
+ valid_vector_compressed
+ )
+
+ # Write chunk if enough reads have built up
+ reads_in_chunk += 1
+ if reads_in_chunk >= chunk_size:
+ for dataset, entry in chunk_datasets_contents.items():
+ start_index = (
+ old_size + (read_counter // chunk_size) * chunk_size
+ )
+ end_index = old_size + read_counter + 1
+ h5[dataset][start_index:end_index] = entry
+ chunk_datasets_contents = defaultdict(list)
+ reads_in_chunk = 0
+ read_counter += 1
+
+ ## Set up for next read
+ read_name = fields[0]
+ read_chrom = fields[3]
+ read_len = int(fields[9])
+ ref_strand = fields[5]
+ # TODO: verify that read position is in the right (ref) coordinate system
+ if ref_strand == "+":
+ pos_in_read_ref = int(fields[1])
+ elif ref_strand == "-":
+ pos_in_read_ref = read_len - int(fields[1]) - 1
+ # Calculate read info
+ read_start = pos_in_genome - pos_in_read_ref
+ read_end = read_start + read_len
+ # Instantiate lists
+ mod_values_list = []
+ valid_coordinates_list = []
+
+ # Regardless of whether its a new read or not,
+ # add modification to vector if motif type is correct
+ # for the motif in question
+ if (
+ canonical_base == parsed_motif.modified_base
+ and mod_code in parsed_motif.mod_codes
+ ):
+ valid_coordinates_list.append(pos_in_genome - read_start)
+ if thresh is None:
+ mod_values_list.append(prob)
+ elif prob >= thresh:
+ mod_values_list.append(1)
+ else:
+ mod_values_list.append(0)
+
+ # Save the last read
+ # TODO: try to consolidate
+ if len(read_name) > 0:
+ # Build the vectors
+ if len(valid_coordinates_list) > 0:
+ read_len_along_ref = max(valid_coordinates_list) + 1
+ else:
+ read_len_along_ref = read_len
+
+ # Populate mod vector array appropriately based on thresh settings
+ mod_vector = np.zeros(read_len_along_ref, dtype=np.uint8)
+ if thresh is None:
+ # We subtract 0.25 because in modkit they add 0.5, but our elements are zero when the
+ # base motif isn't present, so to get things to round to the right integers to match the
+ # original .bam file, subtracting 0.25 is good. Anything from 0.001 to 0.4999 would work I think
+ mod_vector[valid_coordinates_list] = np.rint(
+ np.array(mod_values_list) * 256 - 0.25
+ ).astype(np.uint8)
+ else:
+ mod_vector[valid_coordinates_list] = np.array(
+ mod_values_list
+ ).astype(np.uint8)
+ # TODO: consolidate compression into a function shared across
+ mod_vector_compressed = np.frombuffer(
+ gzip.compress(mod_vector.tobytes(), compresslevel=compress_level),
+ dtype=np.uint8,
+ )
- # Required arguments
- required_args = parser.add_argument_group("required arguments")
- required_args.add_argument(
- "-f",
- "--fileName",
- required=True,
- help="name of bam file with Mm and Ml tags",
- )
- required_args.add_argument(
- "-s",
- "--sampleName",
- required=True,
- help="name of sample for output SQL table name labelling",
- )
- required_args.add_argument(
- "-o",
- "--outDir",
- required=True,
- help="directory where SQL database is stored",
- )
+ # Populate valid vector array
+ valid_vector = np.zeros(read_len_along_ref, dtype=np.uint8)
+ valid_vector[valid_coordinates_list] = 1
+ valid_vector_compressed = np.frombuffer(
+ gzip.compress(valid_vector.tobytes(), compresslevel=compress_level),
+ dtype=np.uint8,
+ )
- # Required, mutually exclusive arguments
- window_group = parser.add_mutually_exclusive_group(required=True)
- window_group.add_argument(
- "-b",
- "--bedFile",
- help="name of bed file that defines regions of interest over which to extract mod calls",
- )
- window_group.add_argument(
- "-r",
- "--region",
- help='single region over which to extract base mods, e.g. "chr1:1-100000"',
- )
+ chunk_datasets_contents["read_name"].append(read_name)
+ chunk_datasets_contents["chromosome"].append(read_chrom)
+ chunk_datasets_contents["read_start"].append(read_start)
+ chunk_datasets_contents["read_end"].append(read_end)
+ chunk_datasets_contents["strand"].append(ref_strand)
+ chunk_datasets_contents["motif"].append(motif)
+ chunk_datasets_contents["mod_vector"].append(mod_vector_compressed)
+ chunk_datasets_contents["val_vector"].append(valid_vector_compressed)
+
+ for dataset, entry in chunk_datasets_contents.items():
+ start_index = old_size + (read_counter // chunk_size) * chunk_size
+ end_index = old_size + read_counter + 1
+ h5[dataset][start_index:end_index] = entry
+ read_counter += 1
+ return
+
+
+def prep_output_directory(
+ output_directory: Path | None,
+ output_name: str,
+ input_file: Path,
+ output_file_names: list[str],
+) -> tuple[Path, list[Path]]:
+ """
+ As a side effect, if files exist that match the requested outputs, they are deleted.
- # Optional arguments
- parser.add_argument(
- "-m",
- "--basemod",
- type=str,
- default=DEFAULT_BASEMOD,
- choices=["A", "CG", "A+CG"],
- help="which base modifications to extract",
- )
- parser.add_argument(
- "-A",
- "--threshA",
- type=int,
- default=DEFAULT_THRESH_A,
- help="threshold above which to call an A base methylated",
- )
- parser.add_argument(
- "-C",
- "--threshC",
- type=int,
- default=DEFAULT_THRESH_C,
- help="threshold above which to call a C base methylated",
- )
- parser.add_argument(
- "-e",
- "--extractAllBases",
- action="store_true",
- help="store all base mod calls, regardless of methylation probability threshold",
- )
- parser.add_argument(
- "-p",
- "--cores",
- type=int,
- help="number of cores over which to parallelize",
- )
- parser.add_argument(
- "-c",
- "--center",
- action="store_true",
- help="report positions with respect to center of motif window; only valid with bed file input",
- )
- parser.add_argument(
- "-w",
- "--windowSize",
- type=int,
- default=DEFAULT_WINDOW_SIZE,
- help=f"window size around center point of feature of interest to plot (+/-); only mods within this window are stored (default: {DEFAULT_WINDOW_SIZE} bp)",
- )
+ TODO: Is it kind of silly that this takes in input_file? Maybe should take in some generic default parameter, or this default should be set outside this method?
+
+ Args:
+ output_directory: Path pointing to an output directory.
+ If left as None, outputs will be stored in a new folder within the input
+ directory.
+ output_name: a string that will be used to create an output folder
+ containing the intermediate and final outputs, along with any logs.
+ input_file: Path to input file; used to define default output directory
+ output_file_names: list of names of desired output files
+
+ Returns:
+ * Path to top-level output directory
+ * List of Paths to requested output files
+ """
+ if output_directory is None:
+ output_directory = input_file.parent
+ print(f"No output directory provided, using input directory {output_directory}")
+
+ output_path = output_directory / output_name
+
+ output_files = [output_path / file_name for file_name in output_file_names]
+
+ # Ensure output path exists, and that any of the specified output files do not already exist (necessary for some outputs)
+ # Delete the files that do already exist
+ output_path.mkdir(parents=True, exist_ok=True)
+ for output_file in output_files:
+ output_file.unlink(missing_ok=True)
- args = parser.parse_args()
- parse_bam(**vars(args))
+ return output_path, output_files
diff --git a/dimelo/plot_browser.py b/dimelo/plot_browser.py
deleted file mode 100644
index 974e2dc3..00000000
--- a/dimelo/plot_browser.py
+++ /dev/null
@@ -1,847 +0,0 @@
-r"""
-=================
-plot_browser module
-=================
-.. currentmodule:: dimelo.plot_browser
-.. autosummary::
- plot_browser
-
-plot_browser plots single molecules with colored base modifications in region of interest
-
-
-Portions of code adapted from methplotlib:
-Copyright (c) 2018 Wouter De Coster
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-"""
-
-# code adapted from methplotlib
-# https://doi.org/10.1093/bioinformatics/btaa093
-
-import argparse
-import multiprocessing
-import os
-import sqlite3
-import sys
-
-import matplotlib.pyplot as plt
-import pandas as pd
-import plotly
-import plotly.graph_objs as go
-import pyranges as pr
-import seaborn as sns
-
-from dimelo.parse_bam import parse_bam
-
-# import plotly.io as pio
-
-COLOR_A = "#053C5E"
-COLOR_C = "#BB4430"
-DEFAULT_THRESH_A = 129
-DEFAULT_THRESH_C = 129
-DEFAULT_SMOOTH = 1000
-DEFAULT_MIN_PERIODS = 100
-DEFAULT_DOTSIZE = 4
-
-
-class DataTraces(object):
- def __init__(self, traces, names):
- self.traces = traces
- self.names = names
- self.index = 0
-
- def __iter__(self):
- return self
-
- def __next__(self):
- if self.index == len(self.traces):
- raise StopIteration
- else:
- self.index += 1
- return self.traces[self.index - 1]
-
-
-class Region(object):
- def __init__(self, region, fasta=None):
- if ":" in region:
- try:
- self.chromosome, interval = region.replace(",", "").split(":")
- self.begin, self.end = [int(i) for i in interval.split("-")]
- except ValueError:
- sys.exit(
- "\n\nERROR: Region (-w/--region) inproperly formatted, "
- "examples of accepted formats are:\n"
- "'chr5:150200605-150423790'\n\n"
- )
- self.size = self.end - self.begin
- self.string = f"{self.chromosome}_{self.begin}_{self.end}"
-
-
-def plot_browser(
- fileNames,
- sampleNames,
- region,
- basemod,
- outDir,
- threshA=DEFAULT_THRESH_A,
- threshC=DEFAULT_THRESH_C,
- bedFileFeatures=None,
- smooth=DEFAULT_SMOOTH,
- min_periods=DEFAULT_MIN_PERIODS,
- colorA=COLOR_A,
- colorC=COLOR_C,
- dotsize=DEFAULT_DOTSIZE,
- static=False,
- cores=None,
-):
- """
- fileNames
- list of names of bam files with Mm and Ml tags; indexed; or single file name as string
- sampleNames
- list of names of samples for output plot name labelling; or single sample name as string; valid names contain [``a-zA-Z0-9_``].
- region
- formatted as for example: "chr1:1-100000"
- basemod
- One of the following:
-
- * ``'A'`` - extract mA only
- * ``'CG'`` - extract mCpG only
- * ``'A+CG'`` - extract mA and mCpG
- outDir
- directory to output plot
- threshA
- threshold for calling mA; default 129
- threshC
- threshold for calling mCG; default 129
- bedFileFeatures
- bedFile specifying regions to display in browser (optional); default None
- smooth
- window over which to smooth aggregate curve; default of 1000 bp
- min_periods
- minimum number of bases to consider for smoothing: default of 100 bp
- colorA
- color in hex for mA; default #053C5E
- colorC
- color in hex for mCG; default #BB4430
- dotsize
- size of points; default 4
- static
- One of the following:
-
- * ``'True'`` - pdf output
- * ``'False'`` - interactive html output; default is False
- cores
- number of cores over which to parallelize; default is all available
-
- **Example**
-
- >>> dm.plot_browser("dimelo/test/data/mod_mappings_subset.bam", "test", "chr1:2907273-2909473", "A+CG", "dimelo/dimelo_test", static=False)
- >>> dm.plot_browser(["dimelo/test/data/mod_mappings_subset.bam", "dimelo/test/data/mod_mappings_subset.bam"], ["test1", "test2"], "chr1:2907273-2909473", "A+CG", "dimelo/dimelo_test", static=False)
-
- **Return**
-
- * PDF or HTML file with single molecules displayed over region of interest. Modified bases are colored according to colorA and colorC.
- * PDFs of aggregate coverage and fraction of bases modified over region of interest.
- * A summary bed file is also produced to support visualizing aggregate data with any genome browser tool. The columns of this bed file are chr, start, end, methylated_bases, total_bases. For example, to take a summary output bed and create a file with fraction of modified bases with a window size of 100 bp for visualization with the WashU browser, you could run the below commands in terminal:
-
- * ``bedtools makewindows -g ref_genome.chromsizes.txt -w 100 > ref_genome_windows.100.bp.bed``
- * ``bedtools map -a ref_genome_windows.100.bp.bed -b outDir/fileName_sampleName_chr_start_end_A.bed -c 4,5 -o sum,sum -null 0 | awk -v "OFS=\\t" '{if($5>0){print $1,$2,$3,$4/$5}else{print $1,$2,$3,$5}}' > outDir/fileName_sampleName_chr_start_end_A.100.bed``
- * ``bgzip outDir/fileName_sampleName_chr_start_end_A.100.bed``
- * ``tabix -f -p bed outDir/fileName_sampleName_chr_start_end_A.100.bed.gz``
-
- **Example Plots**
-
- :ref:`sphx_glr_auto_examples_browser_example.py`
- """
-
- if not os.path.isdir(outDir):
- os.makedirs(outDir)
-
- cores_avail = multiprocessing.cpu_count()
- if cores is None:
- num_cores = cores_avail
- else:
- # if more than available cores is specified, process with available cores
- if cores > cores_avail:
- num_cores = cores_avail
- else:
- num_cores = cores
-
- # if single bam file rather than list is entered, convert to list
- if type(fileNames) != list:
- fileNames = [fileNames]
- # if single sample name rather than list is entered, convert to list
- if type(sampleNames) != list:
- sampleNames = [sampleNames]
-
- all_data = []
- aggregate_counts = []
- for f, n in zip(fileNames, sampleNames):
- # extract all bases and both mods to get full extent of read in terms of any mod bases
- parse_bam(
- f,
- n,
- outDir,
- basemod="A+CG",
- region=region, # pass string representation
- threshA=threshA,
- threshC=threshC,
- extractAllBases=True,
- cores=num_cores,
- )
- d = pd.read_sql(
- "SELECT * from methylationByBase_" + n,
- sqlite3.connect(
- outDir + "/" + f.split("/")[-1].replace(".bam", "") + ".db"
- ),
- )
- all_data.append(d)
- aggregate_counts.append(
- pd.read_sql(
- "SELECT * from methylationAggregate_" + n,
- sqlite3.connect(
- outDir + "/" + f.split("/")[-1].replace(".bam", "") + ".db"
- ),
- )
- )
-
- # print number of reads for each sample
- print(
- "processing "
- + str(len(d["read_name"].unique()))
- + " reads for "
- + n
- + " for bam: "
- + f
- )
-
- meth_browser(
- all_data=all_data,
- aggregate_counts=aggregate_counts,
- basemod=basemod,
- region=Region(region),
- sampleNames=sampleNames,
- outDir=outDir,
- bed=bedFileFeatures,
- smooth=smooth,
- min_periods=min_periods,
- dotsize=dotsize,
- static=static,
- threshA=threshA,
- threshC=threshC,
- colorA=colorA,
- colorC=colorC,
- )
-
- # print output files to std out
- if static is False:
- ext = "html"
- else:
- ext = "pdf"
-
- db_paths = []
- f_paths = []
- t_paths = []
- b_paths = []
-
- for f, s in zip(fileNames, sampleNames):
- db = outDir + "/" + f.split("/")[-1].replace(".bam", "") + ".db"
- db_paths.append(db)
- f_base = f.split("/")[-1].replace(".bam", "")
- if "A" in basemod:
- f_path = (
- outDir + "/" + s + "_" + "A" + "_sm_rolling_avg_fraction.pdf"
- )
- t_path = outDir + "/" + s + "_" + "A" + "_sm_rolling_avg_total.pdf"
- b_path = f"{outDir}/{f_base}_{s}_{Region(region).string}_A.bed"
- f_paths.append(f_path)
- t_paths.append(t_path)
- b_paths.append(b_path)
- if "C" in basemod:
- f_path = (
- outDir + "/" + s + "_" + "CG" + "_sm_rolling_avg_fraction.pdf"
- )
- t_path = (
- outDir + "/" + s + "_" + "CG" + "_sm_rolling_avg_total.pdf"
- )
- b_path = f"{outDir}/{f_base}_{s}_{Region(region).string}_CG.bed"
- f_paths.append(f_path)
- t_paths.append(t_path)
- b_paths.append(b_path)
-
- w = Region(region)
-
- browser_path = f"{outDir}/methylation_browser_{w.string}.{ext}"
- str_out = f"Outputs\n_______\nDB file: {db_paths}\nbrowser plot: {browser_path}\nrolling average fraction bases methylated plot: {f_paths}\nrolling average total bases plot: {t_paths}\nsummary bed file: {b_paths}"
- print(str_out)
-
-
-def create_subplots(num_methrows, names=None, annotation=True):
- """
- Prepare the panels (rows * 1 column) for the subplots.
- One row for each dataset, taking 90%/len(datasets) for heights
- if annotation is True (bed) then add a row with height 10%
- """
- return plotly.subplots.make_subplots(
- rows=num_methrows + annotation,
- cols=1,
- shared_xaxes=True,
- specs=[[{}] for i in range(num_methrows + annotation)],
- print_grid=False,
- subplot_titles=names,
- vertical_spacing=0.1 if num_methrows < 10 else 0.01,
- row_heights=[0.9 / num_methrows] * num_methrows + [0.1] * annotation,
- )
-
-
-def create_output(fig, outfile, region, static, outDir):
- """
- write output pdf or html
- """
- if static:
- outfile = outDir + "/" + f"methylation_browser_{region.string}.pdf"
- fig.write_image(outfile, width=1000, height=400)
- if not static:
- outfile = outDir + "/" + f"methylation_browser_{region.string}.html"
- with open(outfile, "w+") as output:
- output.write(
- plotly.offline.plot(
- fig,
- output_type="div",
- show_link=False,
- include_plotlyjs="cdn",
- )
- )
-
-
-def methylation(
- all_data,
- sampleNames,
- basemod,
- colorA=COLOR_A,
- colorC=COLOR_C,
- dotsize=4,
- threshA=DEFAULT_THRESH_A,
- threshC=DEFAULT_THRESH_C,
-):
- """
- Plot methylation traces
- """
- traces = []
- names = []
- for m, n in zip(all_data, sampleNames):
- traces.append(
- make_per_read_meth_traces_phred(
- all_data=all_data,
- table=m,
- basemod=basemod,
- colorA=colorA,
- colorC=colorC,
- dotsize=dotsize,
- threshA=threshA,
- threshC=threshC,
- )
- )
- names.append(n)
- return DataTraces(traces=traces, names=names)
-
-
-def make_per_read_meth_traces_phred(
- all_data,
- table,
- basemod,
- colorA,
- colorC,
- max_cov=1000,
- dotsize=4,
- threshA=DEFAULT_THRESH_A,
- threshC=DEFAULT_THRESH_C,
-):
- """Make traces for each read"""
- minmax_table = find_min_and_max_pos_per_read(table)
- df_heights = assign_y_height_per_read(minmax_table, max_coverage=max_cov)
- table = pd.merge(table, df_heights, left_on="read_name", right_on="read")
- traces = []
- hidden = 0
- for read in table["read_name"].unique():
- try:
- traces.append(
- make_per_read_line_trace(
- read_range=minmax_table.loc[read],
- y_pos=df_heights.loc[read, "height"],
- )
- )
- except KeyError:
- hidden += 1
- continue
- if hidden:
- sys.stderr.write(
- f"Warning: hiding {hidden} reads because coverage above {max_cov}x.\n"
- )
- read_table_mC = table[table["mod"].str.contains("C")]
- read_table_mA = table[table["mod"].str.contains("A")]
- cmapA = ["white", colorA]
- cmapC = ["white", colorC]
- if "C" in basemod:
- traces.append(
- make_per_position_phred_scatter(
- all_data=all_data,
- read_table=read_table_mC[read_table_mC["prob"] >= threshC],
- mod="mC",
- thresh=threshC,
- dotsize=dotsize,
- colorscale=cmapC,
- offset=0.05,
- )
- )
- if "A" in basemod:
- traces.append(
- make_per_position_phred_scatter(
- all_data=all_data,
- read_table=read_table_mA[read_table_mA["prob"] >= threshA],
- mod="mA",
- thresh=threshA,
- dotsize=dotsize,
- colorscale=cmapA,
- offset=0.15,
- )
- )
- return traces
-
-
-def make_per_position_phred_scatter(
- all_data, read_table, mod, thresh, dotsize=4, colorscale="Reds", offset=0
-):
- """Make scatter plot per modified base per read"""
- # get min and max probabilities across all for legend and color consistency for comparisons
- if "C" in mod:
- m = "C"
- if "A" in mod:
- m = "A"
- min_overall = 255
- max_overall = 0
- for d in all_data:
- min_temp = d[(d["mod"].str.contains(m)) & (d["prob"] >= thresh)][
- "prob"
- ].min()
- max_temp = d[(d["mod"].str.contains(m)) & (d["prob"] >= thresh)][
- "prob"
- ].max()
- if min_temp < min_overall:
- min_overall = min_temp
- if max_temp > max_overall:
- max_overall = max_temp
- return go.Scatter(
- x=read_table["pos"],
- y=read_table["height"],
- mode="markers",
- showlegend=False,
- text=round(read_table["prob"] / 255, 2),
- hoverinfo="text",
- marker=dict(
- size=dotsize,
- color=read_table["prob"],
- colorscale=colorscale,
- colorbar=dict(
- title=mod + " probability",
- titleside="right",
- tickmode="array",
- tickvals=[min_overall, max_overall],
- ticktext=[
- str(round(min_overall / 255, 2)),
- str(round(max_overall / 255, 2)),
- ],
- ticks="outside",
- thickness=15,
- x=offset + 1,
- ),
- ),
- )
-
-
-def find_min_and_max_pos_per_read(table):
- """Return a table with for every read the minimum and maximum position"""
- mm_table = (
- table.loc[:, ["read_name", "pos"]]
- .groupby("read_name")
- .min()
- .join(
- table.loc[:, ["read_name", "pos"]].groupby("read_name").max(),
- lsuffix="min",
- rsuffix="max",
- )
- )
- return mm_table
-
-
-def assign_y_height_per_read(df, max_coverage=1000):
- """Assign height of the read in the per read traces
- Gets a dataframe of read_name, posmin and posmax.
- Sorting by position.
- Determines optimal height (y coordinate) for this read
- Returns a dictionary mapping read_name to y_coord
- """
- dfs = df.sort_values(by=["posmin", "posmax"], ascending=[True, False])
- heights = [[] for i in range(max_coverage)]
- y_pos = dict()
- for read in dfs.itertuples():
- for y, layer in enumerate(heights, start=1):
- if len(layer) == 0:
- layer.append(read.posmax)
- y_pos[read.Index] = y
- break
- if read.posmin > layer[-1]:
- layer.append(read.posmax)
- y_pos[read.Index] = y
- break
- return pd.DataFrame(
- {"read": list(y_pos.keys()), "height": list(y_pos.values())}
- ).set_index("read")
-
-
-def make_per_read_line_trace(read_range, y_pos):
- """
- Make a grey line trace for a single read
- """
- return go.Scatter(
- x=[read_range["posmin"], read_range["posmax"]],
- y=[y_pos, y_pos],
- mode="lines",
- line=dict(width=1, color="lightgrey"),
- showlegend=False,
- )
-
-
-def meth_browser(
- all_data,
- aggregate_counts,
- basemod,
- region,
- sampleNames,
- outDir,
- smooth,
- min_periods,
- bed=False,
- outfile=None,
- dotsize=4,
- static=False,
- threshA=DEFAULT_THRESH_A,
- threshC=DEFAULT_THRESH_C,
- colorA=COLOR_A,
- colorC=COLOR_C,
-):
- """
- meth_data is a list of methylationByBase tables as dataframes
- all_dict is a list of methylationAggregate tables as dataframes
- annotation is optional and is a bed file
- then show one line per sample and one for the annotation, with methrows = number of datasets
- the trace to be used for annotation is thus always num_methrows + 1
- """
- meth_traces = methylation(
- all_data,
- sampleNames,
- basemod,
- colorA=colorA,
- colorC=colorC,
- dotsize=dotsize,
- threshA=threshA,
- threshC=threshC,
- )
-
- num_methrows = len(all_data)
- annot_row = num_methrows + 1
- annot_axis = f"yaxis{annot_row}"
- fig = create_subplots(
- num_methrows, names=meth_traces.names, annotation=bool(bed)
- )
- for y, sample_traces in enumerate(meth_traces, start=1):
- for meth_trace in sample_traces:
- fig.add_trace(trace=meth_trace, row=y, col=1)
- fig["layout"][f"yaxis{y}"].update(title="Reads")
- if bed:
- for annot_trace in bed_annotation(bed, region):
- fig.add_trace(trace=annot_trace, row=annot_row, col=1)
- y_max = -2
- if bed:
- fig["layout"][annot_axis].update(
- range=[-2, y_max + 1],
- showgrid=False,
- zeroline=False,
- showline=False,
- ticks="",
- showticklabels=False,
- )
- fig["layout"]["xaxis"].update(
- tickformat="g",
- separatethousands=True,
- range=[region.begin, region.end],
- )
- fig["layout"].update(
- barmode="overlay",
- title=region.chromosome,
- hovermode="closest",
- plot_bgcolor="rgba(0,0,0,0)",
- )
- if num_methrows > 10:
- for i in fig["layout"]["annotations"]:
- i["font"]["size"] = 10
- create_output(fig, outfile, region, static, outDir)
-
- i = 0
- for d in aggregate_counts:
- plot_aggregate(
- sampleNames[i],
- d,
- smooth,
- min_periods,
- region,
- basemod,
- outDir,
- colorA,
- colorC,
- )
- i = i + 1
-
-
-def bed_annotation(bed, region):
- return [
- go.Scatter(
- x=[begin, end],
- y=[-2, -2],
- mode="lines",
- line=dict(width=16, color="grey"),
- text=name,
- hoverinfo="text",
- showlegend=False,
- )
- for (begin, end, name) in parse_bed(bed, region)
- ]
-
-
-def parse_bed(bed, region):
- gr = pr.read_bed(bed)[region.chromosome, region.begin : region.end]
- df = gr.unstrand().df
- df = df.drop(columns=["Chromosome", "Score", "Strand"], errors="ignore")
- if "Name" not in df.columns:
- df["Name"] = "noname"
- df_short = df[df.columns[0:3]]
- return df_short.itertuples(index=False, name=None)
-
-
-def plot_aggregate(
- sampleName,
- aggregate_counts,
- smooth,
- min_periods,
- region,
- basemod,
- outDir,
- colorA,
- colorC,
-):
- """
- plot rolling aggregate of frac methylated
- plot rolling aggregate of total bases
- """
-
- aggregate_counts["frac"] = (
- aggregate_counts["methylated_bases"] / aggregate_counts["total_bases"]
- )
-
- # plot aggregate of fraction and of total count coverage
- if "A" in basemod:
- aggregate_A = aggregate_counts[
- aggregate_counts["mod"].str.contains("A")
- ].copy()
- # need to sort first!
- aggregate_A.sort_values(["pos"], inplace=True)
- aggregate_A_rolling = aggregate_A.rolling(
- window=smooth, min_periods=min_periods, center=True, on="pos"
- ).mean()
- plot_aggregate_frac(
- aggregate_A_rolling, sampleName, "A", colorA, outDir
- )
- plot_aggregate_total(
- aggregate_A_rolling, sampleName, "A", colorA, outDir
- )
- if "C" in basemod:
- aggregate_C = aggregate_counts[
- aggregate_counts["mod"].str.contains("C")
- ].copy()
- # need to sort first!
- aggregate_C.sort_values(["pos"], inplace=True)
- aggregate_C_rolling = aggregate_C.rolling(
- window=smooth, min_periods=min_periods, center=True, on="pos"
- ).mean()
- plot_aggregate_frac(
- aggregate_C_rolling, sampleName, "C", colorC, outDir
- )
- plot_aggregate_total(
- aggregate_C_rolling, sampleName, "C", colorC, outDir
- )
-
-
-def plot_aggregate_frac(aggregate_rolling, sampleName, mod, color, outDir):
- fig = plt.figure()
- sns.lineplot(
- x=aggregate_rolling["pos"],
- y=aggregate_rolling["frac"],
- color=color,
- )
- if "A" in mod:
- mod_name = "A"
- if "C" in mod:
- mod_name = "CG"
-
- plt.title(mod_name)
- plt.ylabel("m" + mod_name + "/" + mod_name)
- fig.savefig(
- outDir
- + "/"
- + sampleName
- + "_"
- + mod_name
- + "_sm_rolling_avg_fraction.pdf"
- )
- plt.close()
-
-
-def plot_aggregate_total(aggregate_rolling, sampleName, mod, color, outDir):
- fig = plt.figure()
- sns.lineplot(
- x=aggregate_rolling["pos"],
- y=aggregate_rolling["total_bases"],
- color=color,
- )
- if "A" in mod:
- mod_name = "A"
- if "C" in mod:
- mod_name = "CG"
-
- plt.title(mod_name)
- plt.ylabel("total " + mod_name)
- fig.savefig(
- outDir
- + "/"
- + sampleName
- + "_"
- + mod_name
- + "_sm_rolling_avg_total.pdf"
- )
- plt.close()
-
-
-def main():
- parser = argparse.ArgumentParser(description="DiMeLo plot browser")
-
- # Required arguments
- required_args = parser.add_argument_group("required arguments")
- required_args.add_argument(
- "-f", "--fileNames", required=True, nargs="+", help="bam file name(s)"
- )
- required_args.add_argument(
- "-s",
- "--sampleNames",
- required=True,
- nargs="+",
- help="sample name(s) for output file labelling",
- )
- required_args.add_argument(
- "-r",
- "--region",
- required=True,
- type=str,
- help='single region over which to extract base mods, e.g. "chr1:1-100000"',
- )
- required_args.add_argument(
- "-m",
- "--basemod",
- required=True,
- type=str,
- choices=["A", "CG", "A+CG"],
- help="which base modification to extract",
- )
- required_args.add_argument(
- "-o", "--outDir", required=True, help="directory to output plot"
- )
-
- # Smoothing options
- smoothing_args = parser.add_argument_group("smoothing options")
- smoothing_args.add_argument(
- "-t",
- "--smooth",
- type=int,
- default=DEFAULT_SMOOTH,
- help="window over which to smooth aggregate curve",
- )
- smoothing_args.add_argument(
- "-n",
- "--min_periods",
- type=int,
- default=DEFAULT_MIN_PERIODS,
- help="minimum number of bases to consider for smoothing",
- )
-
- # Plotting arguments
- plotting_args = parser.add_argument_group("plotting options")
- plotting_args.add_argument(
- "--colorA",
- type=str,
- default=COLOR_A,
- help='color in hex (e.g. "#BB4430") for mA',
- )
- plotting_args.add_argument(
- "--colorC",
- type=str,
- default=COLOR_C,
- help='color in hex (e.g. "#BB4430") for mCG',
- )
- plotting_args.add_argument(
- "-d",
- "--dotsize",
- type=float,
- default=DEFAULT_DOTSIZE,
- help="size of points",
- )
-
- # Optional arguments
- parser.add_argument(
- "-A",
- "--threshA",
- type=int,
- default=DEFAULT_THRESH_A,
- help="threshold above which to call an A base methylated",
- )
- parser.add_argument(
- "-C",
- "--threshC",
- type=int,
- default=DEFAULT_THRESH_C,
- help="threshold above which to call a C base methylated",
- )
- parser.add_argument(
- "-b",
- "--bedFileFeatures",
- help="bed file specifying annotation to display in browser",
- )
- parser.add_argument(
- "--static",
- action="store_true",
- help="output as PDF instead of interactive HTML",
- )
- parser.add_argument(
- "-p",
- "--cores",
- type=int,
- help="number of cores over which to parallelize",
- )
-
- args = parser.parse_args()
- plot_browser(**vars(args))
diff --git a/dimelo/plot_depth_histogram.py b/dimelo/plot_depth_histogram.py
new file mode 100644
index 00000000..95b1f099
--- /dev/null
+++ b/dimelo/plot_depth_histogram.py
@@ -0,0 +1,271 @@
+from pathlib import Path
+
+import numpy as np
+from matplotlib.axes import Axes
+
+from . import load_processed, utils
+
+
+def plot_depth_histogram(
+ mod_file_names: list[str | Path],
+ regions_list: list[str | Path | list[str | Path]],
+ motifs: list[str],
+ sample_names: list[str],
+ window_size: int | None = None,
+ single_strand: bool = False,
+ one_depth_per_region: bool = False,
+ quiet: bool = False,
+ cores: int | None = None,
+ split_large_regions: bool = False,
+ **kwargs,
+) -> Axes:
+ """
+ Plot depth histograms, overlaying the results on top of each other.
+
+ Each input list is expected to be parallel and the same length. Each index represents one analysis condition across the lists.
+ Using the same file for multiple conditions requires adding the same file multiple times, in the appropriate indices.
+
+ This is the most flexible method for depth histogram plotting. For most use cases, consider
+ using one of the plot_depth_histogram.by_* methods.
+
+ Args:
+ mod_file_names: list of paths to modified base data files
+ bed_file_names: list of paths to bed files specifying centered equal-length regions
+ mod_names: list of modifications to extract; expected to match mods available in the relevant mod_files
+ sample_names: list of names to use for labeling traces in the output; legend entries
+ window_size: half-size of the desired window to plot; how far the window stretches on either side of the center point
+ single_strand: True means we only grab counts from reads from the same strand as
+ the region of interest, False means we always grab both strands within the regions
+ one_depth_per_region: if True, each region will only report a single depth value, averaging across all non-zero depths. If False
+ depths will be reported separately for all nonzero count positions in each region for a more granular view of depth distribution.
+ quiet: disables progress bars
+ cores: CPU cores across which to parallelize processing. Default to None, which means all available.
+ split_large_regions: if True, regions will be run sequentially in parallelized chunks. If False,
+ each individual region's chunks will be run sequentially but there will be parallelization across
+ regions, i.e. each core will be assigned one region at a time by the executor. Set to True if you
+ are running a small number of very large regions (e.g. one or two chromosomes), otherwise to to False (default).
+ kwargs: other keyword parameters passed through to utils.line_plot
+
+ Returns:
+ Axes object containing the plot
+ """
+ if not utils.check_len_equal(mod_file_names, regions_list, motifs, sample_names):
+ raise ValueError("Unequal number of inputs")
+
+ depth_vectors = get_depth_counts(
+ mod_file_names=mod_file_names,
+ regions_list=regions_list,
+ motifs=motifs,
+ window_size=window_size,
+ single_strand=single_strand,
+ one_depth_per_region=one_depth_per_region,
+ quiet=quiet,
+ cores=cores,
+ )
+
+ axes = make_depth_histogram_plot(
+ depth_vectors=depth_vectors,
+ sample_names=sample_names,
+ one_depth_per_region=one_depth_per_region,
+ y_label="regions count" if one_depth_per_region else "positions count",
+ **kwargs,
+ )
+ return axes
+
+
+def by_modification(
+ mod_file_name: str | Path,
+ regions: str | Path,
+ motifs: list[str],
+ **kwargs,
+) -> Axes:
+ """
+ Plot depth histogram, holding modification file and regions constant, varying modification types
+
+ See plot_depth_histogram for details.
+ """
+ n_mods = len(motifs)
+ return plot_depth_histogram(
+ mod_file_names=[mod_file_name] * n_mods,
+ regions_list=[regions] * n_mods,
+ motifs=motifs,
+ sample_names=[f"{motif} depth" for motif in motifs],
+ **kwargs,
+ )
+
+
+def by_regions(
+ mod_file_name: str | Path,
+ regions_list: list[str | Path | list[str | Path]],
+ motif: str,
+ sample_names: list[str] | None = None,
+ **kwargs,
+) -> Axes:
+ """
+ Plot depth histogram, holding modification file and modification types constant, varying regions
+
+ Note: Sample names default to the names of the bed files.
+
+ See plot_depth_histogram for details.
+ """
+ if sample_names is None:
+ sample_names = regions_list
+ n_beds = len(regions_list)
+ return plot_depth_histogram(
+ mod_file_names=[mod_file_name] * n_beds,
+ regions_list=regions_list,
+ motifs=[motif] * n_beds,
+ sample_names=[f"{sample_name} depth" for sample_name in sample_names],
+ **kwargs,
+ )
+
+
+def by_dataset(
+ mod_file_names: list[str | Path],
+ regions: str | Path | list[str | Path],
+ motif: str,
+ sample_names: list[str] | None = None,
+ **kwargs,
+) -> Axes:
+ """
+ Plot depth histogram, holding modification types and regions constant, varying modification files
+
+ Note: Sample names default to the names of the modification files.
+
+ See plot_depth_histogram for details.
+ """
+ if sample_names is None:
+ sample_names = mod_file_names
+ n_mod_files = len(mod_file_names)
+ return plot_depth_histogram(
+ mod_file_names=mod_file_names,
+ regions_list=[regions] * n_mod_files,
+ motifs=[motif] * n_mod_files,
+ sample_names=[f"{sample_name} depth" for sample_name in sample_names],
+ **kwargs,
+ )
+
+
+def get_depth_counts(
+ mod_file_names: list[str | Path],
+ regions_list: list[str | Path | list[str | Path]],
+ motifs: list[str],
+ window_size: int | None,
+ single_strand: bool = False,
+ one_depth_per_region: bool = False,
+ quiet: bool = False,
+ cores: int | None = 1,
+) -> list[np.ndarray]:
+ """
+ Get the depth counts, ready for plotting.
+
+ This helper function can be useful during plot prototyping, when repeatedly building plots from the same data.
+ Its outputs can be passed as the first argument to make_depth_histogram_plot().
+
+ Args:
+ mod_file_names: list of paths to modified base data files
+ bed_file_names: list of paths to bed files specifying centered equal-length regions
+ mod_names: list of modifications to extract; expected to match mods available in the relevant mod_files
+ window_size: half-size of the desired window to plot; how far the window stretches on either side of the center point
+ single_strand: True means we only grab counts from reads from the same strand as
+ the region of interest, False means we always grab both strands within the regions
+ one_depth_per_region: if True, each region will only report a single depth value, averaging across all non-zero depths. If False
+ depths will be reported separately for all nonzero count positions in each region for a more granular view of depth distribution.
+ regions_5to3prime: True means negative strand regions get flipped, False means no flipping
+ smooth_window: size of the moving window to use for smoothing. If set to None, no smoothing is performed
+ quiet: disables progress bars
+ cores: CPU cores across which to parallelize processing
+
+ Returns:
+ List of depth vectors for histogram
+ """
+ if not utils.check_len_equal(mod_file_names, regions_list, motifs):
+ raise ValueError("Unequal number of inputs")
+ # TODO: redefinition error; still need to figure out how to do this elegantly in a way mypy likes
+ # dimelo/plot_depth_histogram.py:53: error: Item "str" of "str | Path" has no attribute "suffix" [union-attr]
+ mod_file_names = [Path(fn) for fn in mod_file_names]
+
+ depth_vectors = []
+ for mod_file, regions, motif in zip(mod_file_names, regions_list, motifs):
+ match mod_file.suffix:
+ case ".gz":
+ pileup_vectors_list = load_processed.regions_to_list(
+ function_handle=load_processed.pileup_vectors_from_bedmethyl,
+ bedmethyl_file=mod_file,
+ regions=regions,
+ motif=motif,
+ window_size=window_size,
+ single_strand=single_strand,
+ quiet=quiet,
+ cores=cores,
+ )
+ # places where read depth is zero are assumed to not have the motif present - this may not always be true,
+ # but with the available information in a pileup file it's the best we can do
+ read_depth_vectors_list = [
+ valid_base_counts[valid_base_counts > 0]
+ for _, valid_base_counts in pileup_vectors_list
+ ]
+ if one_depth_per_region:
+ # each region's read depth vector gets collapsed to a single mean value
+ read_depths = np.array(
+ [
+ np.mean(read_depth_vector)
+ for read_depth_vector in read_depth_vectors_list
+ ]
+ )
+ else:
+ # each region's read depth vector gets added to one extending read depths list without aggregating
+ read_depths = np.concatenate(read_depth_vectors_list)
+
+ case ".fake":
+ read_depths = load_processed.vector_from_fake(
+ mod_file=mod_file,
+ bed_file=regions,
+ motif=motif,
+ window_size=window_size,
+ )
+ case _:
+ raise ValueError(f"Unsupported file type for {mod_file}")
+ depth_vectors.append(read_depths)
+ return depth_vectors
+
+
+def make_depth_histogram_plot(
+ depth_vectors: list[np.ndarray],
+ sample_names: list[str],
+ y_label: str = "count",
+ one_depth_per_region: bool = False,
+ **kwargs,
+) -> Axes:
+ """
+ Plot the given depth histogram traces.
+
+ This helper function can be useful during plot prototyping, when repeatedly building plots from the same data.
+ The first argument should be the output of get_depth_histograms().
+
+ Args:
+ depth_vectors: list of depth histogram counts
+ sample_names: list of names to use for labeling traces in the output; legend entries
+ one_depth_per_region: if True, each region will only report a single depth value, averaging across all non-zero depths. If False
+ depths will be reported separately for all nonzero count positions in each region for a more granular view of depth distribution.
+ kwargs: other keyword parameters passed through to utils.line_plot
+
+ Returns:
+ Axes object containing the plot
+ """
+ if not utils.check_len_equal(depth_vectors, sample_names):
+ raise ValueError("Unequal number of inputs")
+ x_label = (
+ "per strand read\ndepth in region"
+ if one_depth_per_region
+ else "per strand read\ndepth per position"
+ )
+ axes = utils.hist_plot(
+ value_vectors=depth_vectors,
+ value_names=sample_names,
+ x_label=x_label,
+ y_label=y_label,
+ integer_values=not one_depth_per_region,
+ **kwargs,
+ )
+ return axes
diff --git a/dimelo/plot_depth_profile.py b/dimelo/plot_depth_profile.py
new file mode 100644
index 00000000..f13962fa
--- /dev/null
+++ b/dimelo/plot_depth_profile.py
@@ -0,0 +1,243 @@
+from pathlib import Path
+
+import numpy as np
+from matplotlib.axes import Axes
+
+from . import load_processed, utils
+
+
+def plot_depth_profile(
+ mod_file_names: list[str | Path],
+ regions_list: list[str | Path | list[str | Path]],
+ motifs: list[str],
+ sample_names: list[str],
+ window_size: int | None = None,
+ single_strand: bool = False,
+ regions_5to3prime: bool = False,
+ smooth_window: int | None = None,
+ quiet: bool = False,
+ cores: int | None = None,
+ **kwargs,
+) -> Axes:
+ """
+ Plot depth profiles, overlaying the resulting traces on top of each other.
+
+ Each input list is expected to be parallel and the same length. Each index represents one analysis condition across the lists.
+ Using the same file for multiple conditions requires adding the same file multiple times, in the appropriate indices.
+
+ This is the most flexible method for depth profile plotting. For most use cases, consider
+ using one of the plot_depth_profile.by_* methods.
+
+ Args:
+ mod_file_names: list of paths to modified base data files
+ bed_file_names: list of paths to bed files specifying centered equal-length regions
+ mod_names: list of modifications to extract; expected to match mods available in the relevant mod_files
+ sample_names: list of names to use for labeling traces in the output; legend entries
+ window_size: half-size of the desired window to plot; how far the window stretches on either side of the center point
+ single_strand: True means we only grab counts from reads from the same strand as
+ the region of interest, False means we always grab both strands within the regions
+ regions_5to3prime: True means negative strand regions get flipped, False means no flipping
+ smooth_window: size of the moving window to use for smoothing. If set to None, no smoothing is performed
+ quiet: disables progress bars
+ cores: CPU cores across which to parallelize processing. Default to None, which means all available.
+ kwargs: other keyword parameters passed through to utils.line_plot
+
+ Returns:
+ Axes object containing the plot
+ """
+ if not utils.check_len_equal(mod_file_names, regions_list, motifs, sample_names):
+ raise ValueError("Unequal number of inputs")
+
+ trace_vectors = get_depth_profiles(
+ mod_file_names=mod_file_names,
+ regions_list=regions_list,
+ motifs=motifs,
+ window_size=window_size,
+ single_strand=single_strand,
+ regions_5to3prime=regions_5to3prime,
+ smooth_window=smooth_window,
+ quiet=quiet,
+ cores=cores,
+ )
+
+ axes = make_depth_profile_plot(
+ trace_vectors=trace_vectors, sample_names=sample_names, **kwargs
+ )
+ return axes
+
+
+def by_modification(
+ mod_file_name: str | Path,
+ regions: str | Path,
+ motifs: list[str],
+ **kwargs,
+) -> Axes:
+ """
+ Plot depth profile, holding modification file and regions constant, varying modification types
+
+ See plot_depth_profile for details.
+ """
+ n_mods = len(motifs)
+ return plot_depth_profile(
+ mod_file_names=[mod_file_name] * n_mods,
+ regions_list=[regions] * n_mods,
+ motifs=motifs,
+ sample_names=[f"{motif} depth" for motif in motifs],
+ **kwargs,
+ )
+
+
+def by_regions(
+ mod_file_name: str | Path,
+ regions_list: list[str | Path | list[str | Path]],
+ motif: str,
+ sample_names: list[str] | None = None,
+ **kwargs,
+) -> Axes:
+ """
+ Plot depth profile, holding modification file and modification types constant, varying regions
+
+ Note: Sample names default to the names of the bed files.
+
+ See plot_depth_profile for details.
+ """
+ if sample_names is None:
+ sample_names = regions_list
+ n_beds = len(regions_list)
+ return plot_depth_profile(
+ mod_file_names=[mod_file_name] * n_beds,
+ regions_list=regions_list,
+ motifs=[motif] * n_beds,
+ sample_names=[f"{sample_name} depth" for sample_name in sample_names],
+ **kwargs,
+ )
+
+
+def by_dataset(
+ mod_file_names: list[str | Path],
+ regions: str | Path | list[str | Path],
+ motif: str,
+ sample_names: list[str] | None = None,
+ **kwargs,
+) -> Axes:
+ """
+ Plot depth profile, holding modification types and regions constant, varying modification files
+
+ Note: Sample names default to the names of the modification files.
+
+ See plot_depth_profile for details.
+ """
+ if sample_names is None:
+ sample_names = mod_file_names
+ n_mod_files = len(mod_file_names)
+ return plot_depth_profile(
+ mod_file_names=mod_file_names,
+ regions_list=[regions] * n_mod_files,
+ motifs=[motif] * n_mod_files,
+ sample_names=[f"{sample_name} depth" for sample_name in sample_names],
+ **kwargs,
+ )
+
+
+def get_depth_profiles(
+ mod_file_names: list[str | Path],
+ regions_list: list[str | Path | list[str | Path]],
+ motifs: list[str],
+ window_size: int,
+ single_strand: bool = False,
+ regions_5to3prime: bool = False,
+ smooth_window: int | None = None,
+ quiet: bool = False,
+ cores: int | None = None,
+) -> list[np.ndarray]:
+ """
+ Get the depth profile traces, ready for plotting.
+
+ This helper function can be useful during plot prototyping, when repeatedly building plots from the same data.
+ Its outputs can be passed as the first argument to make_depth_profile_plot().
+
+ Args:
+ mod_file_names: list of paths to modified base data files
+ bed_file_names: list of paths to bed files specifying centered equal-length regions
+ mod_names: list of modifications to extract; expected to match mods available in the relevant mod_files
+ window_size: half-size of the desired window to plot; how far the window stretches on either side of the center point
+ single_strand: True means we only grab counts from reads from the same strand as
+ the region of interest, False means we always grab both strands within the regions
+ regions_5to3prime: True means negative strand regions get flipped, False means no flipping
+ smooth_window: size of the moving window to use for smoothing. If set to None, no smoothing is performed
+ quiet: disables progress bars
+ cores: CPU cores across which to parallelize processing
+
+ Returns:
+ List of depth profile traces
+ """
+ if not utils.check_len_equal(mod_file_names, regions_list, motifs):
+ raise ValueError("Unequal number of inputs")
+ # TODO: redefinition error; still need to figure out how to do this elegantly in a way mypy likes
+ # dimelo/plot_depth_profile.py:53: error: Item "str" of "str | Path" has no attribute "suffix" [union-attr]
+ mod_file_names = [Path(fn) for fn in mod_file_names]
+
+ trace_vectors = []
+ for mod_file, regions, motif in zip(mod_file_names, regions_list, motifs):
+ match mod_file.suffix:
+ case ".gz":
+ _, valid_base_counts = load_processed.pileup_vectors_from_bedmethyl(
+ bedmethyl_file=mod_file,
+ regions=regions,
+ motif=motif,
+ window_size=window_size,
+ single_strand=single_strand,
+ regions_5to3prime=regions_5to3prime,
+ quiet=quiet,
+ cores=cores,
+ )
+ trace = valid_base_counts.astype(float)
+ trace[trace == 0] = np.nan
+ case ".fake":
+ trace = load_processed.vector_from_fake(
+ mod_file=mod_file,
+ bed_file=regions,
+ motif=motif,
+ window_size=window_size,
+ )
+ case _:
+ raise ValueError(f"Unsupported file type for {mod_file}")
+ if smooth_window is not None:
+ trace = utils.smooth_rolling_mean(trace, window=smooth_window)
+ trace_vectors.append(trace)
+ return trace_vectors
+
+
+def make_depth_profile_plot(
+ trace_vectors: list[np.ndarray],
+ sample_names: list[str],
+ **kwargs,
+) -> Axes:
+ """
+ Plot the given depth profile traces.
+
+ This helper function can be useful during plot prototyping, when repeatedly building plots from the same data.
+ The first argument should be the output of get_depth_profiles().
+
+ Args:
+ trace_vectors: list of depth profile traces
+ sample_names: list of names to use for labeling traces in the output; legend entries
+ kwargs: other keyword parameters passed through to utils.line_plot
+
+ Returns:
+ Axes object containing the plot
+ """
+ if not utils.check_len_equal(trace_vectors, sample_names):
+ raise ValueError("Unequal number of inputs")
+ axes = utils.line_plot(
+ indep_vector=np.arange(
+ -len(trace_vectors[0]) // 2,
+ len(trace_vectors[0]) // 2 + len(trace_vectors[0]) % 2,
+ ),
+ indep_name="pos",
+ dep_vectors=trace_vectors,
+ dep_names=sample_names,
+ y_label="per strand reads\nwith motif and mod info",
+ **kwargs,
+ )
+ return axes
diff --git a/dimelo/plot_enrichment.py b/dimelo/plot_enrichment.py
index 163faab5..d747b913 100644
--- a/dimelo/plot_enrichment.py
+++ b/dimelo/plot_enrichment.py
@@ -1,320 +1,239 @@
-r"""
-=======================
-plot_enrichment module
-=======================
-.. currentmodule:: dimelo.plot_enrichment
-.. autosummary::
- plot_enrichment
+from pathlib import Path
-plot_enrichment plots fraction of bases modified within regions of interest defined by bed file
+from matplotlib.axes import Axes
-"""
-
-import argparse
-import multiprocessing
-import os
-import sqlite3
+from . import load_processed, utils
-import matplotlib.pyplot as plt
-import pandas as pd
-import seaborn as sns
-from dimelo.parse_bam import parse_bam
+def plot_enrichment(
+ mod_file_names: list[str | Path],
+ regions_list: list[str | Path | list[str | Path]],
+ motifs: list[str],
+ sample_names: list[str],
+ window_size: int | None = None,
+ single_strand: bool = False,
+ quiet: bool = False,
+ cores: int | None = None,
+ **kwargs,
+) -> Axes:
+ """
+ Plot enrichment comparison barplots using the given list of pre-processed input files.
+
+ Each input list is expected to be parallel and the same length. Each index represents one analysis condition across the lists.
+ Using the same file for multiple conditions requires adding the same file multiple times, in the appropriate indices.
+
+ This is the most flexible method for enrichment plotting. For most use cases, consider
+ using one of the plot_enrichment.by_* methods.
+
+ Args:
+ mod_file_names: list of paths to modified base pileup data files
+ bed_file_names: list of paths to bed files specifying regions to extract
+ mod_names: list of modifications to extract; expected to match mods available in the relevant mod_files
+ sample_names: list of names to use for labeling bars in the output; x-axis labels
+ window_size: (currently disabled) window around center of region, +-window_size//2
+ single_strand: True means we only grab counts from reads from the same strand as
+ the region of interest, False means we always grab both strands within the regions
+ quiet: disables progress bars
+ cores: CPU cores across which to parallelize processing. Default to None, which means all available.
+ kwargs: other keyword parameters passed through to utils.bar_plot
+
+ Returns:
+ Axes object containing the plot
+ """
+ if not utils.check_len_equal(mod_file_names, regions_list, motifs, sample_names):
+ raise ValueError("Unequal number of inputs")
+
+ mod_fractions = get_enrichments(
+ mod_file_names=mod_file_names,
+ regions_list=regions_list,
+ motifs=motifs,
+ window_size=window_size,
+ single_strand=single_strand,
+ quiet=quiet,
+ cores=cores,
+ )
-DEFAULT_THRESH_A = 129
-DEFAULT_THRESH_C = 129
-DEFAULT_COLOR_LIST = ["#2D1E2F", "#A9E5BB", "#610345", "#559CAD", "#5E747F"]
+ axes = make_enrichment_plot(
+ mod_fractions=mod_fractions,
+ sample_names=sample_names,
+ **kwargs,
+ )
+ return axes
-def plot_enrichment(
- fileNames,
- sampleNames,
- bedFiles,
- basemod,
- outDir,
- threshA=DEFAULT_THRESH_A,
- threshC=DEFAULT_THRESH_C,
- colors=DEFAULT_COLOR_LIST,
- cores=None,
-):
+def by_modification(
+ mod_file_name: str | Path,
+ regions: str | Path | list[str | Path],
+ motifs: list[str],
+ **kwargs,
+) -> Axes:
"""
- fileNames
- name(s) of bam file with Mm and Ml tags
- sampleNames
- name(s) of sample for output file name labelling; valid names contain [``a-zA-Z0-9_``].
- bedFiles
- specified windows for region(s) of interest
- basemod
- One of the following (only valid to look at one type of mod):
-
- * ``'A'`` - extract mA only
- * ``'CG'`` - extract mCpG only
- outDir
- directory to output plot
- threshA
- threshold for calling mA; default 129
- threshC
- threshold for calling mCG; default 129
- colors
- color list in hex for overlay; default is ["#2D1E2F", "#A9E5BB", "#610345", "#559CAD", "#5E747F"]
- cores
- number of cores over which to parallelize; default is all available
-
- **Example**
-
- >>> dm.plot_enrichment(["dimelo/test/data/mod_mappings_subset.bam", "dimelo/test/data/mod_mappings_subset.bam"], ["test1", "test2"], "dimelo/test/data/test.bed", "CG", "dimelo/dimelo_test", threshC=129)
- >>> dm.plot_enrichment("dimelo/test/data/mod_mappings_subset.bam", ["test1", "test2"], ["dimelo/test/data/test.bed", "dimelo/test/data/test.bed"], "CG", "dimelo/dimelo_test", threshC=129)
-
- **Return**
-
- Barplot with overall fraction of bases modified within regions of interest specified by bedFile(s)
-
- **Example Plots**
-
- * :ref:`sphx_glr_auto_examples_enrichment_multi_bam_example.py`
- * :ref:`sphx_glr_auto_examples_enrichment_multi_bed_example.py`
+ Plot enrichment bar plots, holding modification file and regions constant, varying modification types
+ See plot_enrichment for details.
"""
- if not os.path.isdir(outDir):
- os.makedirs(outDir)
-
- # default number of cores is max available
- cores_avail = multiprocessing.cpu_count()
- if cores is None:
- num_cores = cores_avail
- else:
- # if more than available cores is specified, process with available cores
- if cores > cores_avail:
- num_cores = cores_avail
- else:
- num_cores = cores
-
- # A+CG is not valid; only valid to look at one type of mod
- if (basemod != "A") and (basemod != "CG"):
- raise RuntimeError("valid basemods are A or CG")
-
- # if single bam file rather than list is entered, convert to list
- if type(fileNames) != list:
- fileNames = [fileNames]
- # if single sample name rather than list is entered, convert to list
- if type(sampleNames) != list:
- sampleNames = [sampleNames]
- # if single bed file rather than list is entered, convert to list
- if type(bedFiles) != list:
- bedFiles = [bedFiles]
-
- # extract counts and create barplots
- # get average across all bases for regions defined in the bed file
- columns = ["fileName", "bedFile", "sampleName", "fractionMethylated"]
- data = []
- if len(fileNames) > 1 or len(bedFiles) > 1:
- if len(fileNames) > 1:
- if len(bedFiles) > 1:
- raise RuntimeError(
- "only a single region file can be used when analyzing multiple bam files"
- )
- for f, n in zip(fileNames, sampleNames):
- values = get_counts(
- f,
- n,
- bedFiles[0],
- basemod,
- outDir,
- threshA,
- threshC,
- num_cores,
- )
- zipped = zip(columns, values)
- a_dictionary = dict(zipped)
- data.append(a_dictionary)
- if len(bedFiles) > 1:
- if len(fileNames) > 1:
- raise RuntimeError(
- "only a single bam file can be used when analyzing multiple bed file regions"
- )
- for b, n in zip(bedFiles, sampleNames):
- values = get_counts(
- fileNames[0],
- n,
- b,
- basemod,
- outDir,
- threshA,
- threshC,
- num_cores,
- )
- zipped = zip(columns, values)
- a_dictionary = dict(zipped)
- data.append(a_dictionary)
-
- # allow for barplot for single file and region
- if (len(fileNames) == 1) and (len(bedFiles) == 1):
- values = get_counts(
- fileNames[0],
- sampleNames[0],
- bedFiles[0],
- basemod,
- outDir,
- threshA,
- threshC,
- num_cores,
- )
- zipped = zip(columns, values)
- a_dictionary = dict(zipped)
- data.append(a_dictionary)
-
- df = pd.DataFrame(data)
- # draw from aggregate to calculate modified/total in region of interest
- if len(fileNames) == 1:
- title = "sample_" + fileNames[0].split("/")[-1].replace(".bam", "")
- if len(bedFiles) == 1:
- title = "region_" + bedFiles[0].split("/")[-1].replace(".bed", "")
- if (len(fileNames) == 1) and (len(bedFiles) == 1):
- title = (
- "sample_"
- + fileNames[0].split("/")[-1].replace(".bam", "")
- + "_region_"
- + bedFiles[0].split("/")[-1].replace(".bed", "")
- )
- plot_barchart(df, basemod, outDir, colors, title)
-
- db_paths = []
- for f in fileNames:
- db = outDir + "/" + f.split("/")[-1].replace(".bam", "") + ".db"
- db_paths.append(db)
-
- plot_path = f"{outDir}/{title}_{basemod}_enrichment_barplot.pdf"
- str_out = f"Outputs\n_______\nDB file: {db_paths}\nenrichment barplot: {plot_path}"
- print(str_out)
-
-
-def get_counts(
- fileName,
- sampleName,
- bedFile,
- basemod,
- outDir,
- threshA,
- threshC,
- num_cores,
-):
- # parse_bam for files / regions
- parse_bam(
- fileName,
- sampleName,
- outDir,
- bedFile,
- basemod,
- threshA=threshA,
- threshC=threshC,
- cores=num_cores,
+ n_mods = len(motifs)
+ return plot_enrichment(
+ mod_file_names=[mod_file_name] * n_mods,
+ regions_list=[regions] * n_mods,
+ motifs=motifs,
+ sample_names=motifs,
+ **kwargs,
)
- # get aggregate counts
- aggregate_counts = pd.read_sql(
- "SELECT * from methylationAggregate_" + sampleName,
- sqlite3.connect(
- outDir + "/" + fileName.split("/")[-1].replace(".bam", "") + ".db"
- ),
- )
- methylated_bases = aggregate_counts["methylated_bases"].sum()
- total_bases = aggregate_counts["total_bases"].sum()
- if total_bases == 0:
- fractionMethylated = 0
- else:
- fractionMethylated = methylated_bases / total_bases
- return [fileName, bedFile, sampleName, fractionMethylated]
+"""
+TODO: Re-assignment issue:
+dimelo/plot_enrichment.py:115: error: Incompatible types in assignment (expression has type "list[str | Path | list[str | Path]]", variable has type "list[str] | None") [assignment]
+dimelo/plot_enrichment.py:121: error: Argument "sample_names" to "plot_enrichment" has incompatible type "list[str] | None"; expected "list[str]" [arg-type]
+dimelo/plot_enrichment.py:141: error: Incompatible types in assignment (expression has type "list[str | Path]", variable has type "list[str] | None") [assignment]
+dimelo/plot_enrichment.py:147: error: Argument "sample_names" to "plot_enrichment" has incompatible type "list[str] | None"; expected "list[str]" [arg-type]
+"""
-def plot_barchart(data, basemod, outDir, colors, title):
+def by_regions(
+ mod_file_name: str | Path,
+ regions_list: list[str | Path | list[str | Path]],
+ motif: str,
+ sample_names: list[str] | None = None,
+ **kwargs,
+) -> Axes:
"""
- x-axis: sample or region
- y-axis: fraction methylated bases
+ Plot enrichment bar plots, holding modification file and modification types constant, varying regions
+
+ Note: Sample names default to the names of the bed files.
+
+ See plot_enrichment for details.
"""
- fig, ax1 = plt.subplots()
- plt.bar("sampleName", "fractionMethylated", data=data, color=colors)
- print("\nData for barplot")
- print("________________\n")
- print(f"{data.sampleName}")
- print(f"{data.fractionMethylated}")
- print("\n")
- sns.despine(fig)
- plt.ylabel("fraction methylated bases")
- plt.xlabel("")
- plt.savefig(
- outDir + "/" + title + "_" + basemod + "_enrichment_barplot.pdf",
+ if sample_names is None:
+ sample_names = regions_list
+ n_beds = len(regions_list)
+ return plot_enrichment(
+ mod_file_names=[mod_file_name] * n_beds,
+ regions_list=regions_list,
+ motifs=[motif] * n_beds,
+ sample_names=sample_names,
+ **kwargs,
)
- plt.close()
-def main():
- parser = argparse.ArgumentParser(
- description="Plot DiMeLo methylation enrichment"
- )
+def by_dataset(
+ mod_file_names: list[str | Path],
+ regions: str | Path | list[str | Path],
+ motif: str,
+ sample_names: list[str] | None = None,
+ **kwargs,
+) -> Axes:
+ """
+ Plot enrichment bar plots, holding modification types and regions constant, varying modification files
- # Required arguments
- required_args = parser.add_argument_group("required arguments")
- required_args.add_argument(
- "-f", "--fileNames", required=True, nargs="+", help="bam file name(s)"
- )
- required_args.add_argument(
- "-s",
- "--sampleNames",
- required=True,
- nargs="+",
- help="sample name(s) for output file labelling",
- )
- required_args.add_argument(
- "-b",
- "--bedFiles",
- required=True,
- nargs="+",
- help="name of bed file(s) defining region(s) of interest",
- )
- required_args.add_argument(
- "-m",
- "--basemod",
- required=True,
- type=str,
- choices=["A", "CG"],
- help="which base modification to extract",
- )
- required_args.add_argument(
- "-o", "--outDir", required=True, help="directory to output plot"
- )
+ Note: Sample names default to the names of the modification files.
- # Plotting arguments
- plotting_args = parser.add_argument_group("plotting options")
- plotting_args.add_argument(
- "--colors",
- type=str,
- nargs="+",
- default=DEFAULT_COLOR_LIST,
- help='color list in hex (e.g. "#BB4430") for overlay plots',
+ See plot_enrichment for details.
+ """
+ if sample_names is None:
+ sample_names = mod_file_names
+ n_mod_files = len(mod_file_names)
+ return plot_enrichment(
+ mod_file_names=mod_file_names,
+ regions_list=[regions] * n_mod_files,
+ motifs=[motif] * n_mod_files,
+ sample_names=sample_names,
+ **kwargs,
)
- # Optional arguments
- parser.add_argument(
- "-A",
- "--threshA",
- type=int,
- default=DEFAULT_THRESH_A,
- help="threshold above which to call an A base methylated",
- )
- parser.add_argument(
- "-C",
- "--threshC",
- type=int,
- default=DEFAULT_THRESH_C,
- help="threshold above which to call a C base methylated",
- )
- parser.add_argument(
- "-p",
- "--cores",
- type=int,
- help="number of cores over which to parallelize",
+
+def get_enrichments(
+ mod_file_names: list[str | Path],
+ regions_list: list[str | Path | list[str | Path]],
+ motifs: list[str],
+ window_size: int | None = None,
+ single_strand: bool = False,
+ quiet: bool = False,
+ cores: int | None = None,
+) -> list[float]:
+ """
+ Get the enrichment values, ready for plotting.
+
+ This helper function can be useful during plot prototyping, when repeatedly building plots from the same data.
+ Its outputs can be passed as the first argument to make_enrichment_plot().
+
+ TODO: I feel like this should be able to take in data directly as vectors/other datatypes, not just read from files.
+ TODO: Style-wise, is it cleaner to have it be a match statement or calling a method from a global dict? Cleaner here with a dict, cleaner overall with the match statements?
+
+ Args:
+ mod_file_names: list of paths to modified base pileup data files
+ regions_list: list of paths to bed files specifying regions to extract
+ motifs: list of modifications to extract; expected to match mods available in the relevant mod_files
+ window_size: (currently disabled) window around center of region, +-window_size//2
+ single_strand: True means we only grab counts from reads from the same strand as
+ the region of interest, False means we always grab both strands within the regions
+ quiet: disables progress bars
+ cores: CPU cores across which to parallelize processing
+
+ Returns:
+ List of modified fraction values.
+ """
+ if not utils.check_len_equal(mod_file_names, regions_list, motifs):
+ raise ValueError("Unequal number of inputs")
+ # TODO: redefinition error; still need to figure out how to do this elegantly in a way mypy likes
+ # dimelo/plot_enrichment.py:45: error: Item "str" of "str | Path" has no attribute "suffix" [union-attr]
+ mod_file_names = [Path(fn) for fn in mod_file_names]
+
+ mod_fractions = []
+ for mod_file, regions, motif in zip(mod_file_names, regions_list, motifs):
+ match mod_file.suffix:
+ case ".gz":
+ n_mod, n_total = load_processed.pileup_counts_from_bedmethyl(
+ bedmethyl_file=mod_file,
+ regions=regions,
+ motif=motif,
+ window_size=window_size,
+ single_strand=single_strand,
+ quiet=quiet,
+ cores=cores,
+ )
+ case ".fake":
+ n_mod, n_total = load_processed.counts_from_fake(
+ mod_file=mod_file, regions=regions, motif=motif
+ )
+ case _:
+ raise ValueError(f"Unsupported file type for {mod_file}")
+ try:
+ mod_fractions.append(n_mod / n_total)
+ except ZeroDivisionError:
+ mod_fractions.append(0)
+
+ return mod_fractions
+
+
+def make_enrichment_plot(
+ mod_fractions: list[float],
+ sample_names: list[str],
+ **kwargs,
+) -> Axes:
+ """
+ Plot the given enrichment values.
+
+ This helper function can be useful during plot prototyping, when repeatedly building plots from the same data.
+ The first argument should be the output of get_enrichments().
+
+ Args:
+ mod_fractions: list of modified fraction values.
+ sample_names: list of names to use for labeling bars in the output; x-axis labels
+ kwargs: other keyword parameters passed through to utils.bar_plot
+
+ Returns:
+ Axes object containing the plot
+ """
+ if not utils.check_len_equal(mod_fractions, sample_names):
+ raise ValueError("Unequal number of inputs")
+
+ axes = utils.bar_plot(
+ categories=sample_names,
+ values=mod_fractions,
+ y_label="fraction modified bases",
+ **kwargs,
)
- args = parser.parse_args()
- plot_enrichment(**vars(args))
+ return axes
diff --git a/dimelo/plot_enrichment_profile.py b/dimelo/plot_enrichment_profile.py
index 00fae97f..97ec461c 100644
--- a/dimelo/plot_enrichment_profile.py
+++ b/dimelo/plot_enrichment_profile.py
@@ -1,625 +1,273 @@
-r"""
-==================================
-plot_enrichment_profile module
-==================================
-.. currentmodule:: dimelo.plot_enrichment_profile
-.. autosummary::
- plot_enrichment_profile
+from pathlib import Path
-plot_enrichment_profile plots single molecules centered at regions of interest defined in bed file and produces aggregate profile
-
-"""
-
-import argparse
-import multiprocessing
-import os
-import sqlite3
-from itertools import cycle
-
-import matplotlib.pyplot as plt
import numpy as np
-import pandas as pd
-import seaborn as sns
-from matplotlib import colors
-from mpl_toolkits.axes_grid1 import make_axes_locatable
+from matplotlib.axes import Axes
-from dimelo.parse_bam import parse_bam
-
-DEFAULT_THRESH_A = 129
-DEFAULT_THRESH_C = 129
-DEFAULT_WINDOW_SIZE = 1000
-COLOR_A = "#053C5E"
-COLOR_C = "#BB4430"
-COLOR_LIST = ["#2D1E2F", "#A9E5BB", "#610345", "#559CAD", "#5E747F"]
-DEFAULT_DOTSIZE = 0.5
-DEFAULT_SMOOTH = 50
-DEFAULT_MIN_PERIODS = 10
+from . import load_processed, utils
def plot_enrichment_profile(
- fileNames,
- sampleNames,
- bedFiles,
- basemod,
- outDir,
- threshA=DEFAULT_THRESH_A,
- threshC=DEFAULT_THRESH_C,
- windowSize=DEFAULT_WINDOW_SIZE,
- colorA=COLOR_A,
- colorC=COLOR_C,
- colors=COLOR_LIST,
- dotsize=DEFAULT_DOTSIZE,
- smooth=DEFAULT_SMOOTH,
- min_periods=DEFAULT_MIN_PERIODS,
- cores=None,
-):
+ mod_file_names: list[str | Path],
+ regions_list: list[str | Path | list[str | Path]],
+ motifs: list[str],
+ sample_names: list[str],
+ window_size: int,
+ single_strand: bool = False,
+ regions_5to3prime: bool = False,
+ smooth_window: int | None = None,
+ quiet: bool = False,
+ cores: int | None = None,
+ **kwargs,
+) -> Axes:
"""
- fileNames
- name(s) of bam file with Mm and Ml tags
- sampleNames
- name(s) of sample for output file name labelling; valid names contain [``a-zA-Z0-9_``].
- bedFiles
- specified windows for region(s) of interest; optional 4th column in bed file to specify strand of region of interest as ``+`` or ``-``. Default is to consider regions as all ``+``. Reads will be oriented with respect to strand. Only reads overlapping regions defined in bed file will be extracted, regardless of windowSize. Plots are centered at the center of the bed file regions.
- basemod
- One of the following:
-
- * ``'A'`` - extract mA only
- * ``'CG'`` - extract mCpG only
- * ``'A+CG'`` - extract mA and mCpG
- outDir
- directory to output plot
- threshA
- threshold for calling mA; default 129
- threshC
- threshold for calling mCG; default 129
- windowSize
- window size around center point of feature of interest to plot (+/-); default 1000 bp
- colorA
- color in hex for mA; default #053C5E
- colorC
- color in hex for mCG; default #BB4430
- colors
- color list in hex for overlay plots; default is ["#2D1E2F", "#A9E5BB", "#610345", "#559CAD", "#5E747F"]
- dotsize
- size of points; default is 0.5
- smooth
- window over which to smooth aggregate curve; default of 50 bp
- min_periods
- minimum number of bases to consider for smoothing: default of 10 bp
- cores
- number of cores over which to parallelize; default is all available
-
- **Example**
-
- For single file and region:
-
- >>> dm.plot_enrichment_profile("dimelo/test/data/mod_mappings_subset.bam", "test", "dimelo/test/data/test.bed", "A+CG", "dimelo/dimelo_test", windowSize=500, dotsize=1)
-
- To overlay multiple regions of interest (can conversely also overlay multiple samples over a single region if a list of files is provided):
-
- >>> dm.plot_enrichment_profile("dimelo/test/data/mod_mappings_subset.bam", ["test1","test2"], ["dimelo/test/data/test.bed","dimelo/test/data/test.bed"], "A", "dimelo/dimelo_test", windowSize=500, dotsize=1)
-
- **Return**
-
- * Aggregate profile of fraction of bases modified centered at features of interest
- * Single molecules centered at features of interest
- * Base abundance centered at features of interest
-
- **Example Plots**
-
- * :ref:`sphx_glr_auto_examples_enrichment_profile_single_example.py`
- * :ref:`sphx_glr_auto_examples_enrichment_profile_ma_mc_example.py`
- * :ref:`sphx_glr_auto_examples_enrichment_profile_overlay_example.py`
-
+ Plot enrichment profiles, overlaying the resulting traces on top of each other.
+
+ Each input list is expected to be parallel and the same length. Each index represents one analysis condition across the lists.
+ Using the same file for multiple conditions requires adding the same file multiple times, in the appropriate indices.
+
+ This is the most flexible method for enrichment profile plotting. For most use cases, consider
+ using one of the plot_enrichment_profile.by_* methods.
+
+ TODO: I think it's reasonable for smoothing min_periods to be always set to 1 for this method, as it's a visualization tool, not quantitative. Is this unreasonable?
+ TODO: Should the more restrictive meta versions allow *args, or only **kwargs?
+ No, we want to be able to pass kwargs down to the line plotter, I think. Especially if we swap it out for one that takes more different standard args.
+ TODO: It's mildly confusing that there are required args that are only seen as *args or **kwargs in the more restrictive meta versions... But this is so much cleaner...
+
+ Args:
+ mod_file_names: list of paths to modified base data files
+ bed_file_names: list of paths to bed files specifying centered equal-length regions
+ mod_names: list of modifications to extract; expected to match mods available in the relevant mod_files
+ sample_names: list of names to use for labeling traces in the output; legend entries
+ window_size: half-size of the desired window to plot; how far the window stretches on either side of the center point
+ single_strand: True means we only grab counts from reads from the same strand as
+ the region of interest, False means we always grab both strands within the regions
+ regions_5to3prime: True means negative strand regions get flipped, False means no flipping
+ smooth_window: size of the moving window to use for smoothing. If set to None, no smoothing is performed
+ quiet: disables progress bars
+ cores: CPU cores across which to parallelize processing. Default to None, which means all available.
+ kwargs: other keyword parameters passed through to utils.line_plot
+
+ Returns:
+ Axes object containing the plot
"""
+ if not utils.check_len_equal(mod_file_names, regions_list, motifs, sample_names):
+ raise ValueError("Unequal number of inputs")
- if not os.path.isdir(outDir):
- os.makedirs(outDir)
-
- # default number of cores is max available
- cores_avail = multiprocessing.cpu_count()
- if cores is None:
- num_cores = cores_avail
- else:
- # if more than available cores is specified, process with available cores
- if cores > cores_avail:
- num_cores = cores_avail
- else:
- num_cores = cores
-
- # if single bam file rather than list is entered, convert to list
- if type(fileNames) != list:
- fileNames = [fileNames]
- # if single sample name rather than list is entered, convert to list
- if type(sampleNames) != list:
- sampleNames = [sampleNames]
- # if single bed file rather than list is entered, convert to list
- if type(bedFiles) != list:
- bedFiles = [bedFiles]
-
- db_paths = []
- for f in fileNames:
- db = outDir + "/" + f.split("/")[-1].replace(".bam", "") + ".db"
- db_paths.append(db)
-
- # overlay condition
- if len(fileNames) > 1 or len(bedFiles) > 1:
- if basemod == "A+CG":
- raise RuntimeError(
- "enrichment overlays can only be produced for a single base modification at a time"
- )
- fig = plt.figure()
- if len(fileNames) > 1:
- if len(bedFiles) > 1:
- raise RuntimeError(
- "only a single region file can be used when overlaying multiple bam files"
- )
- for f, n, c in zip(fileNames, sampleNames, cycle(colors)):
- execute_overlay(
- f,
- n,
- c,
- bedFiles[0],
- basemod,
- outDir,
- threshA,
- threshC,
- windowSize,
- dotsize,
- smooth,
- min_periods,
- num_cores,
- )
- if len(bedFiles) > 1:
- if len(fileNames) > 1:
- raise RuntimeError(
- "only a single bam file can be used when overlaying multiple bed file regions"
- )
- for b, n, c in zip(bedFiles, sampleNames, cycle(colors)):
- execute_overlay(
- fileNames[0],
- n,
- c,
- b,
- basemod,
- outDir,
- threshA,
- threshC,
- windowSize,
- dotsize,
- smooth,
- min_periods,
- num_cores,
- )
- if len(fileNames) == 1:
- title = "sample_" + fileNames[0].split("/")[-1].replace(".bam", "")
- if len(bedFiles) == 1:
- title = "region_" + bedFiles[0].split("/")[-1].replace(".bed", "")
- plt.title(basemod)
- plt.legend(sampleNames)
- fig.savefig(
- outDir
- + "/"
- + title
- + "_"
- + basemod
- + "_sm_rolling_avg_overlay.pdf"
- )
- plt.close()
-
- overlay_path = f"{outDir}/{title}_{basemod}_sm_rolling_avg_overlay.pdf"
- str_out = f"Outputs\n_______\nDB file: {db_paths}\noverlay plot: {overlay_path}"
- print(str_out)
-
- # no overlay condition
- if (len(fileNames) == 1) and (len(bedFiles) == 1):
- execute_single_plot(
- fileNames[0],
- sampleNames[0],
- bedFiles[0],
- basemod,
- outDir,
- threshA,
- threshC,
- windowSize,
- colorA,
- colorC,
- dotsize,
- smooth,
- min_periods,
- num_cores,
- )
-
- t_paths = []
- if "A" in basemod:
- t_path = (
- outDir + "/" + sampleNames[0] + "_" + "A" + "_base_count.png"
- )
- t_paths.append(t_path)
- if "C" in basemod:
- t_path = (
- outDir + "/" + sampleNames[0] + "_" + "CG" + "_base_count.png"
- )
- t_paths.append(t_path)
-
- enrichment_path = (
- f"{outDir}/{sampleNames[0]}_{basemod}_sm_rolling_avg.pdf"
- )
- sm_path = f"{outDir}/{sampleNames[0]}_{basemod}_sm_scatter.png"
- str_out = f"Outputs\n_______\nDB file: {db_paths}\nenrichment plot: {enrichment_path}\nsingle molecule plot: {sm_path}\nbase count plots: {t_paths}"
- print(str_out)
-
-
-def execute_overlay(
- fileName,
- sampleName,
- color,
- bedFile,
- basemod,
- outDir,
- threshA,
- threshC,
- windowSize,
- dotsize,
- smooth,
- min_periods,
- num_cores,
-):
- parse_bam(
- fileName,
- sampleName,
- outDir,
- bedFile,
- basemod,
- center=True,
- windowSize=windowSize,
- threshA=threshA,
- threshC=threshC,
- cores=num_cores,
- )
- aggregate_counts = pd.read_sql(
- "SELECT * from methylationAggregate_" + sampleName,
- sqlite3.connect(
- outDir + "/" + fileName.split("/")[-1].replace(".bam", "") + ".db"
- ),
- )
- aggregate_counts["frac"] = (
- aggregate_counts["methylated_bases"] / aggregate_counts["total_bases"]
- )
- if "A" in basemod:
- plot_aggregate_helper(
- aggregate_counts, "A", smooth, min_periods, color
- )
-
- if "C" in basemod:
- plot_aggregate_helper(
- aggregate_counts, "C", smooth, min_periods, color
- )
-
-
-def execute_single_plot(
- fileName,
- sampleName,
- bedFile,
- basemod,
- outDir,
- threshA,
- threshC,
- windowSize,
- colorA,
- colorC,
- dotsize,
- smooth,
- min_periods,
- num_cores,
-):
-
- parse_bam(
- fileName,
- sampleName,
- outDir,
- bedFile,
- basemod,
- center=True,
- windowSize=windowSize,
- threshA=threshA,
- threshC=threshC,
- cores=num_cores,
+ trace_vectors = get_enrichment_profiles(
+ mod_file_names=mod_file_names,
+ regions_list=regions_list,
+ motifs=motifs,
+ window_size=window_size,
+ single_strand=single_strand,
+ regions_5to3prime=regions_5to3prime,
+ smooth_window=smooth_window,
+ quiet=quiet,
+ cores=cores,
)
- all_data = pd.read_sql(
- "SELECT * from methylationByBase_" + sampleName,
- sqlite3.connect(
- outDir + "/" + fileName.split("/")[-1].replace(".bam", "") + ".db"
- ),
- )
- aggregate_counts = pd.read_sql(
- "SELECT * from methylationAggregate_" + sampleName,
- sqlite3.connect(
- outDir + "/" + fileName.split("/")[-1].replace(".bam", "") + ".db"
- ),
- )
-
- print(
- "processing "
- + str(len(all_data["read_name"].unique()))
- + " reads with methylation above threshold for "
- + sampleName
- + " for bam: "
- + fileName
+ axes = make_enrichment_profile_plot(
+ trace_vectors=trace_vectors, sample_names=sample_names, **kwargs
)
+ return axes
- fig, ax = plt.subplots()
- colors = {"A+Y": colorA, "A+a": colorA, "C+Z": colorC, "C+m": colorC}
+def by_modification(
+ mod_file_name: str | Path,
+ regions: str | Path,
+ motifs: list[str],
+ **kwargs,
+) -> Axes:
+ """
+ Plot enrichment profile, holding modification file and regions constant, varying modification types
- sns.scatterplot(
- data=all_data,
- x="pos",
- y="read_name",
- hue="mod",
- palette=colors,
- s=dotsize,
- marker="s",
- linewidth=0,
- legend=None,
+ See plot_enrichment_profile for details.
+ """
+ n_mods = len(motifs)
+ return plot_enrichment_profile(
+ mod_file_names=[mod_file_name] * n_mods,
+ regions_list=[regions] * n_mods,
+ motifs=motifs,
+ sample_names=motifs,
+ **kwargs,
)
- ax.spines[["top", "right", "left"]].set_visible(False)
-
- plt.yticks([])
- plt.ylabel("")
- plt.xlabel("")
- plt.xlim(-windowSize, windowSize)
- fig.savefig(
- outDir + "/" + sampleName + "_" + basemod + "_sm_scatter.png", dpi=600
- )
- plt.close()
-
- plot_aggregate_me_frac(
- sampleName,
- aggregate_counts,
- smooth,
- min_periods,
- windowSize,
- basemod,
- outDir,
- colorA,
- colorC,
- )
+"""
+TODO: Re-assignment issue:
+dimelo/plot_enrichment_profile.py:142: error: Incompatible types in assignment (expression has type "list[str | Path | list[str | Path]]", variable has type "list[str] | None") [assignment]
+dimelo/plot_enrichment_profile.py:148: error: Argument "sample_names" to "plot_enrichment_profile" has incompatible type "list[str] | None"; expected "list[str]" [arg-type]
+dimelo/plot_enrichment_profile.py:168: error: Incompatible types in assignment (expression has type "list[str | Path]", variable has type "list[str] | None") [assignment]
+dimelo/plot_enrichment_profile.py:174: error: Argument "sample_names" to "plot_enrichment_profile" has incompatible type "list[str] | None"; expected "list[str]" [arg-type]
-# average profile
-def plot_aggregate_me_frac(
- sampleName,
- aggregate_counts,
- smooth,
- min_periods,
- windowSize,
- basemod,
- outDir,
- colorA,
- colorC,
-):
- aggregate_counts["frac"] = (
- aggregate_counts["methylated_bases"] / aggregate_counts["total_bases"]
- )
+If sample names is None we assign it non-None values, so it's not clear what the problem is to me. We could make an intermediate dummy variable I guess? If that is the complaint?
+"""
- fig = plt.figure()
- labels = []
- if "A" in basemod:
- plot_aggregate_helper(
- aggregate_counts, "A", smooth, min_periods, colorA
- )
- labels.append("A")
- if "C" in basemod:
- plot_aggregate_helper(
- aggregate_counts, "C", smooth, min_periods, colorC
- )
- labels.append("CG")
- plt.title(basemod)
- plt.legend(labels)
- fig.savefig(
- outDir + "/" + sampleName + "_" + basemod + "_sm_rolling_avg.pdf"
- )
- plt.close()
-
- if "A" in basemod:
- aggregate_A = aggregate_counts[
- aggregate_counts["mod"].str.contains("A")
- ].copy()
- # need to sort first!
- aggregate_A.sort_values(["pos"], inplace=True)
- plot_base_abundance(
- sampleName,
- aggregate_A,
- "A",
- windowSize,
- outDir,
- )
- if "C" in basemod:
- aggregate_C = aggregate_counts[
- aggregate_counts["mod"].str.contains("C")
- ].copy()
- # need to sort first!
- aggregate_C.sort_values(["pos"], inplace=True)
- plot_base_abundance(
- sampleName,
- aggregate_C,
- "CG",
- windowSize,
- outDir,
- )
-
-
-# helper function to create smoothed lineplot
-def plot_aggregate_helper(aggregate_counts, mod, smooth, min_periods, color):
- aggregate = aggregate_counts[
- aggregate_counts["mod"].str.contains(mod)
- ].copy()
- # need to sort first!
- aggregate.sort_values(["pos"], inplace=True)
- aggregate_rolling = aggregate.rolling(
- window=smooth, min_periods=min_periods, center=True, on="pos"
- ).mean()
- sns.lineplot(
- x=aggregate_rolling["pos"], y=aggregate_rolling["frac"], color=color
- )
+def by_regions(
+ mod_file_name: str | Path,
+ regions_list: list[str | Path | list[str | Path]],
+ motif: str,
+ sample_names: list[str] | None = None,
+ **kwargs,
+) -> Axes:
+ """
+ Plot enrichment profile, holding modification file and modification types constant, varying regions
-def plot_base_abundance(
- sampleName, aggregate_counts, basemod, windowSize, outDir
-):
- cmapPurple = colors.LinearSegmentedColormap.from_list(
- "custom purple", ["white", "#2D1E2F"], N=200
- )
- aggregate_counts = (
- aggregate_counts.set_index("pos")
- .reindex(
- pd.Index(
- np.arange(
- aggregate_counts["pos"].min(),
- aggregate_counts["pos"].max(),
- 1,
- ),
- name="pos",
- )
- )
- .reset_index()
- )
- aggregate_counts = aggregate_counts.fillna(0)
- fig = plt.figure()
- x = aggregate_counts["pos"].to_numpy()
- y = aggregate_counts["total_bases"].to_numpy() # base_count
- fig, (ax, ax2) = plt.subplots(nrows=2, sharex=True)
- extent = [x[0] - (x[1] - x[0]) / 2.0, x[-1] + (x[1] - x[0]) / 2.0, 0, 1]
- im = ax.imshow(
- y[np.newaxis, :], cmap=cmapPurple, aspect="auto", extent=extent
- )
- divider = make_axes_locatable(ax)
- cax = divider.append_axes("top", size="5%", pad=0.25)
- fig.colorbar(im, cax=cax, orientation="horizontal")
- ax.set_yticks([])
- ax.set_xlim(extent[0], extent[1])
- ax2.plot(x, y, "o", ms=0.5, color="#2D1E2F")
- ax2.set_xlim(extent[0], extent[1])
- plt.tight_layout()
- fig.savefig(
- outDir + "/" + sampleName + "_" + basemod + "_base_count.png", dpi=600
- )
- plt.close()
+ Note: Sample names default to the names of the bed files.
+ See plot_enrichment_profile for details.
+ """
+ if sample_names is None:
+ sample_names = regions_list
+ n_beds = len(regions_list)
+ return plot_enrichment_profile(
+ mod_file_names=[mod_file_name] * n_beds,
+ regions_list=regions_list,
+ motifs=[motif] * n_beds,
+ sample_names=sample_names,
+ **kwargs,
+ )
+
+
+def by_dataset(
+ mod_file_names: list[str | Path],
+ regions: str | Path | list[str | Path],
+ motif: str,
+ sample_names: list[str] | None = None,
+ **kwargs,
+) -> Axes:
+ """
+ Plot enrichment profile, holding modification types and regions constant, varying modification files
-def main():
- parser = argparse.ArgumentParser(
- description="Plot DiMeLo enrichment profile"
- )
+ Note: Sample names default to the names of the modification files.
- # Required arguments
- required_args = parser.add_argument_group("required arguments")
- required_args.add_argument(
- "-f", "--fileNames", required=True, nargs="+", help="bam file name(s)"
- )
- required_args.add_argument(
- "-s",
- "--sampleNames",
- required=True,
- nargs="+",
- help="sample name(s) for output file labelling",
- )
- required_args.add_argument(
- "-b",
- "--bedFiles",
- required=True,
- nargs="+",
- help="name of bed file(s) defining region(s) of interest",
- )
- required_args.add_argument(
- "-m",
- "--basemod",
- required=True,
- type=str,
- choices=["A", "CG", "A+CG"],
- help="which base modification to extract",
- )
- required_args.add_argument(
- "-o", "--outDir", required=True, help="directory to output plot"
- )
+ See plot_enrichment_profile for details.
+ """
+ if sample_names is None:
+ sample_names = mod_file_names
+ n_mod_files = len(mod_file_names)
+ return plot_enrichment_profile(
+ mod_file_names=mod_file_names,
+ regions_list=[regions] * n_mod_files,
+ motifs=[motif] * n_mod_files,
+ sample_names=sample_names,
+ **kwargs,
+ )
+
+
+def get_enrichment_profiles(
+ mod_file_names: list[str | Path],
+ regions_list: list[str | Path | list[str | Path]],
+ motifs: list[str],
+ window_size: int,
+ single_strand: bool = False,
+ regions_5to3prime: bool = False,
+ smooth_window: int | None = None,
+ quiet: bool = False,
+ cores: int | None = None,
+) -> list[np.ndarray]:
+ """
+ Get the enrichment profile traces, ready for plotting.
+
+ This helper function can be useful during plot prototyping, when repeatedly building plots from the same data.
+ Its outputs can be passed as the first argument to make_enrichment_profile_plot().
+
+ TODO: I feel like this should be able to take in data directly as vectors/other datatypes, not just read from files.
+ TODO: Style-wise, is it cleaner to have it be a match statement or calling a method from a global dict? Cleaner here with a dict, cleaner overall with the match statements?
+ TODO: I think it's reasonable for smoothing min_periods to be always set to 1 for this method, as it's a visualization tool, not quantitative. Is this unreasonable?
+
+ Args:
+ mod_file_names: list of paths to modified base data files
+ bed_file_names: list of paths to bed files specifying centered equal-length regions
+ mod_names: list of modifications to extract; expected to match mods available in the relevant mod_files
+ window_size: half-size of the desired window to plot; how far the window stretches on either side of the center point
+ single_strand: True means we only grab counts from reads from the same strand as
+ the region of interest, False means we always grab both strands within the regions
+ regions_5to3prime: True means negative strand regions get flipped, False means no flipping
+ quiet: disables progress bars
+ cores: CPU cores across which to parallelize processing
+ smooth_window: size of the moving window to use for smoothing. If set to None, no smoothing is performed
+
+ Returns:
+ List of enrichment profile traces
+ """
+ if not utils.check_len_equal(mod_file_names, regions_list, motifs):
+ raise ValueError("Unequal number of inputs")
+ # TODO: redefinition error; still need to figure out how to do this elegantly in a way mypy likes
+ # dimelo/plot_enrichment_profile.py:53: error: Item "str" of "str | Path" has no attribute "suffix" [union-attr]
+ mod_file_names = [Path(fn) for fn in mod_file_names]
+
+ trace_vectors = []
+ for mod_file, regions, motif in zip(mod_file_names, regions_list, motifs):
+ match mod_file.suffix:
+ case ".gz":
+ modified_base_counts, valid_base_counts = (
+ load_processed.pileup_vectors_from_bedmethyl(
+ bedmethyl_file=mod_file,
+ regions=regions,
+ motif=motif,
+ window_size=window_size,
+ single_strand=single_strand,
+ regions_5to3prime=regions_5to3prime,
+ quiet=quiet,
+ cores=cores,
+ )
+ )
+ # Default to nan so we can skip over unfilled values when plotting or doing a rolling average
+ nans_everywhere = np.full_like(
+ modified_base_counts, np.nan, dtype=float
+ )
+ trace = np.divide(
+ modified_base_counts,
+ valid_base_counts,
+ out=nans_everywhere,
+ where=valid_base_counts != 0,
+ )
+ case ".fake":
+ trace = load_processed.vector_from_fake(
+ mod_file=mod_file,
+ bed_file=regions,
+ motif=motif,
+ window_size=window_size,
+ )
+ case _:
+ raise ValueError(f"Unsupported file type for {mod_file}")
+ if smooth_window is not None:
+ trace = utils.smooth_rolling_mean(trace, window=smooth_window)
+ trace_vectors.append(trace)
+ return trace_vectors
+
+
+def make_enrichment_profile_plot(
+ trace_vectors: list[np.ndarray],
+ sample_names: list[str],
+ **kwargs,
+) -> Axes:
+ """
+ Plot the given enrichment profile traces.
- # Smoothing options
- smoothing_args = parser.add_argument_group("smoothing options")
- smoothing_args.add_argument(
- "-t",
- "--smooth",
- type=int,
- default=DEFAULT_SMOOTH,
- help="window over which to smooth aggregate curve",
- )
- smoothing_args.add_argument(
- "-n",
- "--min_periods",
- type=int,
- default=DEFAULT_MIN_PERIODS,
- help="minimum number of bases to consider for smoothing",
- )
+ This helper function can be useful during plot prototyping, when repeatedly building plots from the same data.
+ The first argument should be the output of get_enrichment_profiles().
- # Plotting arguments
- plotting_args = parser.add_argument_group("plotting options")
- plotting_args.add_argument(
- "--colorA",
- type=str,
- default=COLOR_A,
- help='color in hex (e.g. "#BB4430") for mA',
- )
- plotting_args.add_argument(
- "--colorC",
- type=str,
- default=COLOR_C,
- help='color in hex (e.g. "#BB4430") for mCG',
- )
- plotting_args.add_argument(
- "--colors",
- type=str,
- nargs="+",
- default=COLOR_LIST,
- help='color list in hex (e.g. "#BB4430") for overlay plots',
- )
- plotting_args.add_argument(
- "-d",
- "--dotsize",
- type=float,
- default=DEFAULT_DOTSIZE,
- help="size of points",
- )
+ Args:
+ trace_vectors: list of enrichment profile traces
+ sample_names: list of names to use for labeling traces in the output; legend entries
+ kwargs: other keyword parameters passed through to utils.line_plot
- # Optional arguments
- parser.add_argument(
- "-A",
- "--threshA",
- type=int,
- default=DEFAULT_THRESH_A,
- help="threshold above which to call an A base methylated",
- )
- parser.add_argument(
- "-C",
- "--threshC",
- type=int,
- default=DEFAULT_THRESH_C,
- help="threshold above which to call a C base methylated",
- )
- parser.add_argument(
- "-w",
- "--windowSize",
- type=int,
- default=DEFAULT_WINDOW_SIZE,
- help="window size around center point of feature of interest to plot (+/-)",
- )
- parser.add_argument(
- "-p",
- "--cores",
- type=int,
- help="number of cores over which to parallelize",
+ Returns:
+ Axes object containing the plot
+ """
+ if not utils.check_len_equal(trace_vectors, sample_names):
+ raise ValueError("Unequal number of inputs")
+ axes = utils.line_plot(
+ indep_vector=np.arange(
+ -len(trace_vectors[0]) // 2,
+ len(trace_vectors[0]) // 2 + len(trace_vectors[0]) % 2,
+ ),
+ indep_name="pos",
+ dep_vectors=trace_vectors,
+ dep_names=sample_names,
+ y_label="fraction modified bases",
+ **kwargs,
)
-
- args = parser.parse_args()
- plot_enrichment_profile(**vars(args))
+ return axes
diff --git a/dimelo/plot_read_browser.py b/dimelo/plot_read_browser.py
new file mode 100644
index 00000000..4ead8395
--- /dev/null
+++ b/dimelo/plot_read_browser.py
@@ -0,0 +1,447 @@
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import plotly
+
+from . import load_processed, utils
+
+
+def plot_read_browser(
+ mod_file_name: str | Path,
+ region: str,
+ motifs: list[str],
+ thresh: int | float | None = None,
+ single_strand: bool = False,
+ sort_by: str | list[str] = "shuffle",
+ hover: bool = True,
+ subset_parameters: dict | None = None,
+ **kwargs,
+) -> plotly.graph_objs.Figure:
+ """
+ Plot base modifications on single reads in a high-quality, interactive-enabled fashion.
+
+ This method returns a plotly Figure object, which can be used in a number of ways to view and save
+ the figure in different formats. To view the figure interactively (in a notebook or python script),
+ simply call the show() method of the returned Figure object. See the helper methods below for saving
+ figures.
+
+ Additional keyword arguments will be passed down to collapse_rows() if sort_by == "collapse". See that
+ method for details.
+
+ Args:
+ mod_file_name: path to file containing modification data for single reads
+ region: region string specifying the region to plot
+ motifs: list of modifications to extract; expected to match mods available in the relevant mod_files
+ thresh: A modification calling threshold. While the browser always displays float probabilities, setting
+ this to a value will limit display to only modification events over the given threshold. Else, display
+ all modifications regardless of probability.
+ single_strand: True means we only grab counts from reads from the same strand as
+ the region of interest, False means we always grab both strands within the regions
+ sort_by: ordered list for hierarchical sort; see load_processed.read_vectors_from_hdf5() for details.
+ Can also pass the argument "collapse" to allow multiple reads on single rows of the browser, for a
+ more condensed visualization. Note that "collapse" is mutually exclusive with all other sorting options,
+ and is only allowed to be passed as a single string option.
+ hover: if False, disables display of information on mouse hover
+ subset_parameters: Parameters to pass to the utils.random_sample() method, to subset the
+ reads to be returned. If not None, at least one of n or frac must be provided.
+
+ Returns:
+ plotly Figure object containing the plot
+
+ TODO: Improve color specification? User should be able to set their own colors.
+ TODO: Should this let the user set arbitrary thresholds for each motif individually?
+ TODO: The way that "collapse" is specified is unintuitive and problematic; what if the user passes "collapse" as
+ an element in an array?
+ TODO: Is it worth having an option for meta-sorting of collapsed reads? It's here for now to enable testing.
+ """
+ # If asked to collapse reads, set up the initial read sorting appropriately and prep for later
+ collapse = False
+ if sort_by == "collapse":
+ collapse = True
+ sort_by = "read_start"
+
+ read_tuples, entry_labels, _ = load_processed.read_vectors_from_hdf5(
+ file=mod_file_name,
+ regions=region,
+ motifs=motifs,
+ single_strand=single_strand,
+ sort_by=sort_by,
+ calculate_mod_fractions=False,
+ subset_parameters=subset_parameters,
+ )
+
+ mod_vector_index = entry_labels.index("mod_vector")
+
+ if read_tuples[0][mod_vector_index].dtype == np.bool_:
+ raise ValueError(
+ "A threshold has been applied to this .h5 single read data. plot_read_browser must be used with an .h5 file extracted using thresh=None."
+ )
+
+ read_extent_df, mod_event_df = format_browser_data(
+ read_tuples=read_tuples, entry_labels=entry_labels
+ )
+
+ # Apply threshold to mod_event_df
+ if thresh is not None:
+ mod_event_df = mod_event_df[mod_event_df.prob > utils.adjust_threshold(thresh)]
+ else:
+ # Still need to filter out all values that are effectively 0, or the read bars cannot be seen
+ # TODO: This seems like the wrong place to be handling this.
+ mod_event_df = mod_event_df[mod_event_df.prob > utils.adjust_threshold(2)]
+
+ try:
+ chrom, (region_start, region_end, _) = utils.parse_region_string(
+ region=region, window_size=None
+ )
+ except ValueError as err:
+ raise ValueError(
+ "Invalid region specification: plot_read_browser requires a single genomic locus."
+ ) from err
+
+ fig = make_browser_figure(
+ read_extent_df=read_extent_df,
+ mod_event_df=mod_event_df,
+ collapse=collapse,
+ chrom=chrom,
+ region_start=region_start,
+ region_end=region_end,
+ hover=hover,
+ **kwargs,
+ )
+
+ return fig
+
+
+def format_browser_data(
+ read_tuples: list[tuple],
+ entry_labels: list[str],
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+ """
+ Take data from load_processed.read_vectors_from_hdf5() and format it for browser plotting.
+
+ Argument descriptions taken directly from documentation of load_processed.read_vectors_from_hdf5().
+
+ Args:
+ read_tuples: a list of tuples, each tuple containing all datasets corresponding to an individual read that
+ was within the specified regions.
+ entry_labels: a list of strings, naming the datasets returned.
+
+ Returns:
+ * dataframe defining the start, end, name, and desired y-index (sorting) of each read
+ * dataframe defining all necessary information to place each modification event on the browser
+
+ TODO: There is an observed issue where there are duplicated reads; duplicates are currently thrown away. This only
+ manifests itself when subsetting reads, as different random subsets of the same requested size will show up as
+ different numbers of rows.
+ """
+ # Coerce read tuples to initial dataframe, throwing away unnecessary columns
+ to_exclude = ["chromosome", "strand", "region_start", "region_end"]
+ read_df = pd.DataFrame.from_records(
+ read_tuples, columns=entry_labels, exclude=to_exclude
+ )
+
+ # For each row, pull out just the positions of valid bases and the probabilities at those positions
+ # TODO: I don't like using iterrows, but it seems silly to have two calls to apply that do basically the same thing redundantly; just looping once instead
+ prob_vectors = []
+ pos_vectors = []
+ for _, row in read_df.iterrows():
+ selector = row.val_vector == 1
+ all_positions = np.arange(len(row.mod_vector)) + row.read_start
+ prob_vectors.append(row.mod_vector[selector])
+ pos_vectors.append(all_positions[selector])
+ read_df["prob_vector"] = prob_vectors
+ read_df["pos_vector"] = pos_vectors
+
+ # assign reads y-axis values based on their unique names, preserving the pre-set order
+ # TODO: I'm pretty sure that however I do this, there will possibly be an "overdrawing" problem if the same read shows up multiple times. Do I care? Will it be taken care of by dropping duplicates?
+ read_df["y_index"] = read_df.read_name.map(
+ {k: v for v, k in enumerate(read_df.read_name.unique())}
+ )
+
+ # TODO: Dropping duplicates should hide the "overdrawing" problem when reads are duplicated in the dataset. Is this a problem or the intended behavior?
+ # Get two separate dataframes:
+ # * represents the read extents, to draw the grey lines
+
+ # Added the read_name subsetting to avoid an index mapping error in make_browser_figure
+ # caused by the read metadata for the same read with different motifs having slight
+ # start and end offsets in some cases, when using Dorado or PacBio data. These 1-5bp
+ # differences are caused by the assumptions made in parse_bam.extract h5 conversion
+ # and don't matter for much but do cause the full duplicate check to leave duplicates,
+ # which then cause non-unique indices for mapping in collapse mode
+ read_extent_df = read_df[
+ ["read_start", "read_end", "y_index", "read_name"]
+ ].drop_duplicates(subset=["read_name"])
+ # * represents the methylation events
+ mod_event_df = (
+ read_df[["y_index", "read_name", "motif", "pos_vector", "prob_vector"]]
+ .explode(["pos_vector", "prob_vector"])
+ .rename(columns={"pos_vector": "pos", "prob_vector": "prob"})
+ )
+
+ return read_extent_df, mod_event_df
+
+
+def collapse_rows(
+ read_extent_df: pd.DataFrame,
+ minimum_gap: int = 500,
+ meta_sort: str | None = "full_extent",
+) -> pd.Series:
+ """
+ Takes a sorted dataframe of read extents and collapses reads onto a smaller number of rows.
+
+ The input dataframe is expected to be sorted in a sensible fashion and pre-indexed with a set of
+ unique index values. This method has been tailored for and verified using read_start pre-sorting.
+ Behavior using other starting sorts may be undefined.
+
+ Optionally, performs a "meta-sort" of the resulting rows. Current options work as follows:
+ * full_extent: sort by full extent of the covered reads in the row (max end - min start);
+ rows covering a larger region at the bottom
+ * covered_bases: sort by number of bases covered by reads in the row;
+ rows covering more bases at the bottom
+ * None: no meta-sorting
+
+ Returns a series that maps the original indices on to the final collapsed and meta-sorted indices.
+ This series can be applied to the original data by using the pd.Series.map() method.
+
+ Args:
+ read_extent_df: read extent dataframe from format_browser_data()
+ minimum_gap: minimum number of bases allowed between the end of one read and the beginning of
+ the next for the two reads to be placed on the same row
+ meta_sort: type of meta sorting to do; one of ["full_extent", "covered_bases", None]
+
+ Returns:
+ Series mapping original indices to meta indices
+
+ TODO: This could be improved by checking for overlaps on both ends of the seed read for each row.
+ This might allow other types of pre-sorting to work more effectively.
+ """
+ # Sentinel value for un-indexed reads is -1
+ collapsed_indices = -np.ones(len(read_extent_df), dtype=int)
+
+ # Collapse reads
+ curr_y_idx = 0
+ for seed_read_idx in range(len(read_extent_df)):
+ # If seed read has been indexed already, move on
+ if collapsed_indices[seed_read_idx] != -1:
+ continue
+ collapsed_indices[seed_read_idx] = curr_y_idx
+
+ # Add any other non-indexed reads that fit onto the current row
+ curr_row_end = read_extent_df.iloc[seed_read_idx]["read_end"]
+ for other_read_idx in range(seed_read_idx + 1, len(read_extent_df)):
+ # If other read has been indexed already, move on
+ if collapsed_indices[other_read_idx] != -1:
+ continue
+ # If other read fits onto the current row, index it
+ if (
+ read_extent_df.iloc[other_read_idx]["read_start"]
+ > curr_row_end + minimum_gap
+ ):
+ collapsed_indices[other_read_idx] = curr_y_idx
+ curr_row_end = read_extent_df.iloc[other_read_idx]["read_end"]
+
+ curr_y_idx += 1
+
+ # Series mapping original indices to collapsed indices
+ idx_map_orig2collapse = pd.Series(
+ collapsed_indices, index=read_extent_df["y_index"]
+ )
+
+ if meta_sort is not None:
+ # Perform meta-sorting
+ match meta_sort:
+ case "full_extent":
+ # sort by full extent of the covered reads in the row (max end - min start)
+ idx_map_meta2collapse = (
+ read_extent_df.groupby(collapsed_indices)
+ .apply(
+ lambda row_group: row_group.read_end.max()
+ - row_group.read_start.min()
+ )
+ .sort_values(ascending=False)
+ .reset_index()["index"]
+ )
+ case "covered_bases":
+ # sort by number of bases covered by reads in the row
+ read_lengths = read_extent_df["read_end"] - read_extent_df["read_start"]
+ idx_map_meta2collapse = (
+ read_lengths.groupby(collapsed_indices)
+ .sum()
+ .sort_values(ascending=False)
+ .reset_index()["index"]
+ )
+ case _:
+ raise ValueError(f"Invalid meta sorting option: {meta_sort}")
+
+ # Series mapping collapsed indices to meta indices
+ idx_map_collapse2meta = pd.Series(
+ idx_map_meta2collapse.index.values, index=idx_map_meta2collapse
+ )
+
+ # Return Series mapping original indices to meta indices
+ return idx_map_orig2collapse.map(idx_map_collapse2meta)
+ else:
+ # Return Series mapping original indices to collapsed indices
+ return idx_map_orig2collapse
+
+
+def make_browser_figure(
+ read_extent_df: pd.DataFrame,
+ mod_event_df: pd.DataFrame,
+ collapse: bool,
+ chrom: str,
+ region_start: int,
+ region_end: int,
+ hover: bool = True,
+ **kwargs,
+) -> plotly.graph_objs.Figure:
+ """
+ Make a browser figure, using the provided pre-processed data
+
+ Additional keyword arguments will be passed down to collapse_rows() if collapse == True. See that
+ method for details.
+
+ Args:
+ read_extent_df: read extent dataframe from format_browser_data()
+ mod_event_df: mod event dataframe from format_browser_data()
+ collapse: if True, allows multiple reads on single rows of the browser for a more condensed
+ visualization.
+ chrom: chromosome of the region being browsed
+ region_start: start position of the region being browsed
+ region_end: end position of the region being browsed
+ hover: if False, disables display of information on mouse hover
+
+ TODO: Think about how this interfaces with different types of initial sorting...
+ TODO: Make it so that this method does NOT modify the input dataframe
+ TODO: Should this method do the collapsing, or should this method require collapsing outside?
+ """
+ if collapse:
+ index_map = collapse_rows(read_extent_df, **kwargs)
+ read_extent_df["y_index"] = read_extent_df["y_index"].map(index_map)
+ mod_event_df["y_index"] = mod_event_df["y_index"].map(index_map)
+
+ # Build final figure
+ # TODO: Enable setting some relevant parameters
+
+ # TODO: Understand all of the options here; are they all as desired?
+ layout = plotly.graph_objs.Layout(
+ barmode="overlay",
+ title=chrom,
+ hovermode="closest",
+ plot_bgcolor="rgba(0,0,0,0)",
+ xaxis=dict(range=[region_start, region_end]),
+ )
+ # TODO: I feel like there has to be a cleaner way to do this, maybe using plotly express, but I dont know and I'm just trying to get this done first. Lots of iterrows. Sad.
+ fig = plotly.graph_objects.Figure(layout=layout)
+ for _, row in read_extent_df.iterrows():
+ # TODO: How can I get the hover information for the reads to match the ones for the mod events? Not sure how to get customdata and hovertemplate working here.
+ fig.add_trace(
+ plotly.graph_objects.Scatter(
+ x=[row.read_start, row.read_end],
+ y=[row.y_index, row.y_index],
+ mode="lines",
+ line=dict(width=1, color="lightgrey"),
+ showlegend=False,
+ hoverinfo="text",
+ hovertext=row.read_name,
+ )
+ )
+ for motif_idx, (motif, motif_df) in enumerate(mod_event_df.groupby("motif")):
+ min_overall = motif_df["prob"].min()
+ max_overall = motif_df["prob"].max()
+ fig.add_trace(
+ plotly.graph_objs.Scatter(
+ x=motif_df["pos"],
+ y=motif_df["y_index"],
+ mode="markers",
+ showlegend=False,
+ customdata=motif_df[["read_name", "prob"]],
+ hovertemplate=" ".join(
+ [
+ "Read: %{customdata[0]}",
+ "Position: %{x:,}",
+ "Probability: %{customdata[1]:.2f}",
+ ]
+ ),
+ marker=dict(
+ size=4,
+ color=motif_df["prob"],
+ colorscale=utils.DEFAULT_COLORSCALES[motif],
+ colorbar=dict(
+ title=dict(
+ text=f"{motif} probability",
+ side="right",
+ ),
+ tickmode="array",
+ tickvals=[min_overall, max_overall],
+ ticktext=[
+ str(round(min_overall, 2)),
+ str(round(max_overall, 2)),
+ ],
+ ticks="outside",
+ thickness=15,
+ # TODO: Is this positioning system dumb?
+ x=1 + (motif_idx * 0.10),
+ ),
+ ),
+ )
+ )
+ if not hover:
+ fig.update_layout(hovermode=False)
+ return fig
+
+
+def save_static(
+ fig: plotly.graph_objs.Figure,
+ output_dir: str | Path,
+ output_basename: str,
+ format: str | list[str] = "pdf",
+ width: int = 1000,
+ height: int = 400,
+) -> None:
+ """
+ Helper function for saving static plot browser images.
+
+ Args:
+ fig: plotly figure to save
+ output_dir: directory in which to save images
+ output_basename: descriptive basename of output file (before file extension)
+ format: one or more valid output formats for plotly; for valid options, see
+ https://plotly.github.io/plotly.py-docs/generated/plotly.io.write_image.html
+ width: width of output image in pixels
+ height: height of output image in pixels
+ """
+ # sanitize and prep inputs
+ output_dir = Path(output_dir)
+ output_dir.mkdir(exist_ok=True)
+
+ if isinstance(format, str):
+ format = [format]
+
+ # write figures
+ for fmt in format:
+ fig.write_image(
+ output_dir / f"{output_basename}.{fmt}", width=width, height=height
+ )
+
+
+def save_interactive(
+ fig: plotly.graph_objs.Figure,
+ output_dir: str | Path,
+ output_basename: str,
+) -> None:
+ """
+ Helper function for saving interactive plot browsers.
+
+ Args:
+ fig: plotly figure to save
+ output_dir: directory in which to save images
+ output_basename: descriptive basename of output file (before file extension)
+ """
+ # sanitize inputs
+ output_dir = Path(output_dir)
+ output_dir.mkdir(exist_ok=True)
+
+ # write figure
+ fig.write_html(output_dir / f"{output_basename}.html", include_plotlyjs="cdn")
diff --git a/dimelo/plot_reads.py b/dimelo/plot_reads.py
new file mode 100644
index 00000000..c6a80f20
--- /dev/null
+++ b/dimelo/plot_reads.py
@@ -0,0 +1,136 @@
+"""
+I'm conflicted about how to handle some of this.
+
+There are two different ways of doing single read plotting: "rectangular" and "whole read".
+"rectangular" means displaying exactly the requested region.
+"whole read" means displaying the entirety of any read overlapping the requested region.
+Probably need separate methods for all of this? Is there shared functionality? Do they live in the same file? Etc.
+
+I'm beginning to lose the thread of where we check for regions making sense.
+Maybe this is an argument for an internal region class that makes checking easy? I don't know.
+"""
+
+from pathlib import Path
+
+import pandas as pd
+import seaborn as sns
+from matplotlib.axes import Axes
+
+from . import load_processed, utils
+
+
+def plot_reads(
+ mod_file_name: str | Path,
+ regions: str | Path | list[str | Path],
+ motifs: list[str],
+ window_size: int | None = None,
+ single_strand: bool = False,
+ regions_5to3prime: bool = False,
+ sort_by: str | list[str] = "shuffle",
+ thresh: float | None = None,
+ relative: bool = True,
+ **kwargs,
+) -> Axes:
+ """
+ Plots centered single reads as a scatterplot, cut off at the boundaries of the requested regions?
+
+ TODO: I feel like this should be able to take in data directly as vectors/other datatypes, not just read from files.
+ TODO: Style-wise, is it cleaner to have it be a match statement or calling a method from a global dict? Cleaner here with a dict, cleaner overall with the match statements?
+ TODO: So far, this is the only method to do plotting without utility methods. Is this reasonable? Is it that unique?
+
+ Args:
+ mod_file_name: path to file containing modification data for single reads
+ regions: path to bed file specifying regions to extract
+ motifs: list of modifications to extract; expected to match mods available in the relevant mod_files
+ window_size: we plot +-window_size//2 from the center of the region(s)
+ single_strand: True means we only grab counts from reads from the same strand as
+ the region of interest, False means we always grab both strands within the regions
+ regions_5to3prime: True means negative strand regions get flipped, False means no flipping. Only works if relative=True
+ sort_by: ordered list for hierarchical sort. Currently only smallest to biggest.
+ thresh: if no threshold has been applied already, this will threshold the mod calls for plotting (method is only boolean)
+ relative: if True, all regions are centered
+
+ Returns:
+ Axes object containing the plot
+ """
+ mod_file_name = Path(mod_file_name)
+ # bed_file_name = Path(bed_file_name)
+ size = kwargs.pop("s", 0.5)
+
+ palette = kwargs.pop("palette", {})
+
+ merged_palette = {**utils.DEFAULT_COLORS, **palette}
+
+ match mod_file_name.suffix:
+ # TODO: Fix how the fake reads options work, and make sure they have the same interface as the real ones.
+ # dimelo/plot_reads.py:63: error: Argument "regions" to "reads_from_fake" has incompatible type "str | Path | list[str | Path]"; expected "Path" [arg-type]
+ # Will also fix the following error:
+ # dimelo/plot_reads.py:68: error: Incompatible types in assignment (expression has type "dict[Any, Any] | None", variable has type "dict[Any, Any]") [assignment]
+ case ".fake":
+ reads, read_names, mods, regions_dict = load_processed.reads_from_fake(
+ file=mod_file_name,
+ regions=regions,
+ motifs=motifs,
+ )
+ case _:
+ reads, read_names, mods, regions_dict = (
+ load_processed.readwise_binary_modification_arrays(
+ file=mod_file_name,
+ regions=regions,
+ motifs=motifs,
+ window_size=window_size,
+ single_strand=single_strand,
+ regions_5to3prime=regions_5to3prime,
+ thresh=thresh,
+ relative=relative,
+ sort_by=sort_by,
+ )
+ )
+
+ # Convert data frame where each row represents a read to a data frame where each row represents a single modified position in a read
+ df = pd.DataFrame({"read_name": read_names, "mod": mods, "pos": reads}).explode(
+ "pos"
+ )
+ axes = sns.scatterplot(
+ data=df,
+ x="pos",
+ y="read_name",
+ hue="mod",
+ # palette=colors,
+ s=size,
+ marker="s",
+ linewidth=0,
+ palette=merged_palette,
+ **kwargs,
+ )
+ # Retrieve the existing legend
+ legend = axes.legend_
+
+ # Retrieve legend handles and labels
+ handles, labels = axes.get_legend_handles_labels()
+
+ # Update legend properties
+ # TODO: Do we need to do this now and after?
+ if legend is not None:
+ legend.set_title("Mod")
+
+ # Update marker size for all handles
+ for handle in handles:
+ if hasattr(handle, "set_markersize"):
+ handle.set_markersize(10) # Set a larger marker size for legend
+
+ # Re-apply the legend with updated handles
+ # TODO: Is this step necessary?
+ axes.legend(handles, labels, title="Mod")
+
+ # TODO: Technically, regions_dict can be None by this point. In that scenario, it will error out when checking the length.
+ # It can be None according to type hints but in the actual logical flow I believe it cannot be None
+ # However, we can easily just check whether it is None here as well, in case we change behavior elsewhere.
+ # Identified with mypy through the following error:
+ # dimelo/plot_reads.py:101: error: Argument 1 to "len" has incompatible type "dict[Any, Any] | None"; expected "Sized" [arg-type]
+ if relative and regions_dict is not None and len(regions_dict) > 0:
+ region1_start, region1_end, _ = next(iter(regions_dict.values()))[0]
+ effective_window_size = (region1_end - region1_start) // 2
+ axes.set_xlim([-effective_window_size, effective_window_size])
+
+ return axes
diff --git a/dimelo/qc.py b/dimelo/qc.py
deleted file mode 100644
index 61ebb3a9..00000000
--- a/dimelo/qc.py
+++ /dev/null
@@ -1,470 +0,0 @@
-r"""
-=================
-QC module
-=================
-.. currentmodule:: dimelo.qc_report
-.. autosummary::
- qc_report
-
-qc_report provides a detailed summary report of many important quality control information including read length, mapping quality, etc.
-
-"""
-import argparse
-import multiprocessing
-import os
-import sqlite3
-import time
-from math import log
-
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-import pysam
-from joblib import Parallel, delayed
-from tqdm import tqdm
-
-from dimelo.parse_bam import make_db
-from dimelo.utils import execute_sql_command
-
-DEFAULT_COLOR_LIST = [
- "#BB4430",
- "#FFBC0A",
- "#053C5E",
- "#A9E5BB",
- "#610345",
- "#2D1E2F",
- "#559CAD",
- "#5E747F",
- "#F343F4",
-]
-
-
-def batch_read_generator(file_bamIn, filename):
- counter = 0
- r_list = []
-
- lines = pysam.idxstats(filename).splitlines()
- total_reads = sum(
- [
- int(line.split("\t")[2])
- for line in lines
- if not line.startswith("#")
- ]
- )
- batch_size = 0.1 * total_reads
-
- for read in file_bamIn.fetch(until_eof=True):
- r = [
- read.query_name,
- read.reference_name,
- read.reference_start,
- read.reference_end,
- read.query_length,
- "-" if read.is_reverse else "+",
- read.mapping_quality,
- ave_qual(read.query_qualities),
- ave_qual(read.query_alignment_qualities),
- ]
-
- r = tuple(r)
- if counter < batch_size:
- r_list.append(r)
- counter += 1
- else:
- yield r_list
- counter = 0
- r_list = [r]
- yield r_list
-
-
-def logger(statement):
- print(statement)
-
-
-def prob_bin(bin):
- # probability a base in the window (or across reads or across bases within a read) is methylated by:
- # calculating probability that no base in the window (or across reads) is methylated and then taking the complement
- # treat p=1 as 254/255 for prevent log(0)
- probs = [
- np.log(1 - p) for p in bin if ((p < 1) and (p >= 0.5))
- ] # only consider probabilities > 0.5 and handle 1 on next line
- probs1 = [np.log(1 - 254 / 255) for p in bin if p == 1]
- probsAll = probs + probs1
- prob = 1 - np.exp(sum(probsAll))
- return prob
-
-
-def errs_tab(n):
- """Generate list of error rates for qualities less than equal than n."""
- return [10 ** (q / -10) for q in range(n + 1)]
-
-
-def ave_qual(quals, qround=False, tab=errs_tab(129)):
- """Calculate average basecall quality of a read.
- Receive the integer quality scores of a read and return the average quality for that read
- First convert Phred scores to probabilities,
- calculate average error probability
- convert average back to Phred scale
- """
- if quals:
- mq = -10 * log(sum([tab[q] for q in quals]) / len(quals), 10)
- if qround:
- return round(mq)
- else:
- return mq
- else:
- return None
-
-
-def parse_bam_read(bamIn, sampleName, outDir, cores=None):
- file_bamIn = pysam.AlignmentFile(bamIn, "rb")
-
- DB_NAME, tables = make_db(bamIn, sampleName, outDir, qc=True)
- template_command = (
- """INSERT INTO """ + tables[0] + """ VALUES(?,?,?,?,?,?,?,?,?);"""
- )
- connect = sqlite3.connect(DB_NAME, timeout=60.0, check_same_thread=False)
- cores_avail = multiprocessing.cpu_count()
- if cores is None:
- num_cores = cores_avail
- else:
- # if more than available cores is specified, process with available cores
- if cores > cores_avail:
- num_cores = cores_avail
- else:
- num_cores = cores
-
- c = connect.cursor()
- c.execute("BEGIN TRANSACTION")
-
- Parallel(n_jobs=num_cores, backend="threading")(
- delayed(execute_sql_command)(template_command, DB_NAME, i, connect)
- for i in tqdm(
- batch_read_generator(file_bamIn, bamIn),
- total=10,
- desc="Processing reads",
- unit=" batches",
- )
- )
-
- c.close()
- connect.close()
- return DB_NAME, tables[0]
-
-
-def get_runtime(f, inp1, inp2, inp3):
- start = time.time()
- re_val = f(inp1, inp2, inp3)
- time.sleep(1)
- end = time.time()
- return f"Runtime of the program is {end - start}", re_val
-
-
-def qc_plot(x, sampleName, plotType, colors, num, axes):
- an_array = np.array(x)
- if all(v is None for v in an_array):
- return []
- q1 = np.quantile(an_array, 0.25)
- q3 = np.quantile(an_array, 0.75)
- iq = q3 - q1
- outlier = q3 + 3 * iq
- not_outlier = an_array <= outlier
- no_outliers = an_array[not_outlier]
-
- ptype = ""
- unit = ""
- xlabel = ""
- hasN50 = False
- if plotType == "L":
- ptype = " Read Length"
- xlabel = "Read Length (bp)"
- unit = " bp"
- hasN50 = True
- n50 = calculate_N50(x)
- elif plotType == "M":
- ptype = " Mapping Quality"
- xlabel = "Mapping Quality"
- elif plotType == "B":
- ptype = " Basecall Quality"
- xlabel = "Average Basecall Quality"
- elif plotType == "A":
- ptype = " Alignment Quality"
- xlabel = "Average Alignment Quality"
- plt.hist(no_outliers, bins=200, color=colors[6], density=True) #
- plt.axvline(
- x.median(),
- color=colors[0],
- linestyle="dashed",
- linewidth=1.3,
- label="median: " + str(round(x.median())) + unit,
- )
- plt.axvline(
- x.mean(),
- color=colors[2],
- linestyle="dashed",
- linewidth=1.3,
- label="mean: " + str(round(x.mean())) + unit,
- )
- if hasN50:
- plt.axvline(
- n50,
- color=colors[1],
- linestyle="dashed",
- linewidth=1,
- label="N50: " + str(round(n50)) + unit,
- )
- plt.title(ptype)
- plt.xlabel(xlabel)
- plt.ylabel("Frequency")
- plt.xlim(
- 0,
- )
- plt.plot([], [], " ", label="max: " + str(round(max(x))) + unit)
- plt.legend()
-
- values = [
- round(min(x)),
- round(q1),
- round(x.median()),
- round(q3),
- round(max(x)),
- round(x.mean()),
- ]
- return values
-
-
-def calculate_N50(x):
- array_rl = np.array(x)
- N = np.sum(array_rl)
- array_rl[::-1].sort()
- rl_cumsum = np.cumsum(array_rl)
- n50 = array_rl[np.argmax(rl_cumsum > N / 2)]
- return n50
-
-
-def qc_report(
- fileNames,
- sampleNames,
- outDir,
- colors=DEFAULT_COLOR_LIST,
- cores=None,
-):
-
- """
- fileNames
- list of names of bam files; indexed; or single file name as string
- sampleNames
- list of names of samples for output plot name labelling; or single sample name as string; valid names contain [``a-zA-Z0-9_``].
- outDir
- directory to output QC summary report
- cores
- number of cores over which to parallelize; default is all available
- colors
- color list in hex for overlay plots; default is:
- ["#BB4430","#FFBC0A","#053C5E","#A9E5BB","#610345",
- "#2D1E2F","#559CAD","#5E747F","#F343F4"]
-
- **Example**
-
- For single sample:
-
- >>> dm.qc_report("dimelo/test/data/mod_mappings_subset.bam", "test", "dimelo/dimelo_test")
-
- For multiple sample files:
-
- >>> dm.qc_report(["dimelo/test/data/mod_mappings_subset.bam", "dimelo/test/data/winnowmap_guppy_merge_subset.bam"], ["test1", "test2"], "dimelo/dimelo_test")
-
- **Return**
-
- * PDF of QC Summary Report which includes:
- * read length histogram
- * mapping quality histogram
- * average alignment quality per read histogram (if basecaller provided information)
- * average basecall quality per read histogram (if basecaller provided information)
- * summary table describing spread of data
- * number of reads, number of basepairs
-
- Returns a SQL database in the specified output directory. Database can be converted into pandas dataframe with:
-
- >>> fileName = "dimelo/test/data/mod_mappings_subset.bam"
- >>> sampleName = "test"
- >>> outDir = "dimelo/dimelo_test"
- >>> all_reads = pd.read_sql("SELECT * from reads_" + sampleName, sqlite3.connect(outDir + "/" + fileName.split("/")[-1].replace(".bam", "") + ".db"))
-
-
- After QC, each database contains this table with columns listed below:
-
- reads_sampleName
- * name
- * chr
- * start
- * end
- * length
- * strand
- * mapq
- * ave_baseq
- * ave_alignq
-
- **Example Plots**
- :ref:`sphx_glr_auto_examples_plot_qc_example.py`
-
- """
- if not os.path.isdir(outDir):
- os.makedirs(outDir)
-
- if type(fileNames) != list:
- fileNames = [fileNames]
- sampleNames = [sampleNames]
-
- for index in range(len(fileNames)):
- filebamIn = fileNames[index]
- sampleName = sampleNames[index]
- DB_NAME, TABLE_NAME = parse_bam_read(
- filebamIn, sampleName, outDir, cores
- )
-
- if sampleName is None:
- sampleName = DB_NAME.split("/")[-1][:-3]
-
- plot_feature_df = pd.read_sql(
- "SELECT * from " + TABLE_NAME, con=sqlite3.connect(DB_NAME)
- )
-
- fig = plt.figure(figsize=(12, 10))
- grid = plt.GridSpec(3, 2, figure=fig)
-
- ax_5 = plt.subplot2grid(shape=(3, 2), loc=(0, 0), colspan=2)
- ax_5.axis("off")
-
- keep_values = [0, 0, 0, 0]
- valRL = []
- valMQ = []
- valBQ = []
- valAQ = []
-
- # Read Length
- x = plot_feature_df["length"]
- ax_1 = fig.add_subplot(grid[0, 0])
- valRL = qc_plot(x, sampleName, "L", colors, 1, ax_1)
- if valRL:
- keep_values[0] = 1
-
- # Mapping Quality
- x = plot_feature_df["mapq"]
- ax_2 = fig.add_subplot(grid[0, 1])
- valMQ = qc_plot(x, sampleName, "M", colors, 2, ax_2)
- if valMQ:
- keep_values[1] = 1
-
- # Basecall Quality
- x = plot_feature_df["ave_baseq"]
- ax_3 = fig.add_subplot(grid[1, 0])
- valBQ = qc_plot(x, sampleName, "B", colors, 3, ax_3)
- if valBQ:
- keep_values[2] = 1
-
- # Alignment Quality
- x = plot_feature_df["ave_alignq"]
- ax_4 = fig.add_subplot(grid[1, 1])
- valAQ = qc_plot(x, sampleName, "A", colors, 4, ax_4)
- if valAQ:
- keep_values[3] = 1
-
- val_table = [valRL, valMQ, valBQ, valAQ]
- val_table_new = []
- cols = [
- "Read Length",
- "Mapping Quality",
- "Basecall Quality",
- "Alignment Quality",
- ]
- columns = []
- axes_stored = [ax_1, ax_2, ax_3, ax_4]
- for i in range(len(keep_values)):
- if keep_values[i] == 1:
- val_table_new.append(val_table[i])
- columns.append(cols[i])
- else:
- plt.delaxes(axes_stored[i])
-
- report_table = np.array(val_table_new).T
-
- rows = ["Min", "25%", "Median", "75%", "Max", "Mean"]
- print("mean length: ", valRL[5])
- print("num reads: ", len(x))
- print("num bases: ", round(valRL[5] * len(x)))
-
- if len(columns) <= 2:
- ax_5 = plt.subplot2grid(shape=(3, 2), loc=(1, 0), colspan=2)
- else:
- ax_5 = plt.subplot2grid(shape=(3, 2), loc=(2, 0), colspan=2)
- ax_5.axis("off")
- ax_5.table(
- cellText=report_table,
- rowLabels=rows,
- colLabels=columns,
- loc="center",
- )
-
- fig.tight_layout(w_pad=2, h_pad=4)
-
- summary_data = "mean length: " + str(valRL[5]) + " bp"
- summary_data = summary_data + "; num reads: " + str(len(x))
- summary_data = (
- summary_data + "; " + "num bases: " + str(round(valRL[5] * len(x)))
- )
- fig.suptitle(sampleName + " QC Summary Report", y=1.05)
-
- plt.title(summary_data, y=0.8)
-
- # saving as PDF
- final_file_name = outDir + "/" + sampleName + "_qc_report"
- plt.savefig(final_file_name + ".pdf", bbox_inches="tight")
- plt.close()
-
- print("QC report located at: " + final_file_name + ".pdf")
- print("Database located at: " + DB_NAME)
-
-
-def main():
- parser = argparse.ArgumentParser(description="Generate DiMeLo qc report")
-
- # Required arguments
- required_args = parser.add_argument_group("required arguments")
- required_args.add_argument(
- "-f", "--fileNames", required=True, nargs="+", help="bam file name(s)"
- )
- required_args.add_argument(
- "-s",
- "--sampleNames",
- required=True,
- nargs="+",
- help="sample name(s) for output labelling",
- )
- required_args.add_argument(
- "-o",
- "--outDir",
- required=True,
- help="directory to output QC summary report",
- )
-
- # Plotting arguments
- plotting_args = parser.add_argument_group("plotting options")
- plotting_args.add_argument(
- "--colors",
- type=str,
- nargs="+",
- default=DEFAULT_COLOR_LIST,
- help='color list in hex (e.g. "#BB4430") for overlay plots',
- )
-
- # Optional arguments
- parser.add_argument(
- "-p",
- "--cores",
- type=int,
- help="number of cores over which to parallelize",
- )
-
- args = parser.parse_args()
- qc_report(**vars(args))
diff --git a/dimelo/run_modkit.py b/dimelo/run_modkit.py
new file mode 100644
index 00000000..c49106c3
--- /dev/null
+++ b/dimelo/run_modkit.py
@@ -0,0 +1,422 @@
+import os
+
+# I believe that pty does not currently work on Windows, although this may change in future releases: https://bugs.python.org/issue41663
+# However, it may be that pywinpty, which is installable from pip, would work fine. That just needs to be tested with a Windows machine
+# My current thinking is to wait on this until Nanopore puts Windows executables on Anaconda: https://anaconda.org/nanoporetech/modkit
+import pty
+import re
+import select
+import subprocess
+import sys
+from pathlib import Path
+from typing import Optional, cast
+
+from tqdm.auto import tqdm
+
+# This should be updated in tandem with the environment.yml nanoporetech::modkit version
+EXPECTED_MODKIT_VERSION = "0.2.4"
+
+"""
+Import checks
+"""
+# Add conda env bin folder to path if it is not already present
+# On some systems, the directory containing executables for the active environment isn't automatically on the path
+# If this is the case, add that directory to the path so modkit can run
+current_interpreter = sys.executable
+env_bin_path = os.path.dirname(current_interpreter)
+if env_bin_path not in os.environ["PATH"]:
+ print(
+ f"PATH does not include the conda environment /bin folder. Adding {env_bin_path}."
+ )
+ os.environ["PATH"] = f"{env_bin_path}:{os.environ['PATH']}"
+ print(f"PATH is now {os.environ['PATH']}")
+
+# Check modkit on first import: does it run; does it have the right version
+try:
+ result = subprocess.run(["modkit", "--version"], stdout=subprocess.PIPE, text=True)
+ modkit_version = result.stdout
+ if modkit_version.split()[1] == EXPECTED_MODKIT_VERSION:
+ pass
+ # print(f"modkit found with expected version {EXPECTED_MODKIT_VERSION}")
+ else:
+ print(
+ f"modkit found with unexpected version {modkit_version.split()[1]}. Versions other than {EXPECTED_MODKIT_VERSION} may exhibit unexpected behavior. It is recommended that you use v{EXPECTED_MODKIT_VERSION}"
+ )
+except subprocess.CalledProcessError:
+ print(
+ 'Executable not found for modkit. Install dimelo using "conda env create -f environment.yml" or install modkit manually to your conda environment using "conda install nanoporetech::modkit==0.2.4". Without modkit you cannot run parse_bam functions.'
+ )
+
+
+def run_with_progress_bars(
+ command_list: list[str],
+ input_file: Path,
+ ref_genome: Path,
+ motifs: list[str],
+ load_fasta_regex: str,
+ find_motifs_regex: str,
+ contigs_progress_regex: str,
+ single_contig_regex: str,
+ buffer_size: int = 50,
+ progress_granularity: int = 10,
+ done_str: str = "Done",
+ err_str: str = "Error",
+ expect_done: bool = False,
+ quiet: bool = False,
+) -> str:
+ r"""
+ This function runs modkit with subprocess / pseudoterminal and grabs the progress outputs to populate progress bars
+
+ Args:
+ command_list: a list of commands to pass to subprocess: [modkit, pileup, ...] or [modkit, extract, ...]
+ load_fasta_regex: a regular expression that captures the contig being loaded in the step where modkit
+ reads fasta sequence. Should specify all the output context
+ so that groups aren't captured unless there is whitespace on either end i.e. the whole output
+ e.g. r'\s+\[.*?\]\s+(\d+)\s+Reading' for pileup in 0.2.4
+ input_file: the bam file you are processing
+ ref_genome: the reference genome to which your bam is aligned
+ motifs: the list of motifs you are looking for
+ find_motifs_regex: a regular expression that captures contigs-so-far and total-contigs-to-process
+ in the step where modkit is finding motifs throughout the genome. Should specify all the output context
+ so that groups aren't captured unless there is whitespace on either end i.e. the whole output
+ has been loaded into the buffer
+ e.g. r'\s+(\d+)/(\d+)\s+finding\s+([A-Za-z0-9,]+)\s+motifs' for pileup in 0.2.4
+ contigs_progress_regex: a regular expression that captures currently-processing-contig and total-contigs
+ -to-process in the step where modkit is running through the bam file. Should specify all the output context
+ so that groups aren't captured unless there is whitespace on either end i.e. the whole output
+ e.g. r'\s+(\d+)/(\d+)\s+contigs' for pileup in 0.2.4
+ single_contig_regex: a regular expression that captures reads-processed, reads-total, and contig-name for
+ a contig that is being processing from the bam file. Should specify all the output context
+ so that groups aren't captured unless there is whitespace on either end i.e. the whole output
+ e.g. r'\s+(\d+)/(\d+)\s+processing\s+([\w]+)[^\w]' for pileup in 0.2.4
+ buffer_size: the length of the string that the modkit stderr output gets saved into. This size will not
+ be respected if you hit Done or Error; in that case the rest of the output will be captured and returned
+ or raised.
+ progress_granularity: this tells the function how often to check the output buffer string for the various regex.
+ Less frequent checking is good because it means fewer spurious updates and less overhead. However you
+ need this to be sufficiently less than buffer_size that you can always capture the entirety of your
+ relevant information.
+ done_str: a string telling the function what to look for to know that modkit is done processing. Everything
+ after this will get returned
+ err_str: a string telling the function what to look for to know that modkit has encountered an error. Everything
+ after this will be raised as a ValueError
+ expect_done: specifies whether the command is expected to show a clear "Done" at the end of the output
+ quiet: sending True will suppress all progress bars and stdout outputs.
+
+ Returns:
+ The command line stderr output string after the point where we detect modkit is done parsing
+ """
+
+ # modkit 0.2.4 does not like gzipped or bgzipped fasta files
+ with open(ref_genome, "rb") as f:
+ if f.read(2) == b"\x1f\x8b":
+ raise ValueError(
+ f"{ref_genome.name} is gzipped, which will cause modkit to fail.\ngunzip {ref_genome.name} and try again."
+ )
+
+ # Set up progress bar variables to display progress updates when not in quiet mode
+ format_pre = "{bar}| {desc} {percentage:3.0f}% | {elapsed}"
+ format_contigs = "{bar}| {desc} {percentage:3.0f}% | {elapsed}<{remaining}"
+ format_chr = "{bar}| {desc} {percentage:3.0f}%"
+ pbar_pre: Optional[tqdm] = None
+ pbar_contigs: Optional[tqdm] = None
+ pbar_chr: Optional[tqdm] = None
+
+ # TODO: Is this the correct type annotation? I think it is, based on approx. line 280
+ finding_progress_dict: dict[str, tuple[int, int]] = {}
+ in_contig_progress = (0, 1)
+ total_contigs = 0
+
+ # Set up output buffer variables to capture modkit output
+ buffer_bytes = bytearray()
+ tail_buffer = ""
+
+ # Set up flags for modkit error / modkit done from text outputs
+ err_flag = False
+ done_flag = False
+
+ # Create a pseudo-terminal in which to run the modkit subprocess
+ master_fd, slave_fd = pty.openpty()
+
+ # Start modkit subprocess with the slave end as stdio
+ process = subprocess.Popen(
+ command_list,
+ stdin=slave_fd,
+ stdout=slave_fd,
+ stderr=subprocess.STDOUT,
+ close_fds=True,
+ )
+ os.close(slave_fd)
+
+ readout_count = 0
+ progress_bars_initialized = False
+ region_parsing_started = False
+
+ # Grab output bytes for as long as they're coming
+ while True:
+ # Wait for the process to be ready to provide bytes
+ ready, _, _ = select.select([master_fd], [], [], 0.1)
+ if ready:
+ try:
+ # Read a single byte
+ data = os.read(master_fd, 1)
+ if not data:
+ break # No more data
+
+ if quiet:
+ # If we are in quiet mode, nothing gets grabbed
+ continue
+ else:
+ # Create the progress bars when first entering this code block
+ if not progress_bars_initialized:
+ pbar_pre = tqdm(
+ total=100,
+ desc=f"Step 1: Identify motif locations in {ref_genome.name}",
+ bar_format=format_pre,
+ )
+ pbar_contigs = tqdm(
+ total=100,
+ desc=f"Step 2: Parse regions in {input_file.name}",
+ bar_format=format_contigs,
+ )
+ pbar_chr = tqdm(
+ total=100,
+ desc="",
+ bar_format=format_chr,
+ )
+ progress_bars_initialized = True
+
+ buffer_bytes += data # Accumulate bytes in the buffer
+
+ try:
+ # Try to decode the accumulated bytes
+ # This will throw a UnicodeDecodeError if not complete, which is ok! Then we just continue on
+ text = buffer_bytes.decode("utf-8")
+ readout_count += 1
+ buffer_bytes.clear() # Clear the buffer after successful decoding
+ # If we have hit an error or modkit is done, just accumulate the rest of the output and then deal with it:
+ # no need to check the progress tracking stuff in that case
+ if err_flag or done_flag:
+ tail_buffer += text
+ # If we haven't hit an error or a done state, first check for that
+ else:
+ tail_buffer = (tail_buffer + text)[-buffer_size:]
+ if err_str in tail_buffer:
+ index = tail_buffer.find(err_str)
+ tail_buffer = tail_buffer[index:]
+ err_flag = True
+ elif done_str in tail_buffer:
+ index = tail_buffer.find(done_str)
+ tail_buffer = tail_buffer[index - 2 :]
+ done_flag = True
+ # If the process is ongoing, then go through the possible cases and create/adjust pbars accordingly
+ # We only sometimes want to update progress because otherwise the constant updates slow us down
+ elif (
+ readout_count % progress_granularity == 0
+ and progress_bars_initialized
+ ):
+ region_parsing_started, in_contig_progress = (
+ update_progress_bars(
+ pbar_pre=pbar_pre,
+ pbar_contigs=pbar_contigs,
+ pbar_chr=pbar_chr,
+ tail_buffer=tail_buffer,
+ contigs_progress_regex=contigs_progress_regex,
+ single_contig_regex=single_contig_regex,
+ find_motifs_regex=find_motifs_regex,
+ load_fasta_regex=load_fasta_regex,
+ region_parsing_started=region_parsing_started,
+ in_contig_progress=in_contig_progress,
+ finding_progress_dict=finding_progress_dict,
+ ref_genome=ref_genome,
+ input_file=input_file,
+ motifs=motifs,
+ )
+ )
+
+ except UnicodeDecodeError:
+ # If decoding fails, continue accumulating bytes
+ continue
+ except Exception as e:
+ raise e
+ except OSError:
+ break
+
+ # After the data stops coming, we wait until the process is done so we can grab the return code
+ process.wait()
+ return_code = process.returncode
+ # Modkit gives return code 0 if it terminates successfully; any other return code should be raised
+ # This catches system kills caused by memory and disk space
+ if return_code != 0 or err_flag:
+ if err_flag:
+ print(tail_buffer)
+ if return_code == -9:
+ print(
+ "It looks like the process was killed with SIGKILL. This is often due to the system running out of memory.",
+ "\nConsider setting 'cores=1' to reduce resource usage or reducing the data size being processed.",
+ "\nNote that pileup and extract both create large intermediate plain-text files before reducing down to a compressed and indexed format.",
+ "\nRunning out of disk space can indirectly cause memory issues."
+ "\nYou might also want to check if there are any resource limits set for your user or in the environment where the code is running.",
+ )
+ raise subprocess.CalledProcessError(
+ return_code, command_list, output=tail_buffer
+ )
+
+ # If modkit gives return code 0, it can still have had an unusual state.
+ # If the expected done flag was seen, or if the command doesn't have a done
+ # string (i.e. extract), update the progress bars to reflect the final status
+ # If the progress bars are not initialized, then the code must have been run in
+ # quiet mode
+ elif done_flag or not expect_done:
+ if progress_bars_initialized:
+ pbar_pre = cast(tqdm, pbar_pre)
+ pbar_contigs = cast(tqdm, pbar_contigs)
+ pbar_chr = cast(tqdm, pbar_chr)
+ pbar_pre.close()
+ pbar_contigs.n = 100
+ pbar_contigs.set_description(
+ f"Step 2 complete. {total_contigs} contigs processed from {input_file.name}"
+ )
+ pbar_contigs.refresh()
+ pbar_contigs.close()
+ pbar_chr.n = 100
+ ansi_escape_pattern = re.compile(r"(\[2K>)")
+ pbar_chr.set_description(
+ command_list[0]
+ + " "
+ + command_list[1]
+ + " return code "
+ + str(return_code)
+ + " | "
+ + ansi_escape_pattern.sub("", tail_buffer).strip()
+ )
+ pbar_chr.refresh()
+ pbar_chr.close()
+ return tail_buffer
+ # Indicate unusual state
+ else:
+ if progress_bars_initialized:
+ pbar_pre = cast(tqdm, pbar_pre)
+ pbar_contigs = cast(tqdm, pbar_contigs)
+ pbar_chr = cast(tqdm, pbar_chr)
+ pbar_pre.close()
+ pbar_contigs.set_description("Unexpected modkit outputs")
+ pbar_contigs.refresh()
+ pbar_contigs.close()
+ pbar_chr.set_description(
+ command_list[0]
+ + " "
+ + command_list[1]
+ + " return code "
+ + str(return_code)
+ )
+ pbar_chr.refresh()
+ pbar_chr.close()
+ print(
+ 'WARNING: the modkit command may not have completed normally. Consider re-running with "log=True" if you do not get the expected outputs.'
+ )
+ return tail_buffer
+
+
+def update_progress_bars(
+ pbar_pre,
+ pbar_contigs,
+ pbar_chr,
+ tail_buffer,
+ contigs_progress_regex,
+ single_contig_regex,
+ find_motifs_regex,
+ load_fasta_regex,
+ region_parsing_started,
+ in_contig_progress,
+ finding_progress_dict,
+ ref_genome,
+ input_file,
+ motifs,
+):
+ # We check these in the reverse order from that in which they occur, which I guess will save a tiny
+ # amount of processing time because we don't check for previous steps when on later steps
+ # Once we are in the contig progress stage, step 1 is done by definition
+ if contigs_progress_matches := re.search(contigs_progress_regex, tail_buffer):
+ # If we get here we can be sure the pbars are initialized
+ pbar_pre = cast(tqdm, pbar_pre)
+ pbar_contigs = cast(tqdm, pbar_contigs)
+ if not region_parsing_started:
+ # These are now no longer indicating future steps, but rather counting the actual
+ # time for step 2
+ pbar_contigs.reset()
+ pbar_chr.reset()
+ region_parsing_started = True
+ # Now that region parsing has started, we can close out the preprocessing pbar
+ pbar_pre.n = 100
+ pbar_pre.set_description(
+ f"Step 1 complete. Located {motifs} in {ref_genome.name}"
+ )
+ pbar_pre.refresh()
+ pbar_pre.close()
+ # This progress bar tracks how many contigs/chromosomes have been processed
+ current_contig = int(contigs_progress_matches.group(1))
+ total_contigs = int(contigs_progress_matches.group(2))
+ pbar_contigs.n = (
+ (
+ 100
+ * (
+ current_contig
+ + (
+ in_contig_progress[0] / in_contig_progress[1]
+ if in_contig_progress[1] > 0
+ else 0
+ )
+ )
+ )
+ / total_contigs
+ if total_contigs > 0
+ else 0
+ )
+ pbar_contigs.set_description(
+ f"Step 2: parsing {current_contig}/{total_contigs} from {input_file.name}"
+ )
+ pbar_contigs.refresh()
+ elif region_parsing_started and (
+ single_contig_matches := re.search(single_contig_regex, tail_buffer)
+ ):
+ # If we get here we can be sure the pbars are initialized
+ pbar_chr = cast(tqdm, pbar_chr)
+ # This progress bar tracks reads processed within a chromosomes
+ chromosome = single_contig_matches.group(3)
+ reads_done = int(single_contig_matches.group(1))
+ reads_total = int(single_contig_matches.group(2))
+ pbar_chr.n = 100 * reads_done / reads_total if reads_total > 0 else 0
+ in_contig_progress = (reads_done, reads_total)
+ pbar_chr.set_description(
+ f"Step 2: {chromosome} {reads_done}/{reads_total} chunks processed"
+ )
+ pbar_chr.refresh()
+
+ elif find_motifs_matches := re.search(find_motifs_regex, tail_buffer):
+ # If we get here we can be sure the pbars are initialized
+ pbar_pre = cast(tqdm, pbar_pre)
+ finding_progress_dict[find_motifs_matches.group(3)] = (
+ int(find_motifs_matches.group(1)),
+ int(find_motifs_matches.group(2)),
+ )
+ num_sum, denom_sum = 0, 0
+ for (
+ num,
+ denom,
+ ) in finding_progress_dict.values():
+ num_sum += num
+ denom_sum += denom
+ if denom_sum > 0:
+ pbar_pre.n = 100 * num_sum / denom_sum
+ else:
+ pbar_pre.n = 0
+ pbar_pre.set_description(f"Step 1b: finding motif(s) {motifs}")
+ pbar_pre.refresh()
+ elif load_fasta_match := re.search(load_fasta_regex, tail_buffer):
+ # If we get here we can be sure the pbars are initialized
+ pbar_pre = cast(tqdm, pbar_pre)
+ pbar_pre.n = 100 * int(load_fasta_match.group(1)) / 24
+ pbar_pre.set_description(f"Step 1a: reading {ref_genome.name}")
+ pbar_pre.refresh()
+ return region_parsing_started, in_contig_progress
diff --git a/dimelo/test/README.md b/dimelo/test/README.md
new file mode 100644
index 00000000..c59ae214
--- /dev/null
+++ b/dimelo/test/README.md
@@ -0,0 +1,25 @@
+# test
+
+This folder contains data and code to run basic tests on the `dimelo` package. `dimelo_test.py` currently contains all test code, and `data/test_targets/test_matrix.pickle` contains a mapping of test case kwargs to target values, including paths to files living in the same directory.
+
+New versions of `test_matrix.pickle` and the accompanying target files to which it refers can be generated using `generate_targets.py`, which itself can
+reference both existing results in `test_matrix.pickle` as well as a fresh set of test cases (i.e. kwarg combinations) defined in `cases.py`.
+
+## files
+
+`__init__.py` sets up a framework for running parsing, including downloading a reference genome and processing input files appropriately.
+
+`cases.py` contains a set of test cases which will be run through all tests in `dimelo_test.py`, and used to generate corresponding targets in `generate_targets.py`. The format schema contains kwargs by name in a dictionary for each test case: these are passed *directly* to the parse, load, plot, and export functions, simply filtering out un-needed ones. Formats must match the requirements of the functions that take these arguments.
+
+`dimelo_test.py` implements unit tests and integration tests using `pytest`. Tests are split into classes to handle temporary directories cleanly and separate different types of tests.
+
+`generate_targets.py` contains code to create the target outputs for the unit tests, ultimately creating a test_matrix.pkl file containing a pickled directionary of test kwargs and results. These should not be updated unless you confirm that any change in behavior is actually correct, i.e. if any test is failing make sure you know why before considering replacing the target values. *Special care should be taken for the `load_processed` outputs, which should not change with interface changes. If those outputs don't match and need to be regenerated, that is* ***reason for concern.*** However, updates to e.g. the .h5 single read format and corresponding changes to the `load_processed` methods may require making a new target .h5 file while `load_processed.binarized_read_from_hdf5` still returns the right array values and so on. Run `python generate_targets.py --help` for assistance with arguments to update only a subset of target value or test cases.
+
+The `data` folder contains .bam files and .bed files to use for testing. These files are also used by the tutorial.
+
+The `output` folder stores the reference genome and processed outputs. This folder is included in `.gitignore` so its contents should never be included in commits or merges.
+
+```
+# Ignore tutorial output files
+test/output
+```
\ No newline at end of file
diff --git a/dimelo/test/__init__.py b/dimelo/test/__init__.py
index 6c6fdcdd..7af71eda 100644
--- a/dimelo/test/__init__.py
+++ b/dimelo/test/__init__.py
@@ -1,24 +1,159 @@
+import gzip
+import subprocess
import tempfile
-import unittest
+import urllib
+from inspect import signature
from pathlib import Path
+import pysam
-class DiMeLoTestCase(unittest.TestCase):
+ref_genome_url = "https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chm13.draft_v1.0.fasta.gz"
+
+script_location = Path(__file__).resolve().parent
+output_dir = script_location / "output"
+ref_genome_gz = output_dir / "chm13.draft_v1.0.fasta.gz"
+ref_genome_fasta = output_dir / "chm13.draft_v1.0.fasta"
+
+data_dir = script_location / "data"
+bams_to_update = [data_dir / "ctcf_demo.sorted.bam"]
+
+"""
+The RelativePath class is a pickle-able Path class that will play nice with parse_bam.sanitize_path_inputs (using the __fspath__ function)
+and then can also be included in dicts and other objects for pickling. This way we can store input and output test files using relative
+paths but execute code using e.g. pytest, which operates in a different directory, and have the paths work by converting them to absolute
+paths. Clumsy maybe? Could be a better approach out there? Who knows. Not I.
+"""
+
+
+class RelativePath:
+ def __init__(self, path):
+ path = Path(path)
+ if path.is_absolute():
+ # Convert the absolute path to a relative path
+ try:
+ self.relative_path = path.relative_to(script_location)
+ except ValueError as e:
+ raise ValueError(
+ "The provided path is not in the dimelo/test directory."
+ ) from e
+ else:
+ self.relative_path = path
+ self._update_absolute_path()
+
+ def _update_absolute_path(self):
+ # Dynamically determine the absolute path based on relative path and the location of this file
+ self.base_path = Path(__file__).parent
+ self.absolute_path = (self.base_path / self.relative_path).resolve()
+
+ def __fspath__(self):
+ # Allows the object to be used by pathlib.Path and anything that accepts path-like objects
+ return str(self.absolute_path)
+
+ def __str__(self):
+ # Allows the object to be used by methods that want strings
+ return str(self.absolute_path)
+
+ def __getstate__(self):
+ # Returns the state to be pickled; i.e. just the relative path
+ return self.relative_path
+
+ def __setstate__(self, state):
+ # Restore state from the unpickled state; recreate the absolute path from the relative path
+ self.relative_path = state
+ self._update_absolute_path()
+
+
+def filter_kwargs_for_func(func, kwargs, extra_args=[]):
+ func_sig = signature(func)
+ allowed_args = set(list(func_sig.parameters) + extra_args)
+ filtered_kwargs = {k: v for k, v in kwargs.items() if k in allowed_args}
+ return filtered_kwargs
+
+
+def download_reference(force_redownload=False):
+ """
+ download the reference genome to enable downstream operations
+ """
+ if not output_dir.exists():
+ output_dir.mkdir()
+ if ref_genome_fasta.exists() and not force_redownload:
+ print("Reference genome already downloaded.")
+ return ref_genome_fasta
+ else:
+ urllib.request.urlretrieve(ref_genome_url, ref_genome_gz)
+
+ with (
+ gzip.open(ref_genome_gz, "rb") as gzip_file,
+ open(ref_genome_fasta, "wb") as output_file,
+ ):
+ for chunk in gzip_file:
+ output_file.write(chunk)
+
+ ref_genome_gz.unlink()
+ print("Reference genome downloaded and decompressed.")
+ return ref_genome_fasta
+
+
+def retag_bam(bam_path, force_retag=False):
+ """
+ use modkit to retag an input .bam file (to MM/ML vs mm/ml, with ?/. specified)
+ by default only retags if the .updated file doesn't already exist
+ """
+ bam_updated_path = (
+ bam_path.parent.parent
+ / "output"
+ / (Path(bam_path.stem).stem + ".updated" + bam_path.suffix)
+ )
+ bam_updated_index_path = (
+ bam_path.parent.parent
+ / "output"
+ / (Path(bam_path.stem).stem + ".updated" + bam_path.suffix + ".bai")
+ )
+ if (
+ bam_updated_path.exists()
+ and bam_updated_index_path.exists()
+ and not force_retag
+ ):
+ print("Input bam already retagged.")
+ return bam_updated_path
+ else:
+ subprocess.run(
+ [
+ "modkit",
+ "update-tags",
+ str(bam_path),
+ str(bam_updated_path),
+ "--mode",
+ "ambiguous",
+ ]
+ )
+ pysam.index(str(bam_updated_path))
+ return bam_updated_path
+
+
+class DiMeLoParsingTestCase:
"""
- TODO:
- - Should these be setUpClass/tearDownClass or setUp/tearDown? Is it okay that only one temporary directory is created each time?
+ This is the base class for any DiMeLo tests that need to parse data or create output files.
"""
@classmethod
- def setUpClass(cls):
+ def setup_class(cls):
+ # TODO: The existence of two variables with basically the same name is very confusing. Please clarify?
cls._outDir = tempfile.TemporaryDirectory()
cls.outDir = Path(cls._outDir.name)
+ cls.reference_genome = download_reference()
+ _ = [retag_bam(bam) for bam in bams_to_update]
@classmethod
- def tearDownClass(cls):
+ def teardown_class(cls):
cls._outDir.cleanup()
def assertOutputFileExists(self, file_name: Path):
"""Fails test if the given file name is not found in the output directory"""
+ """
+ TODO: There are a couple of things wrong here:
+ * This is never called anywhere; was this a holdover from my old infrastructure? Can it be deleted?
+ * mypy error: dimelo/test/__init__.py:157: error: "DiMeLoParsingTestCase" has no attribute "outDir" [attr-defined]
+ """
file_path = self.outDir / file_name
- self.assertTrue(file_path.exists(), msg=f"{file_path} does not exist")
+ assert file_path.exists(), f"{file_path} does not exist"
diff --git a/dimelo/test/cases.py b/dimelo/test/cases.py
new file mode 100644
index 00000000..402db58a
--- /dev/null
+++ b/dimelo/test/cases.py
@@ -0,0 +1,133 @@
+from pathlib import Path
+
+from dimelo.test import RelativePath
+
+"""
+This module contains the test cases that will be run through most dimelo_test submodules. To add new test cases, simple create new dict entries
+in test_matrix with descriptive names. Each dict entry must contain a tuple of two dicts, the first containing kwargs and the second containing
+test result targets. dimelo_test module run the applicable kwargs through different dimelo modules to test functionality, comparing results against
+those stored in the results entry. The output targets can be populated by running generate_targets.py.
+"""
+
+# Base input and output directories
+test_data_dir = Path("./data")
+output_dir = test_data_dir / "test_targets"
+
+region = "chr1:114357437-114359753" # this region is just one of the ones from the peaks bed file
+# 'chr1:9167177-9169177' # nothing is really different about this one; I swapped at one point while debugging
+
+# Paths to input files
+ctcf_bam_file = test_data_dir / "ctcf_demo.sorted.bam"
+ctcf_target_regions = RelativePath(test_data_dir / "ctcf_demo_peak.bed")
+ctcf_off_target_regions = RelativePath(test_data_dir / "ctcf_demo_not_peak.bed")
+
+ctcf_bam_file_updated = RelativePath("./output/ctcf_demo.updated.bam")
+output_dir = RelativePath(output_dir)
+
+test_matrix = {
+ "megalodon_peaks_190": (
+ # input kwargs
+ {
+ "input_file": ctcf_bam_file_updated,
+ "output_name": "megalodon_peaks_190",
+ "output_directory": output_dir,
+ "regions": [ctcf_target_regions, ctcf_off_target_regions],
+ "motifs": ["A,0", "CG,0"],
+ "thresh": 190,
+ "window_size": 5000,
+ "sort_by": ["read_start", "read_name", "motif"],
+ "smooth_window": 1,
+ "title": "megalodon_peaks_190",
+ "single_strand": False,
+ "regions_5to3prime": False,
+ "chunk_size": 1_000_000,
+ "cores": 1,
+ },
+ # outputs dict function:values
+ {}, # populated by generate_targets.py
+ ),
+ "megalodon_single_190": (
+ # input kwargs
+ {
+ "input_file": ctcf_bam_file_updated,
+ "output_name": "megalodon_single_190",
+ "output_directory": output_dir,
+ "regions": region,
+ "motifs": ["A,0", "CG,0"],
+ "thresh": 190,
+ "window_size": None,
+ "sort_by": ["read_start", "read_name", "motif"],
+ "smooth_window": 10,
+ "title": "megalodon_single_190",
+ "single_strand": False,
+ "regions_5to3prime": False,
+ "chunk_size": 100,
+ "cores": 2,
+ },
+ # outputs dict function:values
+ {}, # populated by generate_targets.py
+ ),
+ "megalodon_single_and_peaks_190": (
+ # input kwargs
+ {
+ "input_file": ctcf_bam_file_updated,
+ "output_name": "megalodon_single_and_peaks_190",
+ "output_directory": output_dir,
+ "regions": [region, ctcf_target_regions, ctcf_off_target_regions],
+ "motifs": ["A,0", "CG,0"],
+ "thresh": 190,
+ "window_size": 5000,
+ "sort_by": ["read_start", "read_name", "motif"],
+ "smooth_window": 100,
+ "title": "megalodon_single_and_peaks_190",
+ "single_strand": True,
+ "regions_5to3prime": True,
+ "chunk_size": 10_000,
+ "cores": 3,
+ },
+ # outputs dict function:values
+ {}, # populated by generate_targets.py
+ ),
+ "megalodon_peaks_nothresh": (
+ # input kwargs
+ {
+ "input_file": ctcf_bam_file_updated,
+ "output_name": "megalodon_peaks_nothresh",
+ "output_directory": output_dir,
+ "regions": [ctcf_target_regions, ctcf_off_target_regions],
+ "motifs": ["A,0", "CG,0"],
+ "thresh": None,
+ "window_size": 5000,
+ "sort_by": ["read_start", "read_name", "motif"],
+ "smooth_window": 100,
+ "title": "megalodon_peaks_nothresh",
+ "single_strand": True,
+ "regions_5to3prime": False,
+ "chunk_size": 1000,
+ "cores": 4,
+ },
+ # outputs dict function:values
+ {}, # populated by generate_targets.py
+ ),
+ "megalodon_single_nothresh": (
+ # input kwargs
+ {
+ "input_file": ctcf_bam_file_updated,
+ "output_name": "megalodon_single_nothresh",
+ "output_directory": output_dir,
+ "regions": region,
+ "motifs": ["A,0", "CG,0"],
+ "thresh": None,
+ "window_size": 5000,
+ "sort_by": ["read_start", "read_name", "motif"],
+ "smooth_window": 1,
+ "title": "megalodon_single_nothresh",
+ "single_strand": False,
+ "regions_5to3prime": True,
+ "chunk_size": 100,
+ "cores": None,
+ },
+ # outputs dict function:values
+ {}, # populated by generate_targets.py
+ ),
+}
diff --git a/dimelo/test/data/README.md b/dimelo/test/data/README.md
index e4a014d2..3872162f 100644
--- a/dimelo/test/data/README.md
+++ b/dimelo/test/data/README.md
@@ -1,7 +1,8 @@
# Test files
-## Both bams are free pA-Hia5 control in GM12878 cells
+## bams are free pA-Hia5 control in GM12878 cells
- Example bam from megalodon: mod_mappings_subset.bam
-- Example hybrid bam from winnowmap & guppy merge: winnowmap_guppy_merge_subset.bam
+- *REMOVED TO REDUCE OVERHEAD Example hybrid bam from winnowmap & guppy merge: winnowmap_guppy_merge_subset.bam. *
+*See old package version here to find these files: https://github.com/streetslab/dimelo/tree/7ff463273436d39ad4bf6dd0cbcc6c08cd4209cb/dimelo/test/data*
## Test files created using samtools subsample
diff --git a/dimelo/test/data/ctcf_demo_not_peak.bed b/dimelo/test/data/ctcf_demo_not_peak.bed
index 29d29f12..edfe6226 100644
--- a/dimelo/test/data/ctcf_demo_not_peak.bed
+++ b/dimelo/test/data/ctcf_demo_not_peak.bed
@@ -1,200 +1,200 @@
-chr14 44123158 44123308 + 465.63275
-chr14 44127026 44127176 + 465.63275
-chr14 44123159 44123308 - 465.63275
-chr14 44127026 44127175 - 465.63275
-chr1 114356587 114356736 - 441.00102
-chr1 114360454 114360603 - 441.00102
-chr1 114356586 114356736 + 441.00102
-chr1 114360454 114360604 + 441.00102
-chrX 9700334 9700484 - 418.31102
-chrX 9704202 9704352 - 418.31102
-chr15 54632158 54632308 + 410.28758
-chr15 54636060 54636176 + 410.28758
-chr15 54632159 54632308 - 410.28758
-chr15 54636060 54636175 - 410.28758
-chr15 54632192 54632308 - 410.28758
-chr15 54636060 54636210 - 410.28758
-chr15 54632193 54632308 + 410.28758
-chr15 54636060 54636209 + 410.28758
-chr6 53009844 53009994 + 398.10126
-chr6 53013712 53013862 + 398.10126
-chr16 4279536 4279624 + 394.95247
-chr16 4283404 4283554 + 394.95247
-chr16 4279474 4279624 + 394.95247
-chr16 4283404 4283490 + 394.95247
-chr16 4279537 4279624 - 394.95247
-chr16 4283404 4283553 - 394.95247
-chr7 152343847 152343935 - 390.20159
-chr7 152347715 152347865 - 390.20159
-chr7 152343848 152343935 + 390.20159
-chr7 152347715 152347864 + 390.20159
-chr7 152343786 152343935 + 390.20159
-chr7 152347715 152347802 + 390.20159
-chr7 152343785 152343935 - 390.20159
-chr7 152347715 152347803 - 390.20159
-chr16 63442391 63442541 - 389.524
-chr16 63446302 63446409 - 389.524
-chr16 63442392 63442541 + 389.524
-chr16 63446302 63446408 + 389.524
-chr16 63442436 63442541 - 389.524
-chr16 63446302 63446452 - 389.524
-chr6 41196658 41196741 - 388.86052
-chr6 41200525 41200674 - 388.86052
-chr6 41196657 41196741 + 388.86052
-chr6 41200525 41200675 + 388.86052
-chr6 41196592 41196741 + 388.86052
-chr6 41200525 41200608 + 388.86052
-chr6 41196591 41196741 - 388.86052
-chr6 41200525 41200609 - 388.86052
-chr20 36033923 36034073 + 384.11318
-chr20 36037791 36037941 + 384.11318
-chr20 36033924 36034073 - 384.11318
-chr20 36037791 36037940 - 384.11318
-chr12 57871837 57871987 - 380.55342
-chr12 57875753 57875855 - 380.55342
-chr12 57871885 57871987 - 380.55342
-chr12 57875753 57875903 - 380.55342
-chr8 22846344 22846494 + 372.97972
-chr8 22850296 22850362 + 372.97972
-chr8 22846345 22846494 - 372.97972
-chr8 22850296 22850361 - 372.97972
-chr8 22846428 22846494 + 372.97972
-chr8 22850296 22850446 + 372.97972
-chr17 45987626 45987736 + 370.32321
-chr17 45991494 45991644 + 370.32321
-chr17 45987627 45987736 - 370.32321
-chr17 45991494 45991643 - 370.32321
-chr17 45987587 45987736 - 370.32321
-chr17 45991494 45991603 - 370.32321
-chr17 45987586 45987736 + 370.32321
-chr17 45991494 45991604 + 370.32321
-chr3 48462689 48462778 + 367.84554
-chr3 48466556 48466705 + 367.84554
-chr3 48462688 48462778 - 367.84554
-chr3 48466556 48466706 - 367.84554
-chr3 48462628 48462778 - 367.84554
-chr3 48466556 48466646 - 367.84554
-chr20 33335286 33335348 - 367.5176
-chr20 33339154 33339304 - 367.5176
-chr20 33335198 33335348 - 367.5176
-chr20 33339154 33339216 - 367.5176
-chr20 33335199 33335348 + 367.5176
-chr20 33339154 33339215 + 367.5176
-chr1 150101535 150101649 + 365.77314
-chr1 150105402 150105551 + 365.77314
-chr1 150101534 150101649 - 365.77314
-chr1 150105402 150105552 - 365.77314
-chr1 150101499 150101649 + 365.77314
-chr1 150105402 150105517 + 365.77314
-chr17 42517551 42517701 - 359.33549
-chr17 42521481 42521569 - 359.33549
-chr17 42517613 42517701 - 359.33549
-chr17 42521481 42521631 - 359.33549
-chr17 42517552 42517701 + 359.33549
-chr17 42521481 42521568 + 359.33549
-chr6 15561986 15562057 - 358.9469
-chr6 15565853 15566002 - 358.9469
-chr6 15561907 15562057 + 358.9469
-chr6 15565853 15565925 + 358.9469
-chr6 15561908 15562057 - 358.9469
-chr6 15565853 15565924 - 358.9469
-chr6 15561985 15562057 + 358.9469
-chr6 15565853 15566003 + 358.9469
-chrX 99209453 99209603 - 358.41391
-chrX 99213321 99213471 - 358.41391
-chrX 99209454 99209603 + 358.41391
-chrX 99213321 99213470 + 358.41391
-chr2 44482827 44482977 + 357.20939
-chr2 44486695 44486845 + 357.20939
-chr19 43453207 43453357 - 355.83581
-chr19 43457120 43457225 - 355.83581
-chr19 43453253 43453357 + 355.83581
-chr19 43457120 43457269 + 355.83581
-chr19 43453252 43453357 - 355.83581
-chr19 43457120 43457270 - 355.83581
-chr17 44334564 44334713 - 354.9178
-chr17 44338431 44338580 - 354.9178
-chr17 44334563 44334713 + 354.9178
-chr17 44338431 44338581 + 354.9178
-chr5 138876711 138876861 + 352.26403
-chr5 138880579 138880729 + 352.26403
-chr17 82008681 82008700 - 351.42341
-chr17 82012549 82012699 - 351.42341
-chr17 82008550 82008700 - 351.42341
-chr17 82012549 82012568 - 351.42341
-chr17 82008682 82008700 + 351.42341
-chr17 82012549 82012698 + 351.42341
-chr15 38664851 38665000 + 350.08504
-chr15 38668760 38668867 + 350.08504
-chr15 38664850 38665000 - 350.08504
-chr15 38668760 38668868 - 350.08504
-chr15 38664892 38665000 - 350.08504
-chr15 38668760 38668910 - 350.08504
-chr15 38664893 38665000 + 350.08504
-chr15 38668760 38668909 + 350.08504
-chr1 9168135 9168218 + 349.08889
-chr1 9172003 9172153 + 349.08889
-chr1 9168136 9168218 - 349.08889
-chr1 9172003 9172152 - 349.08889
-chr1 9168068 9168218 + 349.08889
-chr1 9172003 9172086 + 349.08889
-chr9 145234697 145234810 - 348.5771
-chr9 145238565 145238715 - 348.5771
-chr9 145234661 145234810 - 348.5771
-chr9 145238565 145238677 - 348.5771
-chr9 145234660 145234810 + 348.5771
-chr9 145238565 145238678 + 348.5771
-chr6 73588200 73588289 + 345.89289
-chr6 73592068 73592218 + 345.89289
-chr6 73588139 73588289 - 345.89289
-chr6 73592068 73592157 - 345.89289
-chr19 41732348 41732498 - 345.26706
-chr19 41736216 41736366 - 345.26706
-chr19 41732349 41732498 + 345.26706
-chr19 41736216 41736365 + 345.26706
-chr20 59989038 59989135 + 345.12156
-chr20 59992905 59993054 + 345.12156
-chr20 59988985 59989135 - 345.12156
-chr20 59992905 59993003 - 345.12156
-chr20 59989037 59989135 - 345.12156
-chr20 59992905 59993055 - 345.12156
-chr21 24894312 24894462 - 344.78198
-chr21 24898180 24898330 - 344.78198
-chr21 24894313 24894462 + 344.78198
-chr21 24898180 24898329 + 344.78198
-chr14 17377274 17377423 - 341.73484
-chr14 17381209 17381290 - 341.73484
-chr14 17377273 17377423 + 341.73484
-chr14 17381209 17381291 + 341.73484
-chr14 17377341 17377423 + 341.73484
-chr14 17381209 17381359 + 341.73484
-chr19 44926037 44926186 - 341.02209
-chr19 44929904 44930053 - 341.02209
-chr19 44926036 44926186 + 341.02209
-chr19 44929904 44930054 + 341.02209
-chr5 111666908 111667058 + 339.94407
-chr5 111670776 111670926 + 339.94407
-chr20 40161013 40161163 - 339.91828
-chr20 40164881 40165031 - 339.91828
-chr20 40161014 40161163 + 339.91828
-chr20 40164881 40165030 + 339.91828
-chr16 61261359 61261509 + 338.57132
-chr16 61265306 61265377 + 338.57132
-chr16 61261360 61261509 - 338.57132
-chr16 61265306 61265376 - 338.57132
-chr16 61261438 61261509 + 338.57132
-chr16 61265306 61265456 + 338.57132
-chr16 61261439 61261509 - 338.57132
-chr16 61265306 61265455 - 338.57132
-chr6 155876025 155876076 - 336.32967
-chr6 155879892 155880041 - 336.32967
-chr6 155876024 155876076 + 336.32967
-chr6 155879892 155880042 + 336.32967
-chr6 155875926 155876076 - 336.32967
-chr6 155879892 155879942 - 336.32967
-chr1 224661531 224661681 - 334.48922
-chr1 224665451 224665547 - 334.48922
-chr1 224661583 224661681 + 334.48922
-chr1 224665451 224665601 + 334.48922
-chr1 224661584 224661681 - 334.48922
-chr1 224665451 224665600 - 334.48922
+chr14 44123158 44123308 + 465.63275 +
+chr14 44127026 44127176 + 465.63275 +
+chr14 44123159 44123308 - 465.63275 -
+chr14 44127026 44127175 - 465.63275 -
+chr1 114356587 114356736 - 441.00102 -
+chr1 114360454 114360603 - 441.00102 -
+chr1 114356586 114356736 + 441.00102 +
+chr1 114360454 114360604 + 441.00102 +
+chrX 9700334 9700484 - 418.31102 -
+chrX 9704202 9704352 - 418.31102 -
+chr15 54632158 54632308 + 410.28758 +
+chr15 54636060 54636176 + 410.28758 +
+chr15 54632159 54632308 - 410.28758 -
+chr15 54636060 54636175 - 410.28758 -
+chr15 54632192 54632308 - 410.28758 -
+chr15 54636060 54636210 - 410.28758 -
+chr15 54632193 54632308 + 410.28758 +
+chr15 54636060 54636209 + 410.28758 +
+chr6 53009844 53009994 + 398.10126 +
+chr6 53013712 53013862 + 398.10126 +
+chr16 4279536 4279624 + 394.95247 +
+chr16 4283404 4283554 + 394.95247 +
+chr16 4279474 4279624 + 394.95247 +
+chr16 4283404 4283490 + 394.95247 +
+chr16 4279537 4279624 - 394.95247 -
+chr16 4283404 4283553 - 394.95247 -
+chr7 152343847 152343935 - 390.20159 -
+chr7 152347715 152347865 - 390.20159 -
+chr7 152343848 152343935 + 390.20159 +
+chr7 152347715 152347864 + 390.20159 +
+chr7 152343786 152343935 + 390.20159 +
+chr7 152347715 152347802 + 390.20159 +
+chr7 152343785 152343935 - 390.20159 -
+chr7 152347715 152347803 - 390.20159 -
+chr16 63442391 63442541 - 389.524 -
+chr16 63446302 63446409 - 389.524 -
+chr16 63442392 63442541 + 389.524 +
+chr16 63446302 63446408 + 389.524 +
+chr16 63442436 63442541 - 389.524 -
+chr16 63446302 63446452 - 389.524 -
+chr6 41196658 41196741 - 388.86052 -
+chr6 41200525 41200674 - 388.86052 -
+chr6 41196657 41196741 + 388.86052 +
+chr6 41200525 41200675 + 388.86052 +
+chr6 41196592 41196741 + 388.86052 +
+chr6 41200525 41200608 + 388.86052 +
+chr6 41196591 41196741 - 388.86052 -
+chr6 41200525 41200609 - 388.86052 -
+chr20 36033923 36034073 + 384.11318 +
+chr20 36037791 36037941 + 384.11318 +
+chr20 36033924 36034073 - 384.11318 -
+chr20 36037791 36037940 - 384.11318 -
+chr12 57871837 57871987 - 380.55342 -
+chr12 57875753 57875855 - 380.55342 -
+chr12 57871885 57871987 - 380.55342 -
+chr12 57875753 57875903 - 380.55342 -
+chr8 22846344 22846494 + 372.97972 +
+chr8 22850296 22850362 + 372.97972 +
+chr8 22846345 22846494 - 372.97972 -
+chr8 22850296 22850361 - 372.97972 -
+chr8 22846428 22846494 + 372.97972 +
+chr8 22850296 22850446 + 372.97972 +
+chr17 45987626 45987736 + 370.32321 +
+chr17 45991494 45991644 + 370.32321 +
+chr17 45987627 45987736 - 370.32321 -
+chr17 45991494 45991643 - 370.32321 -
+chr17 45987587 45987736 - 370.32321 -
+chr17 45991494 45991603 - 370.32321 -
+chr17 45987586 45987736 + 370.32321 +
+chr17 45991494 45991604 + 370.32321 +
+chr3 48462689 48462778 + 367.84554 +
+chr3 48466556 48466705 + 367.84554 +
+chr3 48462688 48462778 - 367.84554 -
+chr3 48466556 48466706 - 367.84554 -
+chr3 48462628 48462778 - 367.84554 -
+chr3 48466556 48466646 - 367.84554 -
+chr20 33335286 33335348 - 367.5176 -
+chr20 33339154 33339304 - 367.5176 -
+chr20 33335198 33335348 - 367.5176 -
+chr20 33339154 33339216 - 367.5176 -
+chr20 33335199 33335348 + 367.5176 +
+chr20 33339154 33339215 + 367.5176 +
+chr1 150101535 150101649 + 365.77314 +
+chr1 150105402 150105551 + 365.77314 +
+chr1 150101534 150101649 - 365.77314 -
+chr1 150105402 150105552 - 365.77314 -
+chr1 150101499 150101649 + 365.77314 +
+chr1 150105402 150105517 + 365.77314 +
+chr17 42517551 42517701 - 359.33549 -
+chr17 42521481 42521569 - 359.33549 -
+chr17 42517613 42517701 - 359.33549 -
+chr17 42521481 42521631 - 359.33549 -
+chr17 42517552 42517701 + 359.33549 +
+chr17 42521481 42521568 + 359.33549 +
+chr6 15561986 15562057 - 358.9469 -
+chr6 15565853 15566002 - 358.9469 -
+chr6 15561907 15562057 + 358.9469 +
+chr6 15565853 15565925 + 358.9469 +
+chr6 15561908 15562057 - 358.9469 -
+chr6 15565853 15565924 - 358.9469 -
+chr6 15561985 15562057 + 358.9469 +
+chr6 15565853 15566003 + 358.9469 +
+chrX 99209453 99209603 - 358.41391 -
+chrX 99213321 99213471 - 358.41391 -
+chrX 99209454 99209603 + 358.41391 +
+chrX 99213321 99213470 + 358.41391 +
+chr2 44482827 44482977 + 357.20939 +
+chr2 44486695 44486845 + 357.20939 +
+chr19 43453207 43453357 - 355.83581 -
+chr19 43457120 43457225 - 355.83581 -
+chr19 43453253 43453357 + 355.83581 +
+chr19 43457120 43457269 + 355.83581 +
+chr19 43453252 43453357 - 355.83581 -
+chr19 43457120 43457270 - 355.83581 -
+chr17 44334564 44334713 - 354.9178 -
+chr17 44338431 44338580 - 354.9178 -
+chr17 44334563 44334713 + 354.9178 +
+chr17 44338431 44338581 + 354.9178 +
+chr5 138876711 138876861 + 352.26403 +
+chr5 138880579 138880729 + 352.26403 +
+chr17 82008681 82008700 - 351.42341 -
+chr17 82012549 82012699 - 351.42341 -
+chr17 82008550 82008700 - 351.42341 -
+chr17 82012549 82012568 - 351.42341 -
+chr17 82008682 82008700 + 351.42341 +
+chr17 82012549 82012698 + 351.42341 +
+chr15 38664851 38665000 + 350.08504 +
+chr15 38668760 38668867 + 350.08504 +
+chr15 38664850 38665000 - 350.08504 -
+chr15 38668760 38668868 - 350.08504 -
+chr15 38664892 38665000 - 350.08504 -
+chr15 38668760 38668910 - 350.08504 -
+chr15 38664893 38665000 + 350.08504 +
+chr15 38668760 38668909 + 350.08504 +
+chr1 9168135 9168218 + 349.08889 +
+chr1 9172003 9172153 + 349.08889 +
+chr1 9168136 9168218 - 349.08889 -
+chr1 9172003 9172152 - 349.08889 -
+chr1 9168068 9168218 + 349.08889 +
+chr1 9172003 9172086 + 349.08889 +
+chr9 145234697 145234810 - 348.5771 -
+chr9 145238565 145238715 - 348.5771 -
+chr9 145234661 145234810 - 348.5771 -
+chr9 145238565 145238677 - 348.5771 -
+chr9 145234660 145234810 + 348.5771 +
+chr9 145238565 145238678 + 348.5771 +
+chr6 73588200 73588289 + 345.89289 +
+chr6 73592068 73592218 + 345.89289 +
+chr6 73588139 73588289 - 345.89289 -
+chr6 73592068 73592157 - 345.89289 -
+chr19 41732348 41732498 - 345.26706 -
+chr19 41736216 41736366 - 345.26706 -
+chr19 41732349 41732498 + 345.26706 +
+chr19 41736216 41736365 + 345.26706 +
+chr20 59989038 59989135 + 345.12156 +
+chr20 59992905 59993054 + 345.12156 +
+chr20 59988985 59989135 - 345.12156 -
+chr20 59992905 59993003 - 345.12156 -
+chr20 59989037 59989135 - 345.12156 -
+chr20 59992905 59993055 - 345.12156 -
+chr21 24894312 24894462 - 344.78198 -
+chr21 24898180 24898330 - 344.78198 -
+chr21 24894313 24894462 + 344.78198 +
+chr21 24898180 24898329 + 344.78198 +
+chr14 17377274 17377423 - 341.73484 -
+chr14 17381209 17381290 - 341.73484 -
+chr14 17377273 17377423 + 341.73484 +
+chr14 17381209 17381291 + 341.73484 +
+chr14 17377341 17377423 + 341.73484 +
+chr14 17381209 17381359 + 341.73484 +
+chr19 44926037 44926186 - 341.02209 -
+chr19 44929904 44930053 - 341.02209 -
+chr19 44926036 44926186 + 341.02209 +
+chr19 44929904 44930054 + 341.02209 +
+chr5 111666908 111667058 + 339.94407 +
+chr5 111670776 111670926 + 339.94407 +
+chr20 40161013 40161163 - 339.91828 -
+chr20 40164881 40165031 - 339.91828 -
+chr20 40161014 40161163 + 339.91828 +
+chr20 40164881 40165030 + 339.91828 +
+chr16 61261359 61261509 + 338.57132 +
+chr16 61265306 61265377 + 338.57132 +
+chr16 61261360 61261509 - 338.57132 -
+chr16 61265306 61265376 - 338.57132 -
+chr16 61261438 61261509 + 338.57132 +
+chr16 61265306 61265456 + 338.57132 +
+chr16 61261439 61261509 - 338.57132 -
+chr16 61265306 61265455 - 338.57132 -
+chr6 155876025 155876076 - 336.32967 -
+chr6 155879892 155880041 - 336.32967 -
+chr6 155876024 155876076 + 336.32967 +
+chr6 155879892 155880042 + 336.32967 +
+chr6 155875926 155876076 - 336.32967 -
+chr6 155879892 155879942 - 336.32967 -
+chr1 224661531 224661681 - 334.48922 -
+chr1 224665451 224665547 - 334.48922 -
+chr1 224661583 224661681 + 334.48922 +
+chr1 224665451 224665601 + 334.48922 +
+chr1 224661584 224661681 - 334.48922 -
+chr1 224665451 224665600 - 334.48922 -
\ No newline at end of file
diff --git a/dimelo/test/data/ctcf_demo_peak.bed b/dimelo/test/data/ctcf_demo_peak.bed
index b5f1f460..279cd25f 100644
--- a/dimelo/test/data/ctcf_demo_peak.bed
+++ b/dimelo/test/data/ctcf_demo_peak.bed
@@ -1,100 +1,100 @@
-chr14 44125008 44125326 + 465.63275
-chr14 44125009 44125325 - 465.63275
-chr1 114358437 114358753 - 441.00102
-chr1 114358436 114358754 + 441.00102
-chrX 9702184 9702502 - 418.31102
-chr15 54634008 54634326 + 410.28758
-chr15 54634009 54634325 - 410.28758
-chr15 54634042 54634360 - 410.28758
-chr15 54634043 54634359 + 410.28758
-chr6 53011694 53012012 + 398.10126
-chr16 4281386 4281704 + 394.95247
-chr16 4281324 4281640 + 394.95247
-chr16 4281387 4281703 - 394.95247
-chr7 152345697 152346015 - 390.20159
-chr7 152345698 152346014 + 390.20159
-chr7 152345636 152345952 + 390.20159
-chr7 152345635 152345953 - 390.20159
-chr16 63444241 63444559 - 389.524
-chr16 63444242 63444558 + 389.524
-chr16 63444286 63444602 - 389.524
-chr6 41198508 41198824 - 388.86052
-chr6 41198507 41198825 + 388.86052
-chr6 41198442 41198758 + 388.86052
-chr6 41198441 41198759 - 388.86052
-chr20 36035773 36036091 + 384.11318
-chr20 36035774 36036090 - 384.11318
-chr12 57873687 57874005 - 380.55342
-chr12 57873735 57874053 - 380.55342
-chr8 22848194 22848512 + 372.97972
-chr8 22848195 22848511 - 372.97972
-chr8 22848278 22848596 + 372.97972
-chr17 45989476 45989794 + 370.32321
-chr17 45989477 45989793 - 370.32321
-chr17 45989437 45989753 - 370.32321
-chr17 45989436 45989754 + 370.32321
-chr3 48464539 48464855 + 367.84554
-chr3 48464538 48464856 - 367.84554
-chr3 48464478 48464796 - 367.84554
-chr20 33337136 33337454 - 367.5176
-chr20 33337048 33337366 - 367.5176
-chr20 33337049 33337365 + 367.5176
-chr1 150103385 150103701 + 365.77314
-chr1 150103384 150103702 - 365.77314
-chr1 150103349 150103667 + 365.77314
-chr17 42519401 42519719 - 359.33549
-chr17 42519463 42519781 - 359.33549
-chr17 42519402 42519718 + 359.33549
-chr6 15563836 15564152 - 358.9469
-chr6 15563757 15564075 + 358.9469
-chr6 15563758 15564074 - 358.9469
-chr6 15563835 15564153 + 358.9469
-chrX 99211303 99211621 - 358.41391
-chrX 99211304 99211620 + 358.41391
-chr2 44484677 44484995 + 357.20939
-chr19 43455057 43455375 - 355.83581
-chr19 43455103 43455419 + 355.83581
-chr19 43455102 43455420 - 355.83581
-chr17 44336414 44336730 - 354.9178
-chr17 44336413 44336731 + 354.9178
-chr5 138878561 138878879 + 352.26403
-chr17 82010531 82010849 - 351.42341
-chr17 82010400 82010718 - 351.42341
-chr17 82010532 82010848 + 351.42341
-chr15 38666701 38667017 + 350.08504
-chr15 38666700 38667018 - 350.08504
-chr15 38666742 38667060 - 350.08504
-chr15 38666743 38667059 + 350.08504
-chr1 9169985 9170303 + 349.08889
-chr1 9169986 9170302 - 349.08889
-chr1 9169918 9170236 + 349.08889
-chr9 145236547 145236865 - 348.5771
-chr9 145236511 145236827 - 348.5771
-chr9 145236510 145236828 + 348.5771
-chr6 73590050 73590368 + 345.89289
-chr6 73589989 73590307 - 345.89289
-chr19 41734198 41734516 - 345.26706
-chr19 41734199 41734515 + 345.26706
-chr20 59990888 59991204 + 345.12156
-chr20 59990835 59991153 - 345.12156
-chr20 59990887 59991205 - 345.12156
-chr21 24896162 24896480 - 344.78198
-chr21 24896163 24896479 + 344.78198
-chr14 17379124 17379440 - 341.73484
-chr14 17379123 17379441 + 341.73484
-chr14 17379191 17379509 + 341.73484
-chr19 44927887 44928203 - 341.02209
-chr19 44927886 44928204 + 341.02209
-chr5 111668758 111669076 + 339.94407
-chr20 40162863 40163181 - 339.91828
-chr20 40162864 40163180 + 339.91828
-chr16 61263209 61263527 + 338.57132
-chr16 61263210 61263526 - 338.57132
-chr16 61263288 61263606 + 338.57132
-chr16 61263289 61263605 - 338.57132
-chr6 155877875 155878191 - 336.32967
-chr6 155877874 155878192 + 336.32967
-chr6 155877776 155878092 - 336.32967
-chr1 224663381 224663697 - 334.48922
-chr1 224663433 224663751 + 334.48922
-chr1 224663434 224663750 - 334.48922
+chr14 44125008 44125326 + 465.63275 +
+chr14 44125009 44125325 - 465.63275 -
+chr1 114358437 114358753 - 441.00102 -
+chr1 114358436 114358754 + 441.00102 +
+chrX 9702184 9702502 - 418.31102 -
+chr15 54634008 54634326 + 410.28758 +
+chr15 54634009 54634325 - 410.28758 -
+chr15 54634042 54634360 - 410.28758 -
+chr15 54634043 54634359 + 410.28758 +
+chr6 53011694 53012012 + 398.10126 +
+chr16 4281386 4281704 + 394.95247 +
+chr16 4281324 4281640 + 394.95247 +
+chr16 4281387 4281703 - 394.95247 -
+chr7 152345697 152346015 - 390.20159 -
+chr7 152345698 152346014 + 390.20159 +
+chr7 152345636 152345952 + 390.20159 +
+chr7 152345635 152345953 - 390.20159 -
+chr16 63444241 63444559 - 389.524 -
+chr16 63444242 63444558 + 389.524 +
+chr16 63444286 63444602 - 389.524 -
+chr6 41198508 41198824 - 388.86052 -
+chr6 41198507 41198825 + 388.86052 +
+chr6 41198442 41198758 + 388.86052 +
+chr6 41198441 41198759 - 388.86052 -
+chr20 36035773 36036091 + 384.11318 +
+chr20 36035774 36036090 - 384.11318 -
+chr12 57873687 57874005 - 380.55342 -
+chr12 57873735 57874053 - 380.55342 -
+chr8 22848194 22848512 + 372.97972 +
+chr8 22848195 22848511 - 372.97972 -
+chr8 22848278 22848596 + 372.97972 +
+chr17 45989476 45989794 + 370.32321 +
+chr17 45989477 45989793 - 370.32321 -
+chr17 45989437 45989753 - 370.32321 -
+chr17 45989436 45989754 + 370.32321 +
+chr3 48464539 48464855 + 367.84554 +
+chr3 48464538 48464856 - 367.84554 -
+chr3 48464478 48464796 - 367.84554 -
+chr20 33337136 33337454 - 367.5176 -
+chr20 33337048 33337366 - 367.5176 -
+chr20 33337049 33337365 + 367.5176 +
+chr1 150103385 150103701 + 365.77314 +
+chr1 150103384 150103702 - 365.77314 -
+chr1 150103349 150103667 + 365.77314 +
+chr17 42519401 42519719 - 359.33549 -
+chr17 42519463 42519781 - 359.33549 -
+chr17 42519402 42519718 + 359.33549 +
+chr6 15563836 15564152 - 358.9469 -
+chr6 15563757 15564075 + 358.9469 +
+chr6 15563758 15564074 - 358.9469 -
+chr6 15563835 15564153 + 358.9469 +
+chrX 99211303 99211621 - 358.41391 -
+chrX 99211304 99211620 + 358.41391 +
+chr2 44484677 44484995 + 357.20939 +
+chr19 43455057 43455375 - 355.83581 -
+chr19 43455103 43455419 + 355.83581 +
+chr19 43455102 43455420 - 355.83581 -
+chr17 44336414 44336730 - 354.9178 -
+chr17 44336413 44336731 + 354.9178 +
+chr5 138878561 138878879 + 352.26403 +
+chr17 82010531 82010849 - 351.42341 -
+chr17 82010400 82010718 - 351.42341 -
+chr17 82010532 82010848 + 351.42341 +
+chr15 38666701 38667017 + 350.08504 +
+chr15 38666700 38667018 - 350.08504 -
+chr15 38666742 38667060 - 350.08504 -
+chr15 38666743 38667059 + 350.08504 +
+chr1 9169985 9170303 + 349.08889 +
+chr1 9169986 9170302 - 349.08889 -
+chr1 9169918 9170236 + 349.08889 +
+chr9 145236547 145236865 - 348.5771 -
+chr9 145236511 145236827 - 348.5771 -
+chr9 145236510 145236828 + 348.5771 +
+chr6 73590050 73590368 + 345.89289 +
+chr6 73589989 73590307 - 345.89289 -
+chr19 41734198 41734516 - 345.26706 -
+chr19 41734199 41734515 + 345.26706 +
+chr20 59990888 59991204 + 345.12156 +
+chr20 59990835 59991153 - 345.12156 -
+chr20 59990887 59991205 - 345.12156 -
+chr21 24896162 24896480 - 344.78198 -
+chr21 24896163 24896479 + 344.78198 +
+chr14 17379124 17379440 - 341.73484 -
+chr14 17379123 17379441 + 341.73484 +
+chr14 17379191 17379509 + 341.73484 +
+chr19 44927887 44928203 - 341.02209 -
+chr19 44927886 44928204 + 341.02209 +
+chr5 111668758 111669076 + 339.94407 +
+chr20 40162863 40163181 - 339.91828 -
+chr20 40162864 40163180 + 339.91828 +
+chr16 61263209 61263527 + 338.57132 +
+chr16 61263210 61263526 - 338.57132 -
+chr16 61263288 61263606 + 338.57132 +
+chr16 61263289 61263605 - 338.57132 -
+chr6 155877875 155878191 - 336.32967 -
+chr6 155877874 155878192 + 336.32967 +
+chr6 155877776 155878092 - 336.32967 -
+chr1 224663381 224663697 - 334.48922 -
+chr1 224663433 224663751 + 334.48922 +
+chr1 224663434 224663750 - 334.48922 -
\ No newline at end of file
diff --git a/dimelo/test/data/test_targets/megalodon_peaks_190/pileup.sorted.bed.gz b/dimelo/test/data/test_targets/megalodon_peaks_190/pileup.sorted.bed.gz
new file mode 100644
index 00000000..5d20e1ee
Binary files /dev/null and b/dimelo/test/data/test_targets/megalodon_peaks_190/pileup.sorted.bed.gz differ
diff --git a/dimelo/test/data/test_targets/megalodon_peaks_190/pileup.sorted.bed.gz.tbi b/dimelo/test/data/test_targets/megalodon_peaks_190/pileup.sorted.bed.gz.tbi
new file mode 100644
index 00000000..57239118
Binary files /dev/null and b/dimelo/test/data/test_targets/megalodon_peaks_190/pileup.sorted.bed.gz.tbi differ
diff --git a/dimelo/test/data/test_targets/megalodon_peaks_190/reads.combined_basemods.h5 b/dimelo/test/data/test_targets/megalodon_peaks_190/reads.combined_basemods.h5
new file mode 100644
index 00000000..919ca88e
Binary files /dev/null and b/dimelo/test/data/test_targets/megalodon_peaks_190/reads.combined_basemods.h5 differ
diff --git a/dimelo/test/data/test_targets/megalodon_peaks_190/regions.processed.bed b/dimelo/test/data/test_targets/megalodon_peaks_190/regions.processed.bed
new file mode 100644
index 00000000..c1f808c2
--- /dev/null
+++ b/dimelo/test/data/test_targets/megalodon_peaks_190/regions.processed.bed
@@ -0,0 +1,300 @@
+chr14 17372348 17382348 . . .
+chr14 17372348 17382348 . . .
+chr14 17372382 17382382 . . .
+chr14 17374282 17384282 . . .
+chr14 17374282 17384282 . . .
+chr14 17374350 17384350 . . .
+chr14 17376249 17386249 . . .
+chr14 17376250 17386250 . . .
+chr14 17376284 17386284 . . .
+chr14 44118233 44128233 . . .
+chr14 44118233 44128233 . . .
+chr14 44120167 44130167 . . .
+chr14 44120167 44130167 . . .
+chr14 44122100 44132100 . . .
+chr14 44122101 44132101 . . .
+chr1 9163143 9173143 . . .
+chr1 9163176 9173176 . . .
+chr1 9163177 9173177 . . .
+chr1 9165077 9175077 . . .
+chr1 9165144 9175144 . . .
+chr1 9165144 9175144 . . .
+chr1 9167044 9177044 . . .
+chr1 9167077 9177077 . . .
+chr1 9167078 9177078 . . .
+chr1 114351661 114361661 . . .
+chr1 114351661 114361661 . . .
+chr1 114353595 114363595 . . .
+chr1 114353595 114363595 . . .
+chr1 114355528 114365528 . . .
+chr1 114355529 114365529 . . .
+chr1 150096574 150106574 . . .
+chr1 150096591 150106591 . . .
+chr1 150096592 150106592 . . .
+chr1 150098508 150108508 . . .
+chr1 150098543 150108543 . . .
+chr1 150098543 150108543 . . .
+chr1 150100459 150110459 . . .
+chr1 150100476 150110476 . . .
+chr1 150100477 150110477 . . .
+chr1 224656606 224666606 . . .
+chr1 224656632 224666632 . . .
+chr1 224656632 224666632 . . .
+chr1 224658539 224668539 . . .
+chr1 224658592 224668592 . . .
+chr1 224658592 224668592 . . .
+chr1 224660499 224670499 . . .
+chr1 224660525 224670525 . . .
+chr1 224660526 224670526 . . .
+chrX 9695409 9705409 . . .
+chrX 9697343 9707343 . . .
+chrX 9699277 9709277 . . .
+chrX 99204528 99214528 . . .
+chrX 99204528 99214528 . . .
+chrX 99206462 99216462 . . .
+chrX 99206462 99216462 . . .
+chrX 99208395 99218395 . . .
+chrX 99208396 99218396 . . .
+chr15 38659925 38669925 . . .
+chr15 38659925 38669925 . . .
+chr15 38659946 38669946 . . .
+chr15 38659946 38669946 . . .
+chr15 38661859 38671859 . . .
+chr15 38661859 38671859 . . .
+chr15 38661901 38671901 . . .
+chr15 38661901 38671901 . . .
+chr15 38663813 38673813 . . .
+chr15 38663814 38673814 . . .
+chr15 38663834 38673834 . . .
+chr15 38663835 38673835 . . .
+chr15 54627233 54637233 . . .
+chr15 54627233 54637233 . . .
+chr15 54627250 54637250 . . .
+chr15 54627250 54637250 . . .
+chr15 54629167 54639167 . . .
+chr15 54629167 54639167 . . .
+chr15 54629201 54639201 . . .
+chr15 54629201 54639201 . . .
+chr15 54631117 54641117 . . .
+chr15 54631118 54641118 . . .
+chr15 54631134 54641134 . . .
+chr15 54631135 54641135 . . .
+chr6 15556982 15566982 . . .
+chr6 15556982 15566982 . . .
+chr6 15557021 15567021 . . .
+chr6 15557021 15567021 . . .
+chr6 15558916 15568916 . . .
+chr6 15558916 15568916 . . .
+chr6 15558994 15568994 . . .
+chr6 15558994 15568994 . . .
+chr6 15560888 15570888 . . .
+chr6 15560889 15570889 . . .
+chr6 15560927 15570927 . . .
+chr6 15560928 15570928 . . .
+chr6 41191666 41201666 . . .
+chr6 41191666 41201666 . . .
+chr6 41191699 41201699 . . .
+chr6 41191699 41201699 . . .
+chr6 41193600 41203600 . . .
+chr6 41193600 41203600 . . .
+chr6 41193666 41203666 . . .
+chr6 41193666 41203666 . . .
+chr6 41195566 41205566 . . .
+chr6 41195567 41205567 . . .
+chr6 41195599 41205599 . . .
+chr6 41195600 41205600 . . .
+chr6 53004919 53014919 . . .
+chr6 53006853 53016853 . . .
+chr6 53008787 53018787 . . .
+chr6 73583214 73593214 . . .
+chr6 73583244 73593244 . . .
+chr6 73585148 73595148 . . .
+chr6 73585209 73595209 . . .
+chr6 73587112 73597112 . . .
+chr6 73587143 73597143 . . .
+chr6 155871001 155881001 . . .
+chr6 155871050 155881050 . . .
+chr6 155871050 155881050 . . .
+chr6 155872934 155882934 . . .
+chr6 155873033 155883033 . . .
+chr6 155873033 155883033 . . .
+chr6 155874917 155884917 . . .
+chr6 155874966 155884966 . . .
+chr6 155874967 155884967 . . .
+chr16 4274549 4284549 . . .
+chr16 4274580 4284580 . . .
+chr16 4274580 4284580 . . .
+chr16 4276482 4286482 . . .
+chr16 4276545 4286545 . . .
+chr16 4276545 4286545 . . .
+chr16 4278447 4288447 . . .
+chr16 4278478 4288478 . . .
+chr16 4278479 4288479 . . .
+chr16 61256434 61266434 . . .
+chr16 61256434 61266434 . . .
+chr16 61256473 61266473 . . .
+chr16 61256474 61266474 . . .
+chr16 61258368 61268368 . . .
+chr16 61258368 61268368 . . .
+chr16 61258447 61268447 . . .
+chr16 61258447 61268447 . . .
+chr16 61260341 61270341 . . .
+chr16 61260341 61270341 . . .
+chr16 61260380 61270380 . . .
+chr16 61260381 61270381 . . .
+chr16 63437466 63447466 . . .
+chr16 63437466 63447466 . . .
+chr16 63437488 63447488 . . .
+chr16 63439400 63449400 . . .
+chr16 63439400 63449400 . . .
+chr16 63439444 63449444 . . .
+chr16 63441355 63451355 . . .
+chr16 63441355 63451355 . . .
+chr16 63441377 63451377 . . .
+chr7 152338860 152348860 . . .
+chr7 152338860 152348860 . . .
+chr7 152338891 152348891 . . .
+chr7 152338891 152348891 . . .
+chr7 152340794 152350794 . . .
+chr7 152340794 152350794 . . .
+chr7 152340856 152350856 . . .
+chr7 152340856 152350856 . . .
+chr7 152342758 152352758 . . .
+chr7 152342759 152352759 . . .
+chr7 152342789 152352789 . . .
+chr7 152342790 152352790 . . .
+chr20 33330273 33340273 . . .
+chr20 33330273 33340273 . . .
+chr20 33330317 33340317 . . .
+chr20 33332207 33342207 . . .
+chr20 33332207 33342207 . . .
+chr20 33332295 33342295 . . .
+chr20 33334184 33344184 . . .
+chr20 33334185 33344185 . . .
+chr20 33334229 33344229 . . .
+chr20 36028998 36038998 . . .
+chr20 36028998 36038998 . . .
+chr20 36030932 36040932 . . .
+chr20 36030932 36040932 . . .
+chr20 36032865 36042865 . . .
+chr20 36032866 36042866 . . .
+chr20 40156088 40166088 . . .
+chr20 40156088 40166088 . . .
+chr20 40158022 40168022 . . .
+chr20 40158022 40168022 . . .
+chr20 40159955 40169955 . . .
+chr20 40159956 40169956 . . .
+chr20 59984060 59994060 . . .
+chr20 59984086 59994086 . . .
+chr20 59984086 59994086 . . .
+chr20 59985994 59995994 . . .
+chr20 59986046 59996046 . . .
+chr20 59986046 59996046 . . .
+chr20 59987954 59997954 . . .
+chr20 59987979 59997979 . . .
+chr20 59987980 59997980 . . .
+chr12 57866912 57876912 . . .
+chr12 57866936 57876936 . . .
+chr12 57868846 57878846 . . .
+chr12 57868894 57878894 . . .
+chr12 57870804 57880804 . . .
+chr12 57870828 57880828 . . .
+chr8 22841419 22851419 . . .
+chr8 22841419 22851419 . . .
+chr8 22841461 22851461 . . .
+chr8 22843353 22853353 . . .
+chr8 22843353 22853353 . . .
+chr8 22843437 22853437 . . .
+chr8 22845328 22855328 . . .
+chr8 22845329 22855329 . . .
+chr8 22845371 22855371 . . .
+chr17 42512626 42522626 . . .
+chr17 42512626 42522626 . . .
+chr17 42512657 42522657 . . .
+chr17 42514560 42524560 . . .
+chr17 42514560 42524560 . . .
+chr17 42514622 42524622 . . .
+chr17 42516524 42526524 . . .
+chr17 42516525 42526525 . . .
+chr17 42516556 42526556 . . .
+chr17 44329638 44339638 . . .
+chr17 44329638 44339638 . . .
+chr17 44331572 44341572 . . .
+chr17 44331572 44341572 . . .
+chr17 44333505 44343505 . . .
+chr17 44333506 44343506 . . .
+chr17 45982661 45992661 . . .
+chr17 45982661 45992661 . . .
+chr17 45982681 45992681 . . .
+chr17 45982681 45992681 . . .
+chr17 45984595 45994595 . . .
+chr17 45984595 45994595 . . .
+chr17 45984635 45994635 . . .
+chr17 45984635 45994635 . . .
+chr17 45986548 45996548 . . .
+chr17 45986549 45996549 . . .
+chr17 45986568 45996568 . . .
+chr17 45986569 45996569 . . .
+chr17 82003625 82013625 . . .
+chr17 82003690 82013690 . . .
+chr17 82003691 82013691 . . .
+chr17 82005559 82015559 . . .
+chr17 82005690 82015690 . . .
+chr17 82005690 82015690 . . .
+chr17 82007558 82017558 . . .
+chr17 82007623 82017623 . . .
+chr17 82007624 82017624 . . .
+chr3 48457703 48467703 . . .
+chr3 48457733 48467733 . . .
+chr3 48457733 48467733 . . .
+chr3 48459637 48469637 . . .
+chr3 48459697 48469697 . . .
+chr3 48459697 48469697 . . .
+chr3 48461601 48471601 . . .
+chr3 48461630 48471630 . . .
+chr3 48461631 48471631 . . .
+chr2 44477902 44487902 . . .
+chr2 44479836 44489836 . . .
+chr2 44481770 44491770 . . .
+chr19 41727423 41737423 . . .
+chr19 41727423 41737423 . . .
+chr19 41729357 41739357 . . .
+chr19 41729357 41739357 . . .
+chr19 41731290 41741290 . . .
+chr19 41731291 41741291 . . .
+chr19 43448282 43458282 . . .
+chr19 43448304 43458304 . . .
+chr19 43448305 43458305 . . .
+chr19 43450216 43460216 . . .
+chr19 43450261 43460261 . . .
+chr19 43450261 43460261 . . .
+chr19 43452172 43462172 . . .
+chr19 43452194 43462194 . . .
+chr19 43452195 43462195 . . .
+chr19 44921111 44931111 . . .
+chr19 44921111 44931111 . . .
+chr19 44923045 44933045 . . .
+chr19 44923045 44933045 . . .
+chr19 44924978 44934978 . . .
+chr19 44924979 44934979 . . .
+chr5 111661983 111671983 . . .
+chr5 111663917 111673917 . . .
+chr5 111665851 111675851 . . .
+chr5 138871786 138881786 . . .
+chr5 138873720 138883720 . . .
+chr5 138875654 138885654 . . .
+chr9 145229735 145239735 . . .
+chr9 145229735 145239735 . . .
+chr9 145229753 145239753 . . .
+chr9 145231669 145241669 . . .
+chr9 145231669 145241669 . . .
+chr9 145231706 145241706 . . .
+chr9 145233621 145243621 . . .
+chr9 145233621 145243621 . . .
+chr9 145233640 145243640 . . .
+chr21 24889387 24899387 . . .
+chr21 24889387 24899387 . . .
+chr21 24891321 24901321 . . .
+chr21 24891321 24901321 . . .
+chr21 24893254 24903254 . . .
+chr21 24893255 24903255 . . .
diff --git a/dimelo/test/data/test_targets/megalodon_peaks_nothresh/pileup.sorted.bed.gz b/dimelo/test/data/test_targets/megalodon_peaks_nothresh/pileup.sorted.bed.gz
new file mode 100644
index 00000000..1cdb265a
Binary files /dev/null and b/dimelo/test/data/test_targets/megalodon_peaks_nothresh/pileup.sorted.bed.gz differ
diff --git a/dimelo/test/data/test_targets/megalodon_peaks_nothresh/pileup.sorted.bed.gz.tbi b/dimelo/test/data/test_targets/megalodon_peaks_nothresh/pileup.sorted.bed.gz.tbi
new file mode 100644
index 00000000..e9d81ca3
Binary files /dev/null and b/dimelo/test/data/test_targets/megalodon_peaks_nothresh/pileup.sorted.bed.gz.tbi differ
diff --git a/dimelo/test/data/test_targets/megalodon_peaks_nothresh/reads.combined_basemods.h5 b/dimelo/test/data/test_targets/megalodon_peaks_nothresh/reads.combined_basemods.h5
new file mode 100644
index 00000000..0a784b5b
Binary files /dev/null and b/dimelo/test/data/test_targets/megalodon_peaks_nothresh/reads.combined_basemods.h5 differ
diff --git a/dimelo/test/data/test_targets/megalodon_peaks_nothresh/regions.processed.bed b/dimelo/test/data/test_targets/megalodon_peaks_nothresh/regions.processed.bed
new file mode 100644
index 00000000..c1f808c2
--- /dev/null
+++ b/dimelo/test/data/test_targets/megalodon_peaks_nothresh/regions.processed.bed
@@ -0,0 +1,300 @@
+chr14 17372348 17382348 . . .
+chr14 17372348 17382348 . . .
+chr14 17372382 17382382 . . .
+chr14 17374282 17384282 . . .
+chr14 17374282 17384282 . . .
+chr14 17374350 17384350 . . .
+chr14 17376249 17386249 . . .
+chr14 17376250 17386250 . . .
+chr14 17376284 17386284 . . .
+chr14 44118233 44128233 . . .
+chr14 44118233 44128233 . . .
+chr14 44120167 44130167 . . .
+chr14 44120167 44130167 . . .
+chr14 44122100 44132100 . . .
+chr14 44122101 44132101 . . .
+chr1 9163143 9173143 . . .
+chr1 9163176 9173176 . . .
+chr1 9163177 9173177 . . .
+chr1 9165077 9175077 . . .
+chr1 9165144 9175144 . . .
+chr1 9165144 9175144 . . .
+chr1 9167044 9177044 . . .
+chr1 9167077 9177077 . . .
+chr1 9167078 9177078 . . .
+chr1 114351661 114361661 . . .
+chr1 114351661 114361661 . . .
+chr1 114353595 114363595 . . .
+chr1 114353595 114363595 . . .
+chr1 114355528 114365528 . . .
+chr1 114355529 114365529 . . .
+chr1 150096574 150106574 . . .
+chr1 150096591 150106591 . . .
+chr1 150096592 150106592 . . .
+chr1 150098508 150108508 . . .
+chr1 150098543 150108543 . . .
+chr1 150098543 150108543 . . .
+chr1 150100459 150110459 . . .
+chr1 150100476 150110476 . . .
+chr1 150100477 150110477 . . .
+chr1 224656606 224666606 . . .
+chr1 224656632 224666632 . . .
+chr1 224656632 224666632 . . .
+chr1 224658539 224668539 . . .
+chr1 224658592 224668592 . . .
+chr1 224658592 224668592 . . .
+chr1 224660499 224670499 . . .
+chr1 224660525 224670525 . . .
+chr1 224660526 224670526 . . .
+chrX 9695409 9705409 . . .
+chrX 9697343 9707343 . . .
+chrX 9699277 9709277 . . .
+chrX 99204528 99214528 . . .
+chrX 99204528 99214528 . . .
+chrX 99206462 99216462 . . .
+chrX 99206462 99216462 . . .
+chrX 99208395 99218395 . . .
+chrX 99208396 99218396 . . .
+chr15 38659925 38669925 . . .
+chr15 38659925 38669925 . . .
+chr15 38659946 38669946 . . .
+chr15 38659946 38669946 . . .
+chr15 38661859 38671859 . . .
+chr15 38661859 38671859 . . .
+chr15 38661901 38671901 . . .
+chr15 38661901 38671901 . . .
+chr15 38663813 38673813 . . .
+chr15 38663814 38673814 . . .
+chr15 38663834 38673834 . . .
+chr15 38663835 38673835 . . .
+chr15 54627233 54637233 . . .
+chr15 54627233 54637233 . . .
+chr15 54627250 54637250 . . .
+chr15 54627250 54637250 . . .
+chr15 54629167 54639167 . . .
+chr15 54629167 54639167 . . .
+chr15 54629201 54639201 . . .
+chr15 54629201 54639201 . . .
+chr15 54631117 54641117 . . .
+chr15 54631118 54641118 . . .
+chr15 54631134 54641134 . . .
+chr15 54631135 54641135 . . .
+chr6 15556982 15566982 . . .
+chr6 15556982 15566982 . . .
+chr6 15557021 15567021 . . .
+chr6 15557021 15567021 . . .
+chr6 15558916 15568916 . . .
+chr6 15558916 15568916 . . .
+chr6 15558994 15568994 . . .
+chr6 15558994 15568994 . . .
+chr6 15560888 15570888 . . .
+chr6 15560889 15570889 . . .
+chr6 15560927 15570927 . . .
+chr6 15560928 15570928 . . .
+chr6 41191666 41201666 . . .
+chr6 41191666 41201666 . . .
+chr6 41191699 41201699 . . .
+chr6 41191699 41201699 . . .
+chr6 41193600 41203600 . . .
+chr6 41193600 41203600 . . .
+chr6 41193666 41203666 . . .
+chr6 41193666 41203666 . . .
+chr6 41195566 41205566 . . .
+chr6 41195567 41205567 . . .
+chr6 41195599 41205599 . . .
+chr6 41195600 41205600 . . .
+chr6 53004919 53014919 . . .
+chr6 53006853 53016853 . . .
+chr6 53008787 53018787 . . .
+chr6 73583214 73593214 . . .
+chr6 73583244 73593244 . . .
+chr6 73585148 73595148 . . .
+chr6 73585209 73595209 . . .
+chr6 73587112 73597112 . . .
+chr6 73587143 73597143 . . .
+chr6 155871001 155881001 . . .
+chr6 155871050 155881050 . . .
+chr6 155871050 155881050 . . .
+chr6 155872934 155882934 . . .
+chr6 155873033 155883033 . . .
+chr6 155873033 155883033 . . .
+chr6 155874917 155884917 . . .
+chr6 155874966 155884966 . . .
+chr6 155874967 155884967 . . .
+chr16 4274549 4284549 . . .
+chr16 4274580 4284580 . . .
+chr16 4274580 4284580 . . .
+chr16 4276482 4286482 . . .
+chr16 4276545 4286545 . . .
+chr16 4276545 4286545 . . .
+chr16 4278447 4288447 . . .
+chr16 4278478 4288478 . . .
+chr16 4278479 4288479 . . .
+chr16 61256434 61266434 . . .
+chr16 61256434 61266434 . . .
+chr16 61256473 61266473 . . .
+chr16 61256474 61266474 . . .
+chr16 61258368 61268368 . . .
+chr16 61258368 61268368 . . .
+chr16 61258447 61268447 . . .
+chr16 61258447 61268447 . . .
+chr16 61260341 61270341 . . .
+chr16 61260341 61270341 . . .
+chr16 61260380 61270380 . . .
+chr16 61260381 61270381 . . .
+chr16 63437466 63447466 . . .
+chr16 63437466 63447466 . . .
+chr16 63437488 63447488 . . .
+chr16 63439400 63449400 . . .
+chr16 63439400 63449400 . . .
+chr16 63439444 63449444 . . .
+chr16 63441355 63451355 . . .
+chr16 63441355 63451355 . . .
+chr16 63441377 63451377 . . .
+chr7 152338860 152348860 . . .
+chr7 152338860 152348860 . . .
+chr7 152338891 152348891 . . .
+chr7 152338891 152348891 . . .
+chr7 152340794 152350794 . . .
+chr7 152340794 152350794 . . .
+chr7 152340856 152350856 . . .
+chr7 152340856 152350856 . . .
+chr7 152342758 152352758 . . .
+chr7 152342759 152352759 . . .
+chr7 152342789 152352789 . . .
+chr7 152342790 152352790 . . .
+chr20 33330273 33340273 . . .
+chr20 33330273 33340273 . . .
+chr20 33330317 33340317 . . .
+chr20 33332207 33342207 . . .
+chr20 33332207 33342207 . . .
+chr20 33332295 33342295 . . .
+chr20 33334184 33344184 . . .
+chr20 33334185 33344185 . . .
+chr20 33334229 33344229 . . .
+chr20 36028998 36038998 . . .
+chr20 36028998 36038998 . . .
+chr20 36030932 36040932 . . .
+chr20 36030932 36040932 . . .
+chr20 36032865 36042865 . . .
+chr20 36032866 36042866 . . .
+chr20 40156088 40166088 . . .
+chr20 40156088 40166088 . . .
+chr20 40158022 40168022 . . .
+chr20 40158022 40168022 . . .
+chr20 40159955 40169955 . . .
+chr20 40159956 40169956 . . .
+chr20 59984060 59994060 . . .
+chr20 59984086 59994086 . . .
+chr20 59984086 59994086 . . .
+chr20 59985994 59995994 . . .
+chr20 59986046 59996046 . . .
+chr20 59986046 59996046 . . .
+chr20 59987954 59997954 . . .
+chr20 59987979 59997979 . . .
+chr20 59987980 59997980 . . .
+chr12 57866912 57876912 . . .
+chr12 57866936 57876936 . . .
+chr12 57868846 57878846 . . .
+chr12 57868894 57878894 . . .
+chr12 57870804 57880804 . . .
+chr12 57870828 57880828 . . .
+chr8 22841419 22851419 . . .
+chr8 22841419 22851419 . . .
+chr8 22841461 22851461 . . .
+chr8 22843353 22853353 . . .
+chr8 22843353 22853353 . . .
+chr8 22843437 22853437 . . .
+chr8 22845328 22855328 . . .
+chr8 22845329 22855329 . . .
+chr8 22845371 22855371 . . .
+chr17 42512626 42522626 . . .
+chr17 42512626 42522626 . . .
+chr17 42512657 42522657 . . .
+chr17 42514560 42524560 . . .
+chr17 42514560 42524560 . . .
+chr17 42514622 42524622 . . .
+chr17 42516524 42526524 . . .
+chr17 42516525 42526525 . . .
+chr17 42516556 42526556 . . .
+chr17 44329638 44339638 . . .
+chr17 44329638 44339638 . . .
+chr17 44331572 44341572 . . .
+chr17 44331572 44341572 . . .
+chr17 44333505 44343505 . . .
+chr17 44333506 44343506 . . .
+chr17 45982661 45992661 . . .
+chr17 45982661 45992661 . . .
+chr17 45982681 45992681 . . .
+chr17 45982681 45992681 . . .
+chr17 45984595 45994595 . . .
+chr17 45984595 45994595 . . .
+chr17 45984635 45994635 . . .
+chr17 45984635 45994635 . . .
+chr17 45986548 45996548 . . .
+chr17 45986549 45996549 . . .
+chr17 45986568 45996568 . . .
+chr17 45986569 45996569 . . .
+chr17 82003625 82013625 . . .
+chr17 82003690 82013690 . . .
+chr17 82003691 82013691 . . .
+chr17 82005559 82015559 . . .
+chr17 82005690 82015690 . . .
+chr17 82005690 82015690 . . .
+chr17 82007558 82017558 . . .
+chr17 82007623 82017623 . . .
+chr17 82007624 82017624 . . .
+chr3 48457703 48467703 . . .
+chr3 48457733 48467733 . . .
+chr3 48457733 48467733 . . .
+chr3 48459637 48469637 . . .
+chr3 48459697 48469697 . . .
+chr3 48459697 48469697 . . .
+chr3 48461601 48471601 . . .
+chr3 48461630 48471630 . . .
+chr3 48461631 48471631 . . .
+chr2 44477902 44487902 . . .
+chr2 44479836 44489836 . . .
+chr2 44481770 44491770 . . .
+chr19 41727423 41737423 . . .
+chr19 41727423 41737423 . . .
+chr19 41729357 41739357 . . .
+chr19 41729357 41739357 . . .
+chr19 41731290 41741290 . . .
+chr19 41731291 41741291 . . .
+chr19 43448282 43458282 . . .
+chr19 43448304 43458304 . . .
+chr19 43448305 43458305 . . .
+chr19 43450216 43460216 . . .
+chr19 43450261 43460261 . . .
+chr19 43450261 43460261 . . .
+chr19 43452172 43462172 . . .
+chr19 43452194 43462194 . . .
+chr19 43452195 43462195 . . .
+chr19 44921111 44931111 . . .
+chr19 44921111 44931111 . . .
+chr19 44923045 44933045 . . .
+chr19 44923045 44933045 . . .
+chr19 44924978 44934978 . . .
+chr19 44924979 44934979 . . .
+chr5 111661983 111671983 . . .
+chr5 111663917 111673917 . . .
+chr5 111665851 111675851 . . .
+chr5 138871786 138881786 . . .
+chr5 138873720 138883720 . . .
+chr5 138875654 138885654 . . .
+chr9 145229735 145239735 . . .
+chr9 145229735 145239735 . . .
+chr9 145229753 145239753 . . .
+chr9 145231669 145241669 . . .
+chr9 145231669 145241669 . . .
+chr9 145231706 145241706 . . .
+chr9 145233621 145243621 . . .
+chr9 145233621 145243621 . . .
+chr9 145233640 145243640 . . .
+chr21 24889387 24899387 . . .
+chr21 24889387 24899387 . . .
+chr21 24891321 24901321 . . .
+chr21 24891321 24901321 . . .
+chr21 24893254 24903254 . . .
+chr21 24893255 24903255 . . .
diff --git a/dimelo/test/data/test_targets/megalodon_single_190/pileup.sorted.bed.gz b/dimelo/test/data/test_targets/megalodon_single_190/pileup.sorted.bed.gz
new file mode 100644
index 00000000..37fbc1ff
Binary files /dev/null and b/dimelo/test/data/test_targets/megalodon_single_190/pileup.sorted.bed.gz differ
diff --git a/dimelo/test/data/test_targets/megalodon_single_190/pileup.sorted.bed.gz.tbi b/dimelo/test/data/test_targets/megalodon_single_190/pileup.sorted.bed.gz.tbi
new file mode 100644
index 00000000..6d91384a
Binary files /dev/null and b/dimelo/test/data/test_targets/megalodon_single_190/pileup.sorted.bed.gz.tbi differ
diff --git a/dimelo/test/data/test_targets/megalodon_single_190/reads.combined_basemods.h5 b/dimelo/test/data/test_targets/megalodon_single_190/reads.combined_basemods.h5
new file mode 100644
index 00000000..c90abc1c
Binary files /dev/null and b/dimelo/test/data/test_targets/megalodon_single_190/reads.combined_basemods.h5 differ
diff --git a/dimelo/test/data/test_targets/megalodon_single_190/regions.processed.bed b/dimelo/test/data/test_targets/megalodon_single_190/regions.processed.bed
new file mode 100644
index 00000000..efac1073
--- /dev/null
+++ b/dimelo/test/data/test_targets/megalodon_single_190/regions.processed.bed
@@ -0,0 +1 @@
+chr1 114357437 114359753 . . .
diff --git a/dimelo/test/data/test_targets/megalodon_single_and_peaks_190/pileup.sorted.bed.gz b/dimelo/test/data/test_targets/megalodon_single_and_peaks_190/pileup.sorted.bed.gz
new file mode 100644
index 00000000..5d20e1ee
Binary files /dev/null and b/dimelo/test/data/test_targets/megalodon_single_and_peaks_190/pileup.sorted.bed.gz differ
diff --git a/dimelo/test/data/test_targets/megalodon_single_and_peaks_190/pileup.sorted.bed.gz.tbi b/dimelo/test/data/test_targets/megalodon_single_and_peaks_190/pileup.sorted.bed.gz.tbi
new file mode 100644
index 00000000..57239118
Binary files /dev/null and b/dimelo/test/data/test_targets/megalodon_single_and_peaks_190/pileup.sorted.bed.gz.tbi differ
diff --git a/dimelo/test/data/test_targets/megalodon_single_and_peaks_190/reads.combined_basemods.h5 b/dimelo/test/data/test_targets/megalodon_single_and_peaks_190/reads.combined_basemods.h5
new file mode 100644
index 00000000..0f669282
Binary files /dev/null and b/dimelo/test/data/test_targets/megalodon_single_and_peaks_190/reads.combined_basemods.h5 differ
diff --git a/dimelo/test/data/test_targets/megalodon_single_and_peaks_190/regions.processed.bed b/dimelo/test/data/test_targets/megalodon_single_and_peaks_190/regions.processed.bed
new file mode 100644
index 00000000..60a3c5ca
--- /dev/null
+++ b/dimelo/test/data/test_targets/megalodon_single_and_peaks_190/regions.processed.bed
@@ -0,0 +1,301 @@
+chr1 9163143 9173143 . . .
+chr1 9163176 9173176 . . .
+chr1 9163177 9173177 . . .
+chr1 9165077 9175077 . . .
+chr1 9165144 9175144 . . .
+chr1 9165144 9175144 . . .
+chr1 9167044 9177044 . . .
+chr1 9167077 9177077 . . .
+chr1 9167078 9177078 . . .
+chr1 114351661 114361661 . . .
+chr1 114351661 114361661 . . .
+chr1 114353595 114363595 . . .
+chr1 114353595 114363595 . . .
+chr1 114353595 114363595 . . .
+chr1 114355528 114365528 . . .
+chr1 114355529 114365529 . . .
+chr1 150096574 150106574 . . .
+chr1 150096591 150106591 . . .
+chr1 150096592 150106592 . . .
+chr1 150098508 150108508 . . .
+chr1 150098543 150108543 . . .
+chr1 150098543 150108543 . . .
+chr1 150100459 150110459 . . .
+chr1 150100476 150110476 . . .
+chr1 150100477 150110477 . . .
+chr1 224656606 224666606 . . .
+chr1 224656632 224666632 . . .
+chr1 224656632 224666632 . . .
+chr1 224658539 224668539 . . .
+chr1 224658592 224668592 . . .
+chr1 224658592 224668592 . . .
+chr1 224660499 224670499 . . .
+chr1 224660525 224670525 . . .
+chr1 224660526 224670526 . . .
+chr14 17372348 17382348 . . .
+chr14 17372348 17382348 . . .
+chr14 17372382 17382382 . . .
+chr14 17374282 17384282 . . .
+chr14 17374282 17384282 . . .
+chr14 17374350 17384350 . . .
+chr14 17376249 17386249 . . .
+chr14 17376250 17386250 . . .
+chr14 17376284 17386284 . . .
+chr14 44118233 44128233 . . .
+chr14 44118233 44128233 . . .
+chr14 44120167 44130167 . . .
+chr14 44120167 44130167 . . .
+chr14 44122100 44132100 . . .
+chr14 44122101 44132101 . . .
+chrX 9695409 9705409 . . .
+chrX 9697343 9707343 . . .
+chrX 9699277 9709277 . . .
+chrX 99204528 99214528 . . .
+chrX 99204528 99214528 . . .
+chrX 99206462 99216462 . . .
+chrX 99206462 99216462 . . .
+chrX 99208395 99218395 . . .
+chrX 99208396 99218396 . . .
+chr15 38659925 38669925 . . .
+chr15 38659925 38669925 . . .
+chr15 38659946 38669946 . . .
+chr15 38659946 38669946 . . .
+chr15 38661859 38671859 . . .
+chr15 38661859 38671859 . . .
+chr15 38661901 38671901 . . .
+chr15 38661901 38671901 . . .
+chr15 38663813 38673813 . . .
+chr15 38663814 38673814 . . .
+chr15 38663834 38673834 . . .
+chr15 38663835 38673835 . . .
+chr15 54627233 54637233 . . .
+chr15 54627233 54637233 . . .
+chr15 54627250 54637250 . . .
+chr15 54627250 54637250 . . .
+chr15 54629167 54639167 . . .
+chr15 54629167 54639167 . . .
+chr15 54629201 54639201 . . .
+chr15 54629201 54639201 . . .
+chr15 54631117 54641117 . . .
+chr15 54631118 54641118 . . .
+chr15 54631134 54641134 . . .
+chr15 54631135 54641135 . . .
+chr6 15556982 15566982 . . .
+chr6 15556982 15566982 . . .
+chr6 15557021 15567021 . . .
+chr6 15557021 15567021 . . .
+chr6 15558916 15568916 . . .
+chr6 15558916 15568916 . . .
+chr6 15558994 15568994 . . .
+chr6 15558994 15568994 . . .
+chr6 15560888 15570888 . . .
+chr6 15560889 15570889 . . .
+chr6 15560927 15570927 . . .
+chr6 15560928 15570928 . . .
+chr6 41191666 41201666 . . .
+chr6 41191666 41201666 . . .
+chr6 41191699 41201699 . . .
+chr6 41191699 41201699 . . .
+chr6 41193600 41203600 . . .
+chr6 41193600 41203600 . . .
+chr6 41193666 41203666 . . .
+chr6 41193666 41203666 . . .
+chr6 41195566 41205566 . . .
+chr6 41195567 41205567 . . .
+chr6 41195599 41205599 . . .
+chr6 41195600 41205600 . . .
+chr6 53004919 53014919 . . .
+chr6 53006853 53016853 . . .
+chr6 53008787 53018787 . . .
+chr6 73583214 73593214 . . .
+chr6 73583244 73593244 . . .
+chr6 73585148 73595148 . . .
+chr6 73585209 73595209 . . .
+chr6 73587112 73597112 . . .
+chr6 73587143 73597143 . . .
+chr6 155871001 155881001 . . .
+chr6 155871050 155881050 . . .
+chr6 155871050 155881050 . . .
+chr6 155872934 155882934 . . .
+chr6 155873033 155883033 . . .
+chr6 155873033 155883033 . . .
+chr6 155874917 155884917 . . .
+chr6 155874966 155884966 . . .
+chr6 155874967 155884967 . . .
+chr16 4274549 4284549 . . .
+chr16 4274580 4284580 . . .
+chr16 4274580 4284580 . . .
+chr16 4276482 4286482 . . .
+chr16 4276545 4286545 . . .
+chr16 4276545 4286545 . . .
+chr16 4278447 4288447 . . .
+chr16 4278478 4288478 . . .
+chr16 4278479 4288479 . . .
+chr16 61256434 61266434 . . .
+chr16 61256434 61266434 . . .
+chr16 61256473 61266473 . . .
+chr16 61256474 61266474 . . .
+chr16 61258368 61268368 . . .
+chr16 61258368 61268368 . . .
+chr16 61258447 61268447 . . .
+chr16 61258447 61268447 . . .
+chr16 61260341 61270341 . . .
+chr16 61260341 61270341 . . .
+chr16 61260380 61270380 . . .
+chr16 61260381 61270381 . . .
+chr16 63437466 63447466 . . .
+chr16 63437466 63447466 . . .
+chr16 63437488 63447488 . . .
+chr16 63439400 63449400 . . .
+chr16 63439400 63449400 . . .
+chr16 63439444 63449444 . . .
+chr16 63441355 63451355 . . .
+chr16 63441355 63451355 . . .
+chr16 63441377 63451377 . . .
+chr7 152338860 152348860 . . .
+chr7 152338860 152348860 . . .
+chr7 152338891 152348891 . . .
+chr7 152338891 152348891 . . .
+chr7 152340794 152350794 . . .
+chr7 152340794 152350794 . . .
+chr7 152340856 152350856 . . .
+chr7 152340856 152350856 . . .
+chr7 152342758 152352758 . . .
+chr7 152342759 152352759 . . .
+chr7 152342789 152352789 . . .
+chr7 152342790 152352790 . . .
+chr20 33330273 33340273 . . .
+chr20 33330273 33340273 . . .
+chr20 33330317 33340317 . . .
+chr20 33332207 33342207 . . .
+chr20 33332207 33342207 . . .
+chr20 33332295 33342295 . . .
+chr20 33334184 33344184 . . .
+chr20 33334185 33344185 . . .
+chr20 33334229 33344229 . . .
+chr20 36028998 36038998 . . .
+chr20 36028998 36038998 . . .
+chr20 36030932 36040932 . . .
+chr20 36030932 36040932 . . .
+chr20 36032865 36042865 . . .
+chr20 36032866 36042866 . . .
+chr20 40156088 40166088 . . .
+chr20 40156088 40166088 . . .
+chr20 40158022 40168022 . . .
+chr20 40158022 40168022 . . .
+chr20 40159955 40169955 . . .
+chr20 40159956 40169956 . . .
+chr20 59984060 59994060 . . .
+chr20 59984086 59994086 . . .
+chr20 59984086 59994086 . . .
+chr20 59985994 59995994 . . .
+chr20 59986046 59996046 . . .
+chr20 59986046 59996046 . . .
+chr20 59987954 59997954 . . .
+chr20 59987979 59997979 . . .
+chr20 59987980 59997980 . . .
+chr12 57866912 57876912 . . .
+chr12 57866936 57876936 . . .
+chr12 57868846 57878846 . . .
+chr12 57868894 57878894 . . .
+chr12 57870804 57880804 . . .
+chr12 57870828 57880828 . . .
+chr8 22841419 22851419 . . .
+chr8 22841419 22851419 . . .
+chr8 22841461 22851461 . . .
+chr8 22843353 22853353 . . .
+chr8 22843353 22853353 . . .
+chr8 22843437 22853437 . . .
+chr8 22845328 22855328 . . .
+chr8 22845329 22855329 . . .
+chr8 22845371 22855371 . . .
+chr17 42512626 42522626 . . .
+chr17 42512626 42522626 . . .
+chr17 42512657 42522657 . . .
+chr17 42514560 42524560 . . .
+chr17 42514560 42524560 . . .
+chr17 42514622 42524622 . . .
+chr17 42516524 42526524 . . .
+chr17 42516525 42526525 . . .
+chr17 42516556 42526556 . . .
+chr17 44329638 44339638 . . .
+chr17 44329638 44339638 . . .
+chr17 44331572 44341572 . . .
+chr17 44331572 44341572 . . .
+chr17 44333505 44343505 . . .
+chr17 44333506 44343506 . . .
+chr17 45982661 45992661 . . .
+chr17 45982661 45992661 . . .
+chr17 45982681 45992681 . . .
+chr17 45982681 45992681 . . .
+chr17 45984595 45994595 . . .
+chr17 45984595 45994595 . . .
+chr17 45984635 45994635 . . .
+chr17 45984635 45994635 . . .
+chr17 45986548 45996548 . . .
+chr17 45986549 45996549 . . .
+chr17 45986568 45996568 . . .
+chr17 45986569 45996569 . . .
+chr17 82003625 82013625 . . .
+chr17 82003690 82013690 . . .
+chr17 82003691 82013691 . . .
+chr17 82005559 82015559 . . .
+chr17 82005690 82015690 . . .
+chr17 82005690 82015690 . . .
+chr17 82007558 82017558 . . .
+chr17 82007623 82017623 . . .
+chr17 82007624 82017624 . . .
+chr3 48457703 48467703 . . .
+chr3 48457733 48467733 . . .
+chr3 48457733 48467733 . . .
+chr3 48459637 48469637 . . .
+chr3 48459697 48469697 . . .
+chr3 48459697 48469697 . . .
+chr3 48461601 48471601 . . .
+chr3 48461630 48471630 . . .
+chr3 48461631 48471631 . . .
+chr2 44477902 44487902 . . .
+chr2 44479836 44489836 . . .
+chr2 44481770 44491770 . . .
+chr19 41727423 41737423 . . .
+chr19 41727423 41737423 . . .
+chr19 41729357 41739357 . . .
+chr19 41729357 41739357 . . .
+chr19 41731290 41741290 . . .
+chr19 41731291 41741291 . . .
+chr19 43448282 43458282 . . .
+chr19 43448304 43458304 . . .
+chr19 43448305 43458305 . . .
+chr19 43450216 43460216 . . .
+chr19 43450261 43460261 . . .
+chr19 43450261 43460261 . . .
+chr19 43452172 43462172 . . .
+chr19 43452194 43462194 . . .
+chr19 43452195 43462195 . . .
+chr19 44921111 44931111 . . .
+chr19 44921111 44931111 . . .
+chr19 44923045 44933045 . . .
+chr19 44923045 44933045 . . .
+chr19 44924978 44934978 . . .
+chr19 44924979 44934979 . . .
+chr5 111661983 111671983 . . .
+chr5 111663917 111673917 . . .
+chr5 111665851 111675851 . . .
+chr5 138871786 138881786 . . .
+chr5 138873720 138883720 . . .
+chr5 138875654 138885654 . . .
+chr9 145229735 145239735 . . .
+chr9 145229735 145239735 . . .
+chr9 145229753 145239753 . . .
+chr9 145231669 145241669 . . .
+chr9 145231669 145241669 . . .
+chr9 145231706 145241706 . . .
+chr9 145233621 145243621 . . .
+chr9 145233621 145243621 . . .
+chr9 145233640 145243640 . . .
+chr21 24889387 24899387 . . .
+chr21 24889387 24899387 . . .
+chr21 24891321 24901321 . . .
+chr21 24891321 24901321 . . .
+chr21 24893254 24903254 . . .
+chr21 24893255 24903255 . . .
diff --git a/dimelo/test/data/test_targets/megalodon_single_nothresh/pileup.sorted.bed.gz b/dimelo/test/data/test_targets/megalodon_single_nothresh/pileup.sorted.bed.gz
new file mode 100644
index 00000000..b1bc1343
Binary files /dev/null and b/dimelo/test/data/test_targets/megalodon_single_nothresh/pileup.sorted.bed.gz differ
diff --git a/dimelo/test/data/test_targets/megalodon_single_nothresh/pileup.sorted.bed.gz.tbi b/dimelo/test/data/test_targets/megalodon_single_nothresh/pileup.sorted.bed.gz.tbi
new file mode 100644
index 00000000..c7e5e13e
Binary files /dev/null and b/dimelo/test/data/test_targets/megalodon_single_nothresh/pileup.sorted.bed.gz.tbi differ
diff --git a/dimelo/test/data/test_targets/megalodon_single_nothresh/reads.combined_basemods.h5 b/dimelo/test/data/test_targets/megalodon_single_nothresh/reads.combined_basemods.h5
new file mode 100644
index 00000000..cbeef2e3
Binary files /dev/null and b/dimelo/test/data/test_targets/megalodon_single_nothresh/reads.combined_basemods.h5 differ
diff --git a/dimelo/test/data/test_targets/megalodon_single_nothresh/regions.processed.bed b/dimelo/test/data/test_targets/megalodon_single_nothresh/regions.processed.bed
new file mode 100644
index 00000000..4a44e5d2
--- /dev/null
+++ b/dimelo/test/data/test_targets/megalodon_single_nothresh/regions.processed.bed
@@ -0,0 +1 @@
+chr1 114353595 114363595 . . .
diff --git a/dimelo/test/data/test_targets/test_matrix.pickle b/dimelo/test/data/test_targets/test_matrix.pickle
new file mode 100644
index 00000000..26328727
Binary files /dev/null and b/dimelo/test/data/test_targets/test_matrix.pickle differ
diff --git a/dimelo/test/data/winnowmap_guppy_merge_subset.bam b/dimelo/test/data/winnowmap_guppy_merge_subset.bam
deleted file mode 100644
index f7e74d4d..00000000
Binary files a/dimelo/test/data/winnowmap_guppy_merge_subset.bam and /dev/null differ
diff --git a/dimelo/test/data/winnowmap_guppy_merge_subset.bam.bai b/dimelo/test/data/winnowmap_guppy_merge_subset.bam.bai
deleted file mode 100644
index 61f6e09e..00000000
Binary files a/dimelo/test/data/winnowmap_guppy_merge_subset.bam.bai and /dev/null differ
diff --git a/dimelo/test/dimelo_test.py b/dimelo/test/dimelo_test.py
index 0e64f674..f7b84b2e 100644
--- a/dimelo/test/dimelo_test.py
+++ b/dimelo/test/dimelo_test.py
@@ -1,451 +1,1241 @@
-import subprocess
-import unittest
+import filecmp
+import gzip
+import pickle
from pathlib import Path
+import h5py
+import numpy as np
+import plotly
+import pytest
+from matplotlib.axes import Axes
+
import dimelo as dm
-from dimelo.test import DiMeLoTestCase
-
-"""
-Inputs
-
-NOTE: Changing any of these paths or database contents may require recomputing the hard-coded database
-hashes in the tests below.
-"""
-# TODO: Is this a reasonable way to specify input files? Where is this intended to be run from?
-input_bams = [
- Path("dimelo/test/data/mod_mappings_subset.bam"),
- Path("dimelo/test/data/winnowmap_guppy_merge_subset.bam"),
-]
-input_sample_names = ["test1", "test2"]
-input_bed = Path("dimelo/test/data/test.bed")
-input_region = "chr1:2907273-2909473"
-
-"""
-Outputs
-"""
-# TODO: When implemented elsewhere, replace these explicit database paths with modular calls
-output_dbs = [f.with_suffix(".db").name for f in input_bams]
-
-
-def db_hash(db_path: Path) -> bytes:
- """Computes a hash of the sqlite db at the specified path."""
- # Use the sqlite3 .sha3sum command to compute hash
- sqlite_output = subprocess.run(
- ["sqlite3", db_path, ".sha3sum"], capture_output=True
- )
- return sqlite_output.stdout
-
-
-class TestParseBam(DiMeLoTestCase):
- def test_parse_bam_bedFile(self):
- """Tests parsing a bam file into a database, specifying windows using a bed file.
-
- Notes:
- - cores set to 1 to ensure that hashes come out the same each time
- - thresholds set super low to ensure that meaningful rows are inserted for all modifications
- """
- bam_idx = 0
-
- # Inputs
- bam_file = input_bams[bam_idx]
- sample_name = input_sample_names[bam_idx]
- bed_file = input_bed
-
- # Outputs
- db_file = output_dbs[bam_idx]
-
- dm.parse_bam(
- fileName=str(bam_file),
- sampleName=sample_name,
- outDir=str(self.outDir),
- bedFile=str(bed_file),
- basemod="A+CG",
- center=True,
- windowSize=500,
- threshA=1,
- threshC=1,
- extractAllBases=False,
- cores=1,
- )
- database_path = self.outDir / db_file
- # Check whether database contents are the same as expected
- self.assertEqual(
- db_hash(database_path),
- b"8a8fe3984448ee6f215d1d71c01a4d8edde7691bcdb0bf28f2c01cd2\n",
+from dimelo.test import DiMeLoParsingTestCase, filter_kwargs_for_func
+
+script_location = Path(__file__).parent
+
+with open(
+ script_location / "data" / "test_targets" / "test_matrix.pickle", "rb"
+) as file:
+ test_matrix = pickle.load(file)
+
+
+@pytest.mark.parametrize(
+ "test_case,kwargs,results",
+ [(case, inputs, outputs) for case, (inputs, outputs) in test_matrix.items()],
+)
+class TestParseToPlot(DiMeLoParsingTestCase):
+ """
+ Tests parsing a bam file into a bed.gz pileup and an hdf5 single read file, then tests that each stage
+ of parse_bam -> load_processed -> plotting works correctly, including comparing where applicable.
+
+ This test class requires the output files be bitwise identical, compared to pre-defined reference files.
+ This means that interface changes require replacing these files.
+
+ For integration tests we test interfaces end-to-end.
+ """
+
+ def test_unit__pileup(
+ cls,
+ test_case,
+ kwargs,
+ results,
+ ):
+ kwargs_pileup = filter_kwargs_for_func(dm.parse_bam.pileup, kwargs)
+ kwargs_pileup["output_directory"] = cls.outDir
+ pileup_bed, regions_processed = dm.parse_bam.pileup(
+ **kwargs_pileup,
+ ref_genome=cls.reference_genome,
)
- def test_parse_bam_region(self):
- """Tests parsing a bam file into a database, specifying windows using a region string.
-
- Notes:
- - cores set to 1 to ensure that hashes come out the same each time
- - thresholds set super low to ensure that meaningful rows are inserted for all modifications
- """
- bam_idx = 0
-
- # Inputs
- bam_file = input_bams[bam_idx]
- sample_name = input_sample_names[bam_idx]
-
- # Outputs
- db_file = output_dbs[bam_idx]
-
- dm.parse_bam(
- fileName=str(bam_file),
- sampleName=sample_name,
- outDir=str(self.outDir),
- region=input_region,
- basemod="A+CG",
- threshA=1,
- threshC=1,
+ pileup_target, regions_target = results["pileup"]
+
+ if pileup_target is not None and regions_target is not None:
+ # This is necessary because the gzipped files are different on mac vs linux, but the contents should be identical (and are, so far)
+ # Not sure why the compression ratio is better on Linux when both are using pysam.tabix_compress with pysam 0.22.0 and zlib 1.2.13 but whatcha gonna do
+ with (
+ gzip.open(pileup_bed, "rt") as f1,
+ gzip.open(pileup_target, "rt") as f2,
+ ):
+ # Read and compare file contents
+ file1_contents = f1.read()
+ file2_contents = f2.read()
+ assert (
+ file1_contents == file2_contents
+ ), f"{test_case}: {pileup_bed} does not match {pileup_target}."
+ assert filecmp.cmp(
+ regions_processed, regions_target, shallow=False
+ ), f"{test_case}: {regions_processed} does not match {regions_target}."
+ else:
+ print(f"{test_case} skipped for pileup.")
+
+ def test_unit__extract(
+ cls,
+ test_case,
+ kwargs,
+ results,
+ ):
+ kwargs_extract = filter_kwargs_for_func(dm.parse_bam.extract, kwargs)
+ kwargs_extract["output_directory"] = cls.outDir
+ # extract can behave non-deterministically in terms of read output order
+ # if run on multiple cores. This is not an issue for sorted read loads,
+ # but means the file itself changes even though the meaningful contents don't
+ # given that parallelization here is purely within modkit anyway, the design
+ # choice was made to always single-core for extract testing
+ if "cores" in kwargs_extract:
+ del kwargs_extract["cores"]
+ extract_h5, regions_processed = dm.parse_bam.extract(
+ **kwargs_extract,
+ ref_genome=cls.reference_genome,
cores=1,
)
- database_path = self.outDir / db_file
- # Check whether database contents are the same as expected
- self.assertEqual(
- db_hash(database_path),
- b"baf72c009547e8a2e87402db04695d698e648c3856bcc9b8c0c6cf8a\n",
- )
-
- def test_parse_bam_bedFile_region_mutual_exclusion(self):
- """Verifies that bedFile and region arguments remain mutually exclusive."""
- bam_idx = 0
- # Inputs
- bam_file = input_bams[bam_idx]
- sample_name = input_sample_names[bam_idx]
- bed_file = input_bed
-
- with self.assertRaises(RuntimeError):
- dm.parse_bam(
- fileName=str(bam_file),
- sampleName=sample_name,
- outDir=str(self.outDir),
- bedFile=str(bed_file),
- region=input_region,
+ extract_target, regions_target = results["extract"]
+
+ if extract_target is not None and regions_target is not None:
+ # The hdf5 files will have a few bits different due to gzip compression timestamps, but comparing the exact size should pass because
+ # the timestamps are not themselves compressed inside the vector gzip objects
+ h5_test = h5py.File(extract_h5)
+ h5_target = h5py.File(extract_target)
+ datasets = [
+ name for name, obj in h5_target.items() if isinstance(obj, h5py.Dataset)
+ ]
+ for dataset in datasets:
+ if dataset in ["threshold"]:
+ assert h5_test[dataset] == h5_target[dataset] or np.isnan(
+ h5_test[dataset]
+ ) == np.isnan(h5_target[dataset])
+ else:
+ test_dataset = list(h5_test[dataset][:])
+ target_dataset = list(h5_target[dataset][:])
+ if dataset in ["mod_vector", "val_vector"]:
+ assert [
+ gzip.decompress(test_item.tobytes())
+ for test_item in test_dataset
+ ] == [
+ gzip.decompress(target_item.tobytes())
+ for target_item in target_dataset
+ ], f"{test_case}: {dataset} does not match."
+ else:
+ assert (
+ test_dataset == target_dataset
+ ), f"{test_case}: {dataset} does not match."
+ # assert os.path.getsize(extract_h5) == os.path.getsize(extract_target), f"{test_case}: {extract_h5} does not match {extract_target}."
+ assert filecmp.cmp(
+ regions_processed, regions_target, shallow=False
+ ), f"{test_case}: {regions_processed} does not match {regions_target}."
+ else:
+ print(f"{test_case} skipped for extract.")
+
+ def test_integration__pileup_load_plot(
+ cls,
+ test_case,
+ kwargs,
+ results,
+ ):
+ # This stuff is commented out because if we run this integration test in the same class as the unit test
+ # for parsing, we can cut down total end-to-end testing overhead by about 2x by just using that output
+ #
+ # kwargs_pileup = filter_kwargs_for_func(dm.parse_bam.pileup,kwargs)
+ # kwargs_pileup['output_directory'] = cls.outDir
+ # pileup_bed,_ = dm.parse_bam.pileup(
+ # **kwargs_pileup,
+ # ref_genome = cls.reference_genome,
+ # )
+ # We just grab the output from TestParseBam::test_pileup, wasteful to re-run an identical modkit command
+ pileup_bed = cls.outDir / kwargs["output_name"] / "pileup.sorted.bed.gz"
+
+ # If we have results for this pileup, check that the load_processed values are ok out of the output file
+ if results["pileup"][0] is not None:
+ kwargs_counts_from_bedmethyl = filter_kwargs_for_func(
+ dm.load_processed.pileup_counts_from_bedmethyl, kwargs
)
+ for motif in kwargs["motifs"]:
+ expected = results["pileup_counts_from_bedmethyl"][motif]
+ actual = dm.load_processed.pileup_counts_from_bedmethyl(
+ bedmethyl_file=pileup_bed,
+ motif=motif,
+ **kwargs_counts_from_bedmethyl,
+ )
+ assert (
+ actual == expected
+ ), f"{test_case}: Counts for motif {motif} are not equal"
- def test_parse_bam_region_center_incompatible(self):
- """Verifies that region and center arguments are incompatible."""
- bam_idx = 0
-
- # Inputs
- bam_file = input_bams[bam_idx]
- sample_name = input_sample_names[bam_idx]
-
- with self.assertRaises(RuntimeError):
- dm.parse_bam(
- fileName=str(bam_file),
- sampleName=sample_name,
- outDir=str(self.outDir),
- region=input_region,
- center=True,
+ kwargs_vectors_from_bedmethyl = filter_kwargs_for_func(
+ dm.load_processed.pileup_vectors_from_bedmethyl, kwargs
+ )
+ for motif in kwargs["motifs"]:
+ expected_tuple = results["pileup_vectors_from_bedmethyl"][motif]
+ actual_tuple = dm.load_processed.pileup_vectors_from_bedmethyl(
+ bedmethyl_file=results["pileup"][0],
+ motif=motif,
+ **kwargs_vectors_from_bedmethyl,
+ )
+ assert len(expected_tuple) == len(
+ actual_tuple
+ ), f"{test_case}: Unexpected number of arrays returned for {motif}"
+
+ for expected, actual in zip(expected_tuple, actual_tuple):
+ # TODO: The following was the original assertion error message, but it was not written in a functional way. Find a way to make it work as intended.
+ # assert np.array_equal(expected, actual), f"{test_case}: Arrays for motif {motif} are not equal: expected {value} but got {actual[key]}"
+ assert np.array_equal(
+ expected, actual
+ ), f"{test_case}: Arrays for motif {motif} are not equal."
+ else:
+ print(
+ f"{test_case} loading skipped for pileup_load_plot, continuing to plotting."
)
-
-# TODO: More robust qc_report tests
-class TestQCReport(DiMeLoTestCase):
- def test_qc_report_one_sample(self):
- """Tests generating a single qc report.
-
- Notes:
- - cores set to 1 to ensure that hashes come out the same each time
- """
- bam_idx = 0
-
- # Inputs
- bam_file = input_bams[bam_idx]
- sample_name = input_sample_names[bam_idx]
-
- # Outputs
- db_file = output_dbs[bam_idx]
- qc_report_file = f"{sample_name}_qc_report.pdf"
-
- dm.qc_report(
- fileNames=str(bam_file),
- sampleNames=sample_name,
- outDir=str(self.outDir),
- cores=1,
+ kwargs_plot_enrichment_plot_enrichment = filter_kwargs_for_func(
+ dm.plot_enrichment.plot_enrichment, kwargs
)
-
- database_path = self.outDir / db_file
- # Check whether database contents are the same as expected
- self.assertEqual(
- db_hash(database_path),
- b"58ed1ba2ce0c2e0f257ead1d9f2b9239f2777b8aefbf38ea7a40e464\n",
+ for motif in kwargs["motifs"]:
+ regions_list = (
+ kwargs["regions"]
+ if isinstance(kwargs["regions"], list)
+ else [kwargs["regions"]]
+ )
+ kwargs_plot_enrichment_plot_enrichment["motifs"] = [
+ motif for _ in regions_list
+ ]
+ ax = dm.plot_enrichment.plot_enrichment(
+ mod_file_names=[pileup_bed for _ in regions_list],
+ regions_list=regions_list,
+ sample_names=["label" for _ in regions_list],
+ **kwargs_plot_enrichment_plot_enrichment,
+ )
+ assert isinstance(ax, Axes), f"{test_case}: plotting failed for {motif}."
+ kwargs_plot_enrichment_profile_plot_enrichment_profile = filter_kwargs_for_func(
+ dm.plot_enrichment_profile.plot_enrichment_profile,
+ kwargs,
+ extra_args=["window_size", "smooth_window"],
)
-
- self.assertOutputFileExists(qc_report_file)
-
- def test_qc_report_multi_sample(self):
- """Tests generating multiple qc reports at once."""
- dm.qc_report(
- fileNames=[str(f) for f in input_bams],
- sampleNames=input_sample_names,
- outDir=str(self.outDir),
+ for motif in kwargs["motifs"]:
+ regions_list = (
+ kwargs["regions"]
+ if isinstance(kwargs["regions"], list)
+ else [kwargs["regions"]]
+ )
+ kwargs_plot_enrichment_profile_plot_enrichment_profile["motifs"] = [
+ motif for _ in regions_list
+ ]
+ ax = dm.plot_enrichment_profile.plot_enrichment_profile(
+ mod_file_names=[pileup_bed for _ in regions_list],
+ regions_list=regions_list,
+ sample_names=["label" for _ in regions_list],
+ **kwargs_plot_enrichment_profile_plot_enrichment_profile,
+ )
+ assert isinstance(ax, Axes), f"{test_case}: plotting failed for {motif}."
+
+ def test_integration__extract_load_plot(
+ cls,
+ test_case,
+ kwargs,
+ results,
+ ):
+ # This stuff is commented out because if we run this integration test in the same class as the unit test
+ # for parsing, we can cut down total end-to-end testing overhead by about 2x by just using that output
+ #
+ # if results['extract'][0] is None:
+ # return
+
+ # kwargs_extract = filter_kwargs_for_func(dm.parse_bam.extract,kwargs)
+ # kwargs_extract['output_directory'] = cls.outDir
+ # extract_h5,_ = dm.parse_bam.extract(
+ # **kwargs_extract,
+ # ref_genome = cls.reference_genome,
+ # )
+ # We just grab the output from TestParseBam::test_extract, wasteful to re-run an identical modkit command
+ extract_h5 = cls.outDir / kwargs["output_name"] / "reads.combined_basemods.h5"
+
+ # If we have results for this extraction, check that the load_processed values are ok out of the output file
+ if results["extract"][0] is not None:
+ kwargs_read_vectors_from_hdf5 = filter_kwargs_for_func(
+ dm.load_processed.read_vectors_from_hdf5, kwargs
+ )
+ read_data_list, datasets, _ = dm.load_processed.read_vectors_from_hdf5(
+ file=extract_h5,
+ **kwargs_read_vectors_from_hdf5,
+ )
+ read_data_dict = {}
+ # Pull out the data from the first read
+ for idx, dataset in enumerate(datasets):
+ for read_data in read_data_list:
+ read_data_dict[dataset] = read_data[idx]
+ break
+ expected = results["read_vectors_from_hdf5"]
+ actual = read_data_dict
+ for key, value in expected.items():
+ if isinstance(value, np.ndarray):
+ assert np.allclose(
+ actual[key], expected[key], atol=1e-5
+ ), f"""{test_case}: Arrays for {key} are not equal
+mismatch at {np.where(value != actual[key])}
+mismatch values expected {value[np.where(value != actual[key])]} vs actual {actual[key][np.where(value != actual[key])]}
+{value[np.where(value != actual[key])[0]]} vs {actual[key][np.where(value != actual[key])[0]]}.
+ """
+ elif isinstance(value, (str, int, bool)):
+ assert (
+ actual[key] == expected[key]
+ ), f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}."
+ else:
+ assert np.isclose(
+ actual[key], value, atol=1e-4
+ ), f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}."
+ else:
+ print("{test_case} skipped for read_vectors_from_hdf5.")
+ kwargs_plot_reads_plot_reads = filter_kwargs_for_func(
+ dm.plot_reads.plot_reads, kwargs
)
+ if kwargs["thresh"] is not None:
+ ax = dm.plot_reads.plot_reads(
+ mod_file_name=extract_h5,
+ **kwargs_plot_reads_plot_reads,
+ )
+ assert isinstance(ax, Axes), f"{test_case}: plotting failed."
+ else: # if the extract parameters did not have a threshold, plot_reads.plot_reads should raise an error
+ with pytest.raises(ValueError) as excinfo:
+ ax = dm.plot_reads.plot_reads(
+ mod_file_name=extract_h5,
+ **kwargs_plot_reads_plot_reads,
+ )
+ assert "No threshold has been applied" in str(
+ excinfo.value
+ ), f"{test_case}: unexpected exception {excinfo.value}"
+ # providing a threshold should be enough to run plot_reads.plot_reads without an error
+ kwargs_plot_reads_plot_reads["thresh"] = 0.75
+ ax = dm.plot_reads.plot_reads(
+ mod_file_name=extract_h5,
+ **kwargs_plot_reads_plot_reads,
+ )
+ assert isinstance(ax, Axes), f"{test_case}: plotting failed."
+
+
+@pytest.mark.parametrize(
+ "test_case,kwargs,results",
+ [(case, inputs, outputs) for case, (inputs, outputs) in test_matrix.items()],
+)
+class TestLoadProcessed:
+ """
+ Tests loading values from bed.gz pileups and hdf5 single read files.
+
+ This test class requires that values are identical. It loads from pre-defined reference files.
+ This means that interface changes require replacing these files by re-running the Generate parse_bam
+ outputs section of dimelo/test/generate_test_targets.ipynb.
+ """
+
+ def test_unit__regions_to_list(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ """
+ This test currently only tests that regions_to_list can run all relevant loaders, and assumes their
+ values are correct based on the subsequent tests that verify values.
+ """
+ if results["pileup"][0] is not None:
+ # test pileup loading
+ kwargs_counts_from_bedmethyl = filter_kwargs_for_func(
+ dm.load_processed.pileup_counts_from_bedmethyl, kwargs
+ )
+ kwargs_vectors_from_bedmethyl = filter_kwargs_for_func(
+ dm.load_processed.pileup_vectors_from_bedmethyl, kwargs
+ )
+ for motif in kwargs["motifs"]:
+ dm.load_processed.regions_to_list(
+ function_handle=dm.load_processed.pileup_counts_from_bedmethyl,
+ bedmethyl_file=results["pileup"][0],
+ motif=motif,
+ **kwargs_counts_from_bedmethyl,
+ )
+ dm.load_processed.regions_to_list(
+ function_handle=dm.load_processed.pileup_vectors_from_bedmethyl,
+ bedmethyl_file=results["pileup"][0],
+ motif=motif,
+ **kwargs_vectors_from_bedmethyl,
+ )
+ if results["extract"][0] is not None:
+ kwargs_read_vectors_from_hdf5 = filter_kwargs_for_func(
+ dm.load_processed.read_vectors_from_hdf5, kwargs
+ )
+ dm.load_processed.regions_to_list(
+ function_handle=dm.load_processed.read_vectors_from_hdf5,
+ file=results["extract"][0],
+ **kwargs_read_vectors_from_hdf5,
+ )
- for db_file in output_dbs:
- self.assertOutputFileExists(db_file)
- for sample_name in input_sample_names:
- self.assertOutputFileExists(f"{sample_name}_qc_report.pdf")
-
+ def test_unit__pileup_counts_from_bedmethyl(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["pileup"][0] is not None:
+ kwargs_counts_from_bedmethyl = filter_kwargs_for_func(
+ dm.load_processed.pileup_counts_from_bedmethyl, kwargs
+ )
+ for motif in kwargs["motifs"]:
+ expected = results["pileup_counts_from_bedmethyl"][motif]
+ actual = dm.load_processed.pileup_counts_from_bedmethyl(
+ bedmethyl_file=results["pileup"][0],
+ motif=motif,
+ **kwargs_counts_from_bedmethyl,
+ )
+ assert (
+ actual == expected
+ ), f"{test_case}: Counts for motif {motif} are not equal"
+ else:
+ print(f"{test_case} skipped for pileup_counts_from_bedmethyl.")
+
+ def test_unit__pileup_vectors_from_bedmethyl(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["pileup"][0] is not None:
+ kwargs_vectors_from_bedmethyl = filter_kwargs_for_func(
+ dm.load_processed.pileup_vectors_from_bedmethyl, kwargs
+ )
+ for motif in kwargs["motifs"]:
+ expected_tuple = results["pileup_vectors_from_bedmethyl"][motif]
+ actual_tuple = dm.load_processed.pileup_vectors_from_bedmethyl(
+ bedmethyl_file=results["pileup"][0],
+ motif=motif,
+ **kwargs_vectors_from_bedmethyl,
+ )
+ assert len(expected_tuple) == len(
+ actual_tuple
+ ), f"{test_case}: Unexpected number of arrays returned for {motif}"
+
+ for expected, actual in zip(expected_tuple, actual_tuple):
+ assert np.array_equal(
+ expected, actual
+ ), f"{test_case}: Arrays for motif {motif} are not equal"
+ else:
+ print(f"{test_case} skipped for pileup_vectors_from_bedmethyl.")
+
+ def test_unit__read_vectors_from_hdf5(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["extract"][0] is not None:
+ kwargs_read_vectors_from_hdf5 = filter_kwargs_for_func(
+ dm.load_processed.read_vectors_from_hdf5, kwargs
+ )
+ read_data_list, datasets, _ = dm.load_processed.read_vectors_from_hdf5(
+ file=results["extract"][0],
+ **kwargs_read_vectors_from_hdf5,
+ )
+ read_data_dict = {}
+ # Pull out the data from the first read
+ for idx, dataset in enumerate(datasets):
+ for read_data in read_data_list:
+ read_data_dict[dataset] = read_data[idx]
+ break
+ expected = results["read_vectors_from_hdf5"]
+ actual = read_data_dict
+ for key, value in expected.items():
+ if isinstance(value, np.ndarray):
+ assert np.allclose(
+ actual[key], expected[key], atol=1e-5
+ ), f"""{test_case}: Arrays for {key} are not equal
+mismatch at {np.where(value != actual[key])}
+mismatch values expected {value[np.where(value != actual[key])]} vs actual {actual[key][np.where(value != actual[key])]}
+{value[np.where(value != actual[key])[0]]} vs {actual[key][np.where(value != actual[key])[0]]}.
+ """
+ elif isinstance(value, (str, int, bool)):
+ assert (
+ actual[key] == expected[key]
+ ), f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}."
+ else:
+ assert np.isclose(
+ actual[key], value, atol=1e-4
+ ), f"{test_case}: Values for {key} are not equal: expected {value} but got {actual[key]}."
+ else:
+ print("{test_case} skipped for read_vectors_from_hdf5.")
+
+
+@pytest.mark.parametrize(
+ "test_case,kwargs,results",
+ [(case, inputs, outputs) for case, (inputs, outputs) in test_matrix.items()],
+)
+class TestExport(DiMeLoParsingTestCase):
+ """
+ Tests file export functionality in export module.
+
+ This test currently simply checks that we can make the appropriate output files without raising errors.
+ The values stored in the files are not verified. Future work should add test coverage for values, but at
+ the moment there is no loading infrastructure in place for bigwig files making such implementation high-overhead.
+ """
+
+ def test_unit__pileup_to_bigwig(
+ cls,
+ test_case,
+ kwargs,
+ results,
+ ):
+ kwargs_bigwig = filter_kwargs_for_func(dm.export.pileup_to_bigwig, kwargs)
+ kwargs_bigwig["bedmethyl_file"] = results["pileup"][0]
+ kwargs_bigwig["bigwig_file"] = (
+ cls.outDir / kwargs["output_name"] / "pileup.fractions.bigwig"
+ )
+ for motif in kwargs["motifs"]:
+ dm.export.pileup_to_bigwig(
+ **kwargs_bigwig,
+ motif=motif,
+ )
-class TestPlotBrowser(DiMeLoTestCase):
- def test_plot_browser_html(self):
- """Tests generation of an html browser."""
- bam_idx = 0
- # Inputs
- bam_file = input_bams[bam_idx]
- sample_name = input_sample_names[bam_idx]
+class TestPlotEnrichmentSynthetic:
+ """
+ Tests plotting functionality in plot_enrichment.
- # Outputs
- db_file = output_dbs[bam_idx]
+ This test simply checks that we can make plots from synthetic data without raising errors.
+ Appearance of plots is not verified.
+ """
- dm.plot_browser(
- fileNames=str(bam_file),
- sampleNames=sample_name,
- region=input_region,
- basemod="A+CG",
- outDir=str(self.outDir),
- static=False,
+ def test_unit__plot_enrichment_plot_enrichment_synthetic(self):
+ ax = dm.plot_enrichment.plot_enrichment(
+ mod_file_names=["test.fake", "test.fake"],
+ regions_list=["test.bed", "test.bed"],
+ motifs=["A", "A"],
+ sample_names=["a", "b"],
)
+ assert isinstance(ax, Axes)
- self.assertOutputFileExists(db_file)
+ def test_unit__plot_enrichment_by_modification_synthetic(self):
+ ax = dm.plot_enrichment.by_modification(
+ mod_file_name="test.fake", regions="test.bed", motifs=["A", "C"]
+ )
+ assert isinstance(ax, Axes)
- # TODO: It's difficult to get the output file name. Find a better way to do this check.
- n_html_files = len(list(self.outDir.glob("*.html")))
- self.assertEqual(n_html_files, 1)
+ def test_unit__plot_enrichment_by_regions_synthetic(self):
+ ax = dm.plot_enrichment.by_regions(
+ mod_file_name="test.fake",
+ regions_list=["test1.bed", "test2.bed"],
+ motif="A",
+ )
+ assert isinstance(ax, Axes)
- for basemod in ["A", "CG"]:
- for plot_type in ["fraction", "total"]:
- rolling_avg_file = (
- f"{sample_name}_{basemod}_sm_rolling_avg_{plot_type}.pdf"
+ def test_unit__plot_enrichment_by_dataset_synthetic(self):
+ ax = dm.plot_enrichment.by_dataset(
+ mod_file_names=["test1.fake", "test2.fake"], regions="test.bed", motif="A"
+ )
+ assert isinstance(ax, Axes)
+
+
+@pytest.mark.parametrize(
+ "test_case,kwargs,results",
+ [(case, inputs, outputs) for case, (inputs, outputs) in test_matrix.items()],
+)
+class TestPlotEnrichment:
+ def test_unit__plot_enrichment_plot_enrichment(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["pileup"][0] is not None:
+ kwargs_plot_enrichment_plot_enrichment = filter_kwargs_for_func(
+ dm.plot_enrichment.plot_enrichment, kwargs
+ )
+ for motif in kwargs["motifs"]:
+ regions_list = (
+ kwargs["regions"]
+ if isinstance(kwargs["regions"], list)
+ else [kwargs["regions"]]
+ )
+ kwargs_plot_enrichment_plot_enrichment["motifs"] = [
+ motif for _ in regions_list
+ ]
+ ax = dm.plot_enrichment.plot_enrichment(
+ mod_file_names=[results["pileup"][0] for _ in regions_list],
+ regions_list=regions_list,
+ sample_names=["label" for _ in regions_list],
+ **kwargs_plot_enrichment_plot_enrichment,
+ )
+ assert isinstance(
+ ax, Axes
+ ), f"{test_case}: plotting failed for {motif}."
+ else:
+ print(f"{test_case} skipped for plot_enrichment.plot_enrichment.")
+
+ def test_unit__plot_enrichment_by_regions(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["pileup"][0] is not None:
+ kwargs_plot_enrichment_by_regions = filter_kwargs_for_func(
+ dm.plot_enrichment.by_regions, kwargs
+ )
+ for motif in kwargs["motifs"]:
+ regions_list = (
+ kwargs["regions"]
+ if isinstance(kwargs["regions"], list)
+ else [kwargs["regions"]]
+ )
+ ax = dm.plot_enrichment.by_regions(
+ mod_file_name=results["pileup"][0],
+ regions_list=regions_list,
+ motif=motif,
+ sample_names=["label" for _ in regions_list],
+ **kwargs_plot_enrichment_by_regions,
+ )
+ assert isinstance(
+ ax, Axes
+ ), f"{test_case}: plotting failed for {motif}."
+ else:
+ print(f"{test_case} skipped for plot_enrichment.by_regions.")
+
+ def test_unit__plot_enrichment_by_modification(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["pileup"][0] is not None:
+ kwargs_plot_enrichment_by_modification = filter_kwargs_for_func(
+ dm.plot_enrichment.by_modification, kwargs
+ )
+ ax = dm.plot_enrichment.by_modification(
+ mod_file_name=results["pileup"][0],
+ **kwargs_plot_enrichment_by_modification,
+ )
+ assert isinstance(ax, Axes), f"{test_case}: plotting failed."
+ else:
+ print(f"{test_case} skipped for plot_enrichment.by_modification.")
+
+ def test_unit__plot_enrichment_by_dataset(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["pileup"][0] is not None:
+ kwargs_plot_enrichment_by_dataset = filter_kwargs_for_func(
+ dm.plot_enrichment.by_dataset, kwargs
+ )
+ for motif in kwargs["motifs"]:
+ ax = dm.plot_enrichment.by_dataset(
+ mod_file_names=[results["pileup"][0]],
+ motif=motif,
+ **kwargs_plot_enrichment_by_dataset,
)
- self.assertOutputFileExists(rolling_avg_file)
-
- # TODO: It's difficult to get the output file name. Find a better way to do this check.
- n_bed_files = len(list(self.outDir.glob("*.bed")))
- self.assertEqual(n_bed_files, 2)
-
- def test_plot_browser_pdf(self):
- """Tests generation of a pdf browser."""
- bam_idx = 0
-
- # Inputs
- bam_file = input_bams[bam_idx]
- sample_name = input_sample_names[bam_idx]
-
- # Outputs
- db_file = output_dbs[bam_idx]
-
- dm.plot_browser(
- fileNames=str(bam_file),
- sampleNames=sample_name,
- region=input_region,
- basemod="A+CG",
- outDir=str(self.outDir),
- static=True,
+ assert isinstance(ax, Axes), f"{test_case}: plotting failed."
+ else:
+ print(f"{test_case} skipped for plot_enrichment.by_dataset.")
+
+
+class TestPlotEnrichmentProfileSynthetic:
+ """
+ Tests plotting functionality in plot_enrichment_profile.
+
+ This test simply checks that we can make plots from synthetic data without raising errors.
+ Appearance of plots is not verified.
+ """
+
+ def test_unit__plot_enrichment_profile_plot_enrichment_profile_synthetic(self):
+ ax = dm.plot_enrichment_profile.plot_enrichment_profile(
+ mod_file_names=["test1.fake", "test2.fake"],
+ regions_list=["test1.bed", "test2.bed"],
+ motifs=["A", "C"],
+ window_size=500,
+ sample_names=["sample1", "sample2"],
+ smooth_window=50,
)
-
- self.assertOutputFileExists(db_file)
-
- # TODO: It's difficult to get the output file name. Find a better way to do this check.
- n_pdf_files = len(list(self.outDir.glob("*.pdf")))
- self.assertEqual(n_pdf_files, 5)
-
- for basemod in ["A", "CG"]:
- for plot_type in ["fraction", "total"]:
- rolling_avg_file = (
- f"{sample_name}_{basemod}_sm_rolling_avg_{plot_type}.pdf"
+ assert isinstance(ax, Axes)
+
+ def test_unit__plot_enrichment_profile_by_modification_synthetic(self):
+ ax = dm.plot_enrichment_profile.by_modification(
+ mod_file_name="test.fake",
+ regions="test.bed",
+ window_size=500,
+ motifs=["A", "C"],
+ smooth_window=50,
+ )
+ assert isinstance(ax, Axes)
+
+ def test_unit__plot_enrichment_profile_by_region_synthetic(self):
+ ax = dm.plot_enrichment_profile.by_regions(
+ mod_file_name="test.fake",
+ regions_list=["test1.bed", "test2.bed"],
+ motif="A",
+ window_size=500,
+ sample_names=["on target", "off target"],
+ smooth_window=50,
+ )
+ assert isinstance(ax, Axes)
+
+ def test_unit__plot_enrichment_profile_by_dataset_synthetic(self):
+ ax = dm.plot_enrichment_profile.by_dataset(
+ mod_file_names=["test1.fake", "test2.fake"],
+ regions="test.bed",
+ motif="A",
+ window_size=500,
+ sample_names=["experiment 1", "experiment 2"],
+ smooth_window=50,
+ )
+ assert isinstance(ax, Axes)
+
+
+@pytest.mark.parametrize(
+ "test_case,kwargs,results",
+ [(case, inputs, outputs) for case, (inputs, outputs) in test_matrix.items()],
+)
+class TestPlotEnrichmentProfile:
+ def test_unit__plot_enrichment_profile_plot_enrichment_profile(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["pileup"][0] is not None:
+ kwargs_plot_enrichment_profile_plot_enrichment_profile = (
+ filter_kwargs_for_func(
+ dm.plot_enrichment_profile.plot_enrichment_profile,
+ kwargs,
+ extra_args=["window_size", "smooth_window"],
)
- self.assertOutputFileExists(rolling_avg_file)
-
- # TODO: It's difficult to get the output file name. Find a better way to do this check.
- n_bed_files = len(list(self.outDir.glob("*.bed")))
- self.assertEqual(n_bed_files, 2)
-
-
-class TestPlotEnrichment(DiMeLoTestCase):
- def test_plot_enrichment_2_bams(self):
- """Tests enrichment comparison for the same region over two different bam files."""
- bam_idx = 0
-
- # Inputs
- bam_file = input_bams[bam_idx]
-
- # Outputs
- db_file = output_dbs[bam_idx]
- pdf_file = f"region_{input_bed.stem}_CG_enrichment_barplot.pdf"
+ )
+ for motif in kwargs["motifs"]:
+ regions_list = (
+ kwargs["regions"]
+ if isinstance(kwargs["regions"], list)
+ else [kwargs["regions"]]
+ )
+ kwargs_plot_enrichment_profile_plot_enrichment_profile["motifs"] = [
+ motif for _ in regions_list
+ ]
+ ax = dm.plot_enrichment_profile.plot_enrichment_profile(
+ mod_file_names=[results["pileup"][0] for _ in regions_list],
+ regions_list=regions_list,
+ sample_names=["label" for _ in regions_list],
+ **kwargs_plot_enrichment_profile_plot_enrichment_profile,
+ )
+ assert isinstance(
+ ax, Axes
+ ), f"{test_case}: plotting failed for {motif}."
+ else:
+ print(
+ f"{test_case} skipped for plot_enrichment_profile.plot_enrichment_profile."
+ )
- dm.plot_enrichment(
- fileNames=[str(bam_file), str(bam_file)],
- sampleNames=input_sample_names,
- bedFiles=str(input_bed),
- basemod="CG",
- outDir=str(self.outDir),
- threshC=129,
+ def test_unit__plot_enrichment_profile_by_regions(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["pileup"][0] is not None:
+ kwargs_plot_enrichment_profile_by_regions = filter_kwargs_for_func(
+ dm.plot_enrichment_profile.by_regions,
+ kwargs,
+ extra_args=["window_size", "smooth_window"],
+ )
+ for motif in kwargs["motifs"]:
+ regions_list = (
+ kwargs["regions"]
+ if isinstance(kwargs["regions"], list)
+ else [kwargs["regions"]]
+ )
+ ax = dm.plot_enrichment_profile.by_regions(
+ mod_file_name=results["pileup"][0],
+ regions_list=regions_list,
+ motif=motif,
+ sample_names=["label" for _ in regions_list],
+ **kwargs_plot_enrichment_profile_by_regions,
+ )
+ assert isinstance(
+ ax, Axes
+ ), f"{test_case}: plotting failed for {motif}."
+ else:
+ print(f"{test_case} skipped for plot_enrichment_profile.by_regions.")
+
+ def test_unit__plot_enrichment_profile_by_modification(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["pileup"][0] is not None:
+ kwargs_plot_enrichment_profile_by_modification = filter_kwargs_for_func(
+ dm.plot_enrichment_profile.by_modification,
+ kwargs,
+ extra_args=["window_size", "smooth_window"],
+ )
+ ax = dm.plot_enrichment_profile.by_modification(
+ mod_file_name=results["pileup"][0],
+ **kwargs_plot_enrichment_profile_by_modification,
+ )
+ assert isinstance(ax, Axes), f"{test_case}: plotting failed."
+ else:
+ print(f"{test_case} skipped for plot_enrichment_profile.by_modification.")
+
+ def test_unit__plot_enrichment_by_dataset(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["pileup"][0] is not None:
+ kwargs_plot_enrichment_profile_by_dataset = filter_kwargs_for_func(
+ dm.plot_enrichment_profile.by_dataset,
+ kwargs,
+ extra_args=["window_size", "smooth_window"],
+ )
+ for motif in kwargs["motifs"]:
+ ax = dm.plot_enrichment_profile.by_dataset(
+ mod_file_names=[results["pileup"][0]],
+ motif=motif,
+ **kwargs_plot_enrichment_profile_by_dataset,
+ )
+ assert isinstance(ax, Axes), f"{test_case}: plotting failed."
+ else:
+ print(f"{test_case} skipped for plot_enrichment_profile.by_dataset.")
+
+
+class TestPlotDepthProfileSynthetic:
+ """
+ Tests plotting functionality in plot_depth_profile.
+
+ This test simply checks that we can make plots from synthetic data without raising errors.
+ Appearance of plots is not verified.
+ """
+
+ def test_unit__plot_depth_profile_plot_depth_profile_synthetic(self):
+ ax = dm.plot_depth_profile.plot_depth_profile(
+ mod_file_names=["test1.fake", "test2.fake"],
+ regions_list=["test1.bed", "test2.bed"],
+ motifs=["A", "C"],
+ window_size=500,
+ sample_names=["sample1", "sample2"],
+ smooth_window=50,
)
-
- self.assertOutputFileExists(db_file)
- self.assertOutputFileExists(pdf_file)
-
- def test_plot_enrichment_2_beds(self):
- """Tests enrichment comparison for two different regions over the same bam file"""
- bam_idx = 0
-
- # Inputs
- bam_file = input_bams[bam_idx]
-
- # Outputs
- db_file = output_dbs[bam_idx]
- pdf_file = f"sample_{bam_file.stem}_CG_enrichment_barplot.pdf"
-
- dm.plot_enrichment(
- fileNames=str(bam_file),
- sampleNames=input_sample_names,
- bedFiles=[str(input_bed), str(input_bed)],
- basemod="CG",
- outDir=str(self.outDir),
- threshC=129,
+ assert isinstance(ax, Axes)
+
+ def test_unit__plot_depth_profile_by_modification_synthetic(self):
+ ax = dm.plot_depth_profile.by_modification(
+ mod_file_name="test.fake",
+ regions="test.bed",
+ window_size=500,
+ motifs=["A", "C"],
+ smooth_window=50,
)
-
- self.assertOutputFileExists(db_file)
- self.assertOutputFileExists(pdf_file)
-
- def test_plot_enrichment_incompatible_bed_bam(self):
- """Verifies that passing equal numbers of beds and bams remains an error."""
- bam_idx = 0
-
- # Inputs
- bam_file = input_bams[bam_idx]
-
- with self.assertRaises(RuntimeError):
- dm.plot_enrichment(
- fileNames=[str(bam_file), str(bam_file)],
- sampleNames=input_sample_names,
- bedFiles=[str(input_bed), str(input_bed)],
- basemod="CG",
- outDir=str(self.outDir),
- )
-
- def test_plot_enrichment_incompatible_basemod(self):
- """Verifies that plot_enrichment remains incompatible with multi-basemod options."""
- bam_idx = 0
-
- # Inputs
- bam_file = input_bams[bam_idx]
-
- with self.assertRaises(RuntimeError):
- dm.plot_enrichment(
- fileNames=[str(bam_file), str(bam_file)],
- sampleNames=input_sample_names,
- bedFiles=str(input_bed),
- basemod="A+CG",
- outDir=str(self.outDir),
- )
-
-
-class TestPlotEnrichmentProfile(DiMeLoTestCase):
- def test_plot_enrichment_profile_single(self):
- """Tests profile plotting for a single sample and region."""
- bam_idx = 0
-
- # Inputs
- bam_file = input_bams[bam_idx]
- sample_name = input_sample_names[bam_idx]
-
- # Outputs
- db_file = output_dbs[bam_idx]
- enrichment_plot = f"{sample_name}_A+CG_sm_rolling_avg.pdf"
- single_molecule_plot = f"{sample_name}_A+CG_sm_scatter.png"
- base_count_plots = [
- f"{sample_name}_{basemod}_base_count.png"
- for basemod in ["A", "CG"]
- ]
-
- dm.plot_enrichment_profile(
- fileNames=str(bam_file),
- sampleNames=sample_name,
- bedFiles=str(input_bed),
- basemod="A+CG",
- outDir=str(self.outDir),
- windowSize=500,
- dotsize=1,
+ assert isinstance(ax, Axes)
+
+ def test_unit__plot_depth_profile_by_region_synthetic(self):
+ ax = dm.plot_depth_profile.by_regions(
+ mod_file_name="test.fake",
+ regions_list=["test1.bed", "test2.bed"],
+ motif="A",
+ window_size=500,
+ sample_names=["on target", "off target"],
+ smooth_window=50,
)
-
- self.assertOutputFileExists(db_file)
- self.assertOutputFileExists(enrichment_plot)
- self.assertOutputFileExists(single_molecule_plot)
- for f in base_count_plots:
- self.assertOutputFileExists(f)
-
- def test_plot_enrichment_profile_sample_overlay(self):
- """Tests profile plotting for multiple regions over a single sample."""
- bam_idx = 0
-
- # Inputs
- bam_file = input_bams[bam_idx]
-
- # Outputs
- db_file = output_dbs[bam_idx]
- overlay_plot = f"sample_{bam_file.stem}_A_sm_rolling_avg_overlay.pdf"
-
- dm.plot_enrichment_profile(
- fileNames=str(bam_file),
- sampleNames=input_sample_names,
- bedFiles=[str(input_bed), str(input_bed)],
- basemod="A",
- outDir=str(self.outDir),
- windowSize=500,
- dotsize=1,
+ assert isinstance(ax, Axes)
+
+ def test_unit__plot_depth_profile_by_dataset_synthetic(self):
+ ax = dm.plot_depth_profile.by_dataset(
+ mod_file_names=["test1.fake", "test2.fake"],
+ regions="test.bed",
+ motif="A",
+ window_size=500,
+ sample_names=["experiment 1", "experiment 2"],
+ smooth_window=50,
)
-
- self.assertOutputFileExists(db_file)
- self.assertOutputFileExists(overlay_plot)
-
- def test_plot_enrichment_profile_region_overlay(self):
- """Tests profile plotting for multiple samples over a single region."""
- # Outputs
- overlay_plot = f"region_{input_bed.stem}_A_sm_rolling_avg_overlay.pdf"
-
- dm.plot_enrichment_profile(
- fileNames=[str(f) for f in input_bams],
- sampleNames=input_sample_names,
- bedFiles=str(input_bed),
- basemod="A",
- outDir=str(self.outDir),
- windowSize=500,
- dotsize=1,
+ assert isinstance(ax, Axes)
+
+
+@pytest.mark.parametrize(
+ "test_case,kwargs,results",
+ [(case, inputs, outputs) for case, (inputs, outputs) in test_matrix.items()],
+)
+class TestPlotDepthProfile:
+ def test_unit__plot_depth_profile_plot_depth_profile(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["pileup"][0] is not None:
+ kwargs_plot_depth_profile_plot_depth_profile = filter_kwargs_for_func(
+ dm.plot_depth_profile.plot_depth_profile,
+ kwargs,
+ extra_args=["window_size", "smooth_window"],
+ )
+ for motif in kwargs["motifs"]:
+ regions_list = (
+ kwargs["regions"]
+ if isinstance(kwargs["regions"], list)
+ else [kwargs["regions"]]
+ )
+ kwargs_plot_depth_profile_plot_depth_profile["motifs"] = [
+ motif for _ in regions_list
+ ]
+ ax = dm.plot_depth_profile.plot_depth_profile(
+ mod_file_names=[results["pileup"][0] for _ in regions_list],
+ regions_list=regions_list,
+ sample_names=["label" for _ in regions_list],
+ **kwargs_plot_depth_profile_plot_depth_profile,
+ )
+ assert isinstance(
+ ax, Axes
+ ), f"{test_case}: plotting failed for {motif}."
+ else:
+ print(f"{test_case} skipped for plot_depth_profile.plot_depth_profile.")
+
+ def test_unit__plot_depth_profile_by_regions(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["pileup"][0] is not None:
+ kwargs_plot_depth_profile_by_regions = filter_kwargs_for_func(
+ dm.plot_depth_profile.by_regions,
+ kwargs,
+ extra_args=["window_size", "smooth_window"],
+ )
+ for motif in kwargs["motifs"]:
+ regions_list = (
+ kwargs["regions"]
+ if isinstance(kwargs["regions"], list)
+ else [kwargs["regions"]]
+ )
+ ax = dm.plot_depth_profile.by_regions(
+ mod_file_name=results["pileup"][0],
+ regions_list=regions_list,
+ motif=motif,
+ sample_names=["label" for _ in regions_list],
+ **kwargs_plot_depth_profile_by_regions,
+ )
+ assert isinstance(
+ ax, Axes
+ ), f"{test_case}: plotting failed for {motif}."
+ else:
+ print(f"{test_case} skipped for plot_depth_profile.by_regions.")
+
+ def test_unit__plot_depth_profile_by_modification(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["pileup"][0] is not None:
+ kwargs_plot_depth_profile_by_modification = filter_kwargs_for_func(
+ dm.plot_depth_profile.by_modification,
+ kwargs,
+ extra_args=["window_size", "smooth_window"],
+ )
+ ax = dm.plot_depth_profile.by_modification(
+ mod_file_name=results["pileup"][0],
+ **kwargs_plot_depth_profile_by_modification,
+ )
+ assert isinstance(ax, Axes), f"{test_case}: plotting failed."
+ else:
+ print(f"{test_case} skipped for plot_depth_profile.by_modification.")
+
+ def test_unit__plot_depth_by_dataset(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["pileup"][0] is not None:
+ kwargs_plot_depth_profile_by_dataset = filter_kwargs_for_func(
+ dm.plot_depth_profile.by_dataset,
+ kwargs,
+ extra_args=["window_size", "smooth_window"],
+ )
+ for motif in kwargs["motifs"]:
+ ax = dm.plot_depth_profile.by_dataset(
+ mod_file_names=[results["pileup"][0]],
+ motif=motif,
+ **kwargs_plot_depth_profile_by_dataset,
+ )
+ assert isinstance(ax, Axes), f"{test_case}: plotting failed."
+ else:
+ print(f"{test_case} skipped for plot_depth_profile.by_dataset.")
+
+
+class TestPlotDepthHistogramSynthetic:
+ """
+ Tests plotting functionality in plot_depth_histogram.
+
+ This test simply checks that we can make plots from synthetic data without raising errors.
+ Appearance of plots is not verified.
+ """
+
+ def test_unit__plot_depth_histogram_plot_depth_histogram_synthetic(self):
+ ax = dm.plot_depth_histogram.plot_depth_histogram(
+ mod_file_names=["test1.fake", "test2.fake"],
+ regions_list=["test1.bed", "test2.bed"],
+ motifs=["A", "C"],
+ window_size=500,
+ sample_names=["sample1", "sample2"],
)
-
- for db_file in output_dbs:
- self.assertOutputFileExists(db_file)
- self.assertOutputFileExists(overlay_plot)
-
- def test_plot_enrichment_profile_overlay_incompatible_basemod(self):
- """Verifies that plot_enrichment_profile remains incompatible with multi-basemod options."""
- with self.assertRaises(RuntimeError):
- dm.plot_enrichment_profile(
- fileNames=[str(f) for f in input_bams],
- sampleNames=input_sample_names,
- bedFiles=str(input_bed),
- basemod="A+CG",
- outDir=str(self.outDir),
+ assert isinstance(ax, Axes)
+
+ def test_unit__plot_depth_histogram_by_modification_synthetic(self):
+ ax = dm.plot_depth_histogram.by_modification(
+ mod_file_name="test.fake",
+ regions="test.bed",
+ window_size=500,
+ motifs=["A", "C"],
+ )
+ assert isinstance(ax, Axes)
+
+ def test_unit__plot_depth_histogram_by_region_synthetic(self):
+ ax = dm.plot_depth_histogram.by_regions(
+ mod_file_name="test.fake",
+ regions_list=["test1.bed", "test2.bed"],
+ motif="A",
+ window_size=500,
+ sample_names=["on target", "off target"],
+ )
+ assert isinstance(ax, Axes)
+
+ def test_unit__plot_depth_histogram_by_dataset_synthetic(self):
+ ax = dm.plot_depth_histogram.by_dataset(
+ mod_file_names=["test1.fake", "test2.fake"],
+ regions="test.bed",
+ motif="A",
+ window_size=500,
+ sample_names=["experiment 1", "experiment 2"],
+ )
+ assert isinstance(ax, Axes)
+
+
+@pytest.mark.parametrize(
+ "test_case,kwargs,results",
+ [(case, inputs, outputs) for case, (inputs, outputs) in test_matrix.items()],
+)
+class TestPlotDepthHistogram:
+ def test_unit__plot_depth_histogram_plot_depth_histogram(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["pileup"][0] is not None:
+ kwargs_plot_depth_histogram_plot_depth_histogram = filter_kwargs_for_func(
+ dm.plot_depth_histogram.plot_depth_histogram,
+ kwargs,
+ extra_args=["window_size"],
)
-
- def test_plot_enrichment_profile_overlay_incompatible_bed_bam(self):
- """Verifies that passing equal numbers of beds and bams remains an error."""
- with self.assertRaises(RuntimeError):
- dm.plot_enrichment_profile(
- fileNames=[str(f) for f in input_bams],
- sampleNames=input_sample_names,
- bedFiles=[str(input_bed), str(input_bed)],
- basemod="A",
- outDir=str(self.outDir),
+ for motif in kwargs["motifs"]:
+ regions_list = (
+ kwargs["regions"]
+ if isinstance(kwargs["regions"], list)
+ else [kwargs["regions"]]
+ )
+ kwargs_plot_depth_histogram_plot_depth_histogram["motifs"] = [
+ motif for _ in regions_list
+ ]
+ ax = dm.plot_depth_histogram.plot_depth_histogram(
+ mod_file_names=[results["pileup"][0] for _ in regions_list],
+ regions_list=regions_list,
+ sample_names=["label" for _ in regions_list],
+ **kwargs_plot_depth_histogram_plot_depth_histogram,
+ )
+ assert isinstance(
+ ax, Axes
+ ), f"{test_case}: plotting failed for {motif}."
+ else:
+ print(f"{test_case} skipped for plot_depth_histogram.plot_depth_histogram.")
+
+ def test_unit__plot_depth_histogram_by_regions(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["pileup"][0] is not None:
+ kwargs_plot_depth_histogram_by_regions = filter_kwargs_for_func(
+ dm.plot_depth_histogram.by_regions,
+ kwargs,
+ extra_args=["window_size"],
+ )
+ for motif in kwargs["motifs"]:
+ regions_list = (
+ kwargs["regions"]
+ if isinstance(kwargs["regions"], list)
+ else [kwargs["regions"]]
+ )
+ ax = dm.plot_depth_histogram.by_regions(
+ mod_file_name=results["pileup"][0],
+ regions_list=regions_list,
+ motif=motif,
+ sample_names=["label" for _ in regions_list],
+ **kwargs_plot_depth_histogram_by_regions,
+ )
+ assert isinstance(
+ ax, Axes
+ ), f"{test_case}: plotting failed for {motif}."
+ else:
+ print(f"{test_case} skipped for plot_depth_histogram.by_regions.")
+
+ def test_unit__plot_depth_histogram_by_modification(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["pileup"][0] is not None:
+ kwargs_plot_depth_histogram_by_modification = filter_kwargs_for_func(
+ dm.plot_depth_histogram.by_modification,
+ kwargs,
+ extra_args=["window_size"],
)
+ ax = dm.plot_depth_histogram.by_modification(
+ mod_file_name=results["pileup"][0],
+ **kwargs_plot_depth_histogram_by_modification,
+ )
+ assert isinstance(ax, Axes), f"{test_case}: plotting failed."
+ else:
+ print(f"{test_case} skipped for plot_depth_histogram.by_modification.")
+
+ def test_unit__plot_depth_by_dataset(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["pileup"][0] is not None:
+ kwargs_plot_depth_histogram_by_dataset = filter_kwargs_for_func(
+ dm.plot_depth_histogram.by_dataset,
+ kwargs,
+ extra_args=["window_size"],
+ )
+ for motif in kwargs["motifs"]:
+ ax = dm.plot_depth_histogram.by_dataset(
+ mod_file_names=[results["pileup"][0]],
+ motif=motif,
+ **kwargs_plot_depth_histogram_by_dataset,
+ )
+ assert isinstance(ax, Axes), f"{test_case}: plotting failed."
+ else:
+ print(f"{test_case} skipped for plot_depth_histogram.by_dataset.")
+
+class TestPlotReadsSynthetic:
+ """
+ Tests plotting functionality in plot_reads.
-if __name__ == "__main__":
- unittest.main()
+ This test simply checks that we can make plots from synthetic data without raising errors.
+ Appearance of plots is not verified.
+ """
+
+ def test_unit__plot_reads_plot_reads_synthetic(self):
+ ax = dm.plot_reads.plot_reads(
+ mod_file_name="test.fake", regions="test.bed", motifs=["A,0", "CG,0"]
+ )
+ assert isinstance(ax, Axes)
+
+
+@pytest.mark.parametrize(
+ "test_case,kwargs,results",
+ [(case, inputs, outputs) for case, (inputs, outputs) in test_matrix.items()],
+)
+class TestPlotReads:
+ def test_unit__plot_reads_plot_reads(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["extract"][0] is not None:
+ kwargs_plot_reads_plot_reads = filter_kwargs_for_func(
+ dm.plot_reads.plot_reads, kwargs
+ )
+ if kwargs["thresh"] is not None:
+ ax = dm.plot_reads.plot_reads(
+ mod_file_name=results["extract"][0],
+ **kwargs_plot_reads_plot_reads,
+ )
+ assert isinstance(ax, Axes), f"{test_case}: plotting failed."
+ else: # if the extract parameters did not have a threshold, plot_reads.plot_reads should raise an error
+ with pytest.raises(ValueError) as excinfo:
+ ax = dm.plot_reads.plot_reads(
+ mod_file_name=results["extract"][0],
+ **kwargs_plot_reads_plot_reads,
+ )
+ assert "No threshold has been applied" in str(
+ excinfo.value
+ ), f"{test_case}: unexpected exception {excinfo.value}"
+ # providing a threshold should be enough to run plot_reads.plot_reads without an error
+ kwargs_plot_reads_plot_reads["thresh"] = 0.75
+ ax = dm.plot_reads.plot_reads(
+ mod_file_name=results["extract"][0],
+ **kwargs_plot_reads_plot_reads,
+ )
+ assert isinstance(ax, Axes), f"{test_case}: plotting failed."
+ else:
+ print(f"{test_case} skipped for test_unit__plot_reads_plot_reads.")
+
+
+@pytest.mark.parametrize(
+ "test_case,kwargs,results",
+ [(case, inputs, outputs) for case, (inputs, outputs) in test_matrix.items()],
+)
+class TestPlotReadBrowser:
+ def test_unit__plot_read_browser(
+ self,
+ test_case,
+ kwargs,
+ results,
+ ):
+ if results["extract"][0] is not None:
+ kwargs_plot_read_browser = filter_kwargs_for_func(
+ dm.plot_read_browser.plot_read_browser, kwargs
+ )
+ if (
+ kwargs["thresh"] is None
+ and kwargs["thresh"] is None
+ and not isinstance(kwargs["regions"], list)
+ and Path(kwargs["regions"]).suffix != ".bed"
+ ):
+ fig = dm.plot_read_browser.plot_read_browser(
+ mod_file_name=results["extract"][0],
+ region=kwargs["regions"],
+ **kwargs_plot_read_browser,
+ )
+ assert isinstance(
+ fig, plotly.graph_objs.Figure
+ ), f"{test_case}: plotting failed."
+ else:
+ with pytest.raises(ValueError) as excinfo:
+ fig = dm.plot_read_browser.plot_read_browser(
+ mod_file_name=results["extract"][0],
+ region=kwargs["regions"],
+ **kwargs_plot_read_browser,
+ )
+ if (
+ isinstance(kwargs["regions"], list)
+ or Path(kwargs["regions"]).suffix == ".bed"
+ ) and kwargs["thresh"] is None:
+ assert (
+ "Invalid region" in str(excinfo.value)
+ ), f"{test_case}: unexpected exception for no-threshold bad-region case {excinfo.value}"
+ elif (
+ kwargs["thresh"] is not None
+ and not isinstance(kwargs["regions"], list)
+ and Path(kwargs["regions"]).suffix != ".bed"
+ ):
+ assert (
+ "A threshold has been applied" in str(excinfo.value)
+ ), f"{test_case}: unexpected exception thresholded valid-region case {excinfo.value}"
+ else:
+ assert (
+ "A threshold has been applied" in str(excinfo.value)
+ or "Invalid region" in str(excinfo.value)
+ ), f"{test_case}: unexpected exception thresholded bad-region case {excinfo.value}"
+
+ else:
+ print(f"{test_case} skipped for test_unit__plot_read_browser")
diff --git a/dimelo/test/generate_targets.py b/dimelo/test/generate_targets.py
new file mode 100644
index 00000000..50dc1411
--- /dev/null
+++ b/dimelo/test/generate_targets.py
@@ -0,0 +1,205 @@
+import argparse
+import pickle
+from pathlib import Path
+
+from cases import test_matrix
+from tqdm.auto import tqdm
+
+from dimelo import load_processed, parse_bam
+from dimelo.test import DiMeLoParsingTestCase, RelativePath, filter_kwargs_for_func
+
+ref_genome_file = Path(RelativePath("./output/chm13.draft_v1.0.fasta"))
+# Base input and output directories
+test_data_dir = Path(RelativePath("./data"))
+output_dir = test_data_dir / "test_targets"
+
+output_dir.mkdir(exist_ok=True)
+
+
+def generate_pileup(test_matrix, case_subset):
+ print("Generating pileup files...")
+ for case in case_subset if case_subset is not None else test_matrix.keys():
+ kwargs, results = test_matrix[case]
+ kwargs_pileup = filter_kwargs_for_func(parse_bam.pileup, kwargs)
+ pileup_file, pileup_regions = parse_bam.pileup(
+ **kwargs_pileup,
+ ref_genome=ref_genome_file,
+ )
+ results["pileup"] = (
+ RelativePath(pileup_file),
+ RelativePath(pileup_regions),
+ )
+
+
+def generate_extract(test_matrix, case_subset):
+ print("Generating extract files...")
+ for case in case_subset if case_subset is not None else test_matrix.keys():
+ kwargs, results = test_matrix[case]
+ kwargs_extract = filter_kwargs_for_func(parse_bam.extract, kwargs)
+ if "cores" in kwargs_extract:
+ del kwargs_extract["cores"]
+ extract_file, extract_regions = parse_bam.extract(
+ **kwargs_extract,
+ ref_genome=ref_genome_file,
+ cores=1,
+ )
+ results["extract"] = (
+ RelativePath(extract_file),
+ RelativePath(extract_regions),
+ )
+
+
+def generate_pileup_counts_from_bedmethyl(test_matrix, case_subset):
+ for case in tqdm(
+ case_subset if case_subset is not None else test_matrix.keys(),
+ desc="Generating pileup counts",
+ ):
+ kwargs, results = test_matrix[case]
+ results["pileup_counts_from_bedmethyl"] = {}
+ kwargs_func = filter_kwargs_for_func(
+ load_processed.pileup_counts_from_bedmethyl, kwargs
+ )
+ for motif in kwargs["motifs"]:
+ results["pileup_counts_from_bedmethyl"][motif] = (
+ load_processed.pileup_counts_from_bedmethyl(
+ bedmethyl_file=results["pileup"][0],
+ **kwargs_func,
+ motif=motif,
+ )
+ )
+
+
+def generate_pileup_vectors_from_bedmethyl(test_matrix, case_subset):
+ for case in tqdm(
+ case_subset if case_subset is not None else test_matrix.keys(),
+ desc="Generating pileup vectors",
+ ):
+ kwargs, results = test_matrix[case]
+ results["pileup_vectors_from_bedmethyl"] = {}
+ kwargs_func = filter_kwargs_for_func(
+ load_processed.pileup_vectors_from_bedmethyl, kwargs
+ )
+ for motif in kwargs["motifs"]:
+ results["pileup_vectors_from_bedmethyl"][motif] = (
+ load_processed.pileup_vectors_from_bedmethyl(
+ bedmethyl_file=results["pileup"][0],
+ **kwargs_func,
+ motif=motif,
+ )
+ )
+
+
+def generate_read_vectors_from_hdf5(test_matrix, case_subset):
+ for case in tqdm(
+ case_subset if case_subset is not None else test_matrix.keys(),
+ desc="Generating read vectors",
+ ):
+ kwargs, results = test_matrix[case]
+ extract_file, regions_bed = results["extract"]
+ if extract_file is not None and regions_bed is not None:
+ kwargs_func = filter_kwargs_for_func(
+ load_processed.read_vectors_from_hdf5, kwargs
+ )
+ read_data_list, datasets, _ = load_processed.read_vectors_from_hdf5(
+ file=extract_file,
+ **kwargs_func,
+ )
+ read_data_dict = {}
+ # Pull out the data from the first read
+ for idx, dataset in enumerate(datasets):
+ for read_data in read_data_list:
+ read_data_dict[dataset] = read_data[idx]
+ break
+ results["read_vectors_from_hdf5"] = read_data_dict
+
+
+def main(test_matrix):
+ """
+ The main function runs applicable generators based on the test_matrix defined in cases.py and kwargs.
+
+ Args:
+ test_matrix: the dict containing test cases. This will be modified in-place to contain existing targets
+ if generators won't be getting re-run, based on the command line parsed arguments described below.
+ """
+ # Set up input files, including ref genome download
+ DiMeLoParsingTestCase.setup_class()
+
+ parser = argparse.ArgumentParser(
+ description="Generate target data from test cases."
+ )
+
+ valid_subsets = [
+ "pileup",
+ "extract",
+ "pileup_counts_from_bedmethyl",
+ "pileup_vectors_from_bedmethyl",
+ "read_vectors_from_hdf5",
+ ]
+
+ parser.add_argument(
+ "--target-subset",
+ nargs="*",
+ help=f"""Specify one or more subsets of test targets, separated by spaces (default: all).
+ The following are valid options: {valid_subsets}""",
+ )
+
+ parser.add_argument(
+ "--case-subset",
+ nargs="*",
+ help=f"""Specify one or more subsets of cases, separated by spaces (default: all).
+ The following are valid options based on your current cases.py file: {test_matrix.keys()}""",
+ )
+
+ parser.add_argument(
+ "--initial-target-pickle",
+ type=str,
+ default=RelativePath("data/test_targets/test_matrix.pickle"),
+ help="A test target pickle to start with, when updating only a subset of cases.",
+ )
+
+ args = parser.parse_args()
+
+ if args.target_subset is None:
+ args.target_subset = valid_subsets
+ print(
+ f"Running {'all cases' if args.case_subset is None else args.case_subset} through all target generators from scratch based on test_cases.py"
+ )
+ else:
+ if Path(args.initial_target_pickle).exists():
+ with open(RelativePath(args.initial_target_pickle), "rb") as file:
+ old_test_matrix = pickle.load(file)
+ # loop through the old test matrix
+ for key, (old_kwargs, old_results) in old_test_matrix.items():
+ # if the old test case is in the new test matrix, bring over the results
+ # either the test targets will be regenerated, and the results replaced, or they won't be regenerated, and we'll need the old results
+ if key in test_matrix:
+ new_kwargs = test_matrix[key][0]
+ # if case will be covered, bring in new kwargs
+ if args.case_subset is None or key in args.case_subset:
+ test_matrix[key] = (new_kwargs, old_results)
+ # if case will not be covered, keep old kwargs
+ else:
+ test_matrix[key] = (old_kwargs, old_results)
+ print(
+ f"Running {'all cases' if args.case_subset is None else args.case_subset} through {args.target_subset} to supplement test targets from {args.initial_target_pickle}. Any new tests in cases.py will be added to the test matrix."
+ )
+ else:
+ raise ValueError(
+ f"Cannot run subset {args.target_subset} without a pre-existing complete set of generated targets, {args.initial_target_pickle} does not exist. Either specify an --initial-target-pickle that does exist or run without subsetting."
+ )
+
+ print("Generating targets for the following test matrix kwargs")
+ for test_name, (kwargs, _) in test_matrix.items():
+ print(test_name)
+ print(kwargs)
+
+ for subset in args.target_subset:
+ function_name = f"generate_{subset}"
+ globals()[function_name](test_matrix, args.case_subset)
+
+ with open(RelativePath("./data/test_targets/test_matrix.pickle"), "wb") as file:
+ pickle.dump(test_matrix, file)
+
+
+if __name__ == "__main__":
+ main(test_matrix)
diff --git a/dimelo/test/helper/test_utils.py b/dimelo/test/helper/test_utils.py
deleted file mode 100644
index 69c3ebdb..00000000
--- a/dimelo/test/helper/test_utils.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# helper functions for dimelo_test.py
-
-from dimelo.parse_bam import parse_bam
-
-
-def create_methylation_objects():
-
- fileNames = [
- "dimelo/test/data/mod_mappings_subset.bam",
- "dimelo/test/data/winnowmap_guppy_merge_subset.bam",
- ]
- sampleName = "test"
- bedFile = "dimelo/test/data/test.bed"
- basemods = ["A", "CG", "A+CG"]
- centers = [True, False]
- windowSize = 1000
-
- all_data_combined = []
- for fileName in fileNames:
- for basemod in basemods:
- for center in centers:
- all_data = parse_bam(
- fileName,
- sampleName,
- bedFile,
- basemod,
- center,
- windowSize,
- )
- all_data_combined.append(all_data)
-
- # returned object is a list of 12 all_data objects
- # 0. mod_mappings A center=True
- # 1. mod_mappings A center=False
- # 2. mod_mappings CG center=True
- # 3. mod_mappings CG center=False
- # 4. mod_mappings A+CG center=True
- # 5. mod_mappings A+CG center=False
- # 6. winnow_guppy A center=True
- # 7. winnow_guppy A center=False
- # 8. winnow_guppy CG center=True
- # 9. winnow_guppy CG center=False
- # 10. winnow_guppy A+CG center=True
- # 11. winnow_guppy A+CG center=False
- return all_data_combined
-
-
-def extract_methylation_data_subset(data, index, read_name):
- # returns a dataframe subset for given methylation data and read name
- return data[index][data[index]["read_name"] == read_name]
diff --git a/dimelo/test_data.py b/dimelo/test_data.py
new file mode 100644
index 00000000..3fec3fd0
--- /dev/null
+++ b/dimelo/test_data.py
@@ -0,0 +1,123 @@
+import numpy as np
+
+from . import utils
+
+
+def expspace_prob(num: int, a: float, b: float = 1) -> np.ndarray:
+ """
+ Return probability values spaced over the interval 0 to b along an exponential curve.
+
+ Calculated as y = ((a^x - 1) / (a - 1)) * b, a > 1, 0 < b <= 1
+
+ Args:
+ num: total length of space to return; same as num argument to np.linspace
+ a: controls the depth of the curve; higher values result in a longer wait before going to 1. Must be >1.
+ b: controls the max value of the curve. Must be between (0, 1].
+
+ Returns:
+ Array of probability values betweeen 0 and b, spaced along an exponential curve
+ """
+ if a <= 1:
+ raise ValueError("Value of a must be > 1.")
+ if b > 1:
+ raise ValueError("Value of b must be between (0, 1].")
+ return (np.power(a, np.linspace(start=0, stop=1, num=num)) - 1) / (a - 1) * b
+
+
+def fake_read_mod_calls(halfsize: int, read_type: str, max_prob: float) -> np.ndarray:
+ """
+ Generates a read of the given size with modifications; returns 0 where there is no mod, 1 where there is a mod.
+
+ TODO: More realistic read varieties
+ TODO: Fewer magic numbers
+
+ Args:
+ halfsize: specifies length of output trace; final length will be 2*halfsize
+ read_type: string name of desired read type; see match statement for available types
+ max_prob: maximum probability of any single position being called as modified
+
+ Returns:
+ Array of 0s and 1s, patterned appropriately
+ """
+ # Set the vector of p-vals for the bernoulli distribution pulls based on the requested read type
+ match read_type:
+ case "peak":
+ # higher chance of mod at center of read
+ p_vec = expspace_prob(num=halfsize, a=15, b=max_prob)
+ case "uniform":
+ # uniform low chance of mod across entire read
+ p_vec = [0.05] * halfsize
+ case "inverse_peak":
+ # higher chance of mod at edges of read
+ p_vec = np.flip(expspace_prob(num=halfsize, a=15, b=max_prob))
+ case _:
+ ValueError(f"Unknown read type {read_type}")
+ first_half = [utils.rng.binomial(n=1, p=x) for x in p_vec]
+ second_half = np.flip([utils.rng.binomial(n=1, p=x) for x in p_vec])
+ return np.concatenate([first_half, second_half])
+
+
+def fake_read_mod_positions(
+ halfsize: int, read_type: str, max_prob: float
+) -> np.ndarray:
+ """
+ Generates a read of the given size with modifications; returns positions where there is a modification.
+ Positions are relative to the center of the read.
+
+ See fake_read_mod_calls for details.
+
+ TODO: Should this be able to operate in a non-centered fashion?
+
+ Returns:
+ Vector of positions of modified bases
+ """
+ return (
+ np.flatnonzero(
+ fake_read_mod_calls(
+ halfsize=halfsize, read_type=read_type, max_prob=max_prob
+ )
+ )
+ - halfsize
+ )
+
+
+def fake_peak_enrichment_profile(
+ halfsize: int, peak_height: float, n_reads: int = 100
+) -> np.ndarray:
+ """
+ Generates a random fake peak, with measurements increasing in value up to the center point and decreasing after.
+ Returns enrichment values (fraction modified bases) at each position.
+
+ Args:
+ halfsize: specifies length of output trace; final length will be 2*halfsize
+ peak_height: max height of peak. Must be (0, 1].
+ n_reads: number of reads to generate
+
+ Returns:
+ Array of enrichment values between 0 and 1, peaking at the middle
+ """
+ reads = [fake_read_mod_calls(halfsize, "peak", peak_height) for _ in range(n_reads)]
+ modified_base_counts = np.sum(reads, axis=0)
+ modified_fractions = np.divide(modified_base_counts, n_reads)
+ return modified_fractions
+
+
+def fake_peak_enrichment(
+ halfsize: int, peak_height: float, n_reads: int = 100
+) -> tuple[int, int]:
+ """
+ Generates total modification enrichment counts for a set of fake peak reads spanning some region.
+ Returns enrichment values (fraction modified bases) summed across the entire region.
+
+ Args:
+ halfsize: specifies length of output trace; final length will be 2*halfsize
+ peak_height: max height of peak. Must be (0, 1].
+ n_reads: number of reads to generate
+
+ Returns:
+ tuple containing counts of (modified bases, total_bases)
+ """
+ reads = [fake_read_mod_calls(halfsize, "peak", peak_height) for _ in range(n_reads)]
+ modified_bases = np.sum(reads)
+ total_bases = np.sum(np.fromiter((len(read) for read in reads), dtype=int))
+ return (modified_bases, total_bases)
diff --git a/dimelo/utils.py b/dimelo/utils.py
index 2ac7ab57..e4ad62ba 100644
--- a/dimelo/utils.py
+++ b/dimelo/utils.py
@@ -1,67 +1,491 @@
-r"""
-=================================================
-Functions for sqlite db
-=================================================
-"""
+import multiprocessing
+from collections import defaultdict
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from matplotlib.axes import Axes
-import os
-import sqlite3
+# This provides the mapping of canonical bases to sets of valid mode names
+# It is a defaultdict because any bases without a default should still
+# have valid entries, but they should be empty sets
+BASEMOD_NAMES_DICT = defaultdict(lambda: set())
+BASEMOD_NAMES_DICT.update(
+ {
+ "A": {"a", "Y"},
+ "C": {"m", "Z"},
+ }
+)
+# Default colors for seaborn plots
+DEFAULT_COLORS = defaultdict(lambda: "grey")
+DEFAULT_COLORS.update(
+ {
+ "A,0": "blue",
+ "A,0,a": "blue",
+ "CG,0": "orange",
+ "CG,0,m": "yellow",
+ "CG,0,h": "red",
+ "GCH,1": "purple",
+ }
+)
+# Default colorscales for plotly; based off of DEFAULT_COLORS
+DEFAULT_COLORSCALES = defaultdict(lambda: ["white", "grey"])
+DEFAULT_COLORSCALES.update([(k, ["white", v]) for k, v in DEFAULT_COLORS.items()])
-def clear_db(database_name):
- if os.path.exists(database_name):
- os.remove(database_name)
- if os.path.exists(database_name + "-journal"):
- os.remove(database_name + "-journal")
+# Define the source of randomness for a variety of purposes throughout the package
+# TODO: how best to initialize seed for random state, to allow for reproducibility?
+rng = np.random.default_rng()
-def create_sql_table(database_name, table_name, cols, d_types):
- conn = sqlite3.connect(database_name)
- c = conn.cursor()
- s = ""
- for i in range(len(cols)):
- if i == 0:
- s = s + cols[i] + " " + d_types[i] + " " + "PRIMARY KEY, "
- elif i == len(cols) - 1:
- s = s + cols[i] + " " + d_types[i]
+
+def cores_to_run(cores):
+ cores_avail = multiprocessing.cpu_count()
+ if cores is None or cores > cores_avail:
+ return cores_avail
+ else:
+ return cores
+
+
+class ParsedMotif:
+ def __init__(self, motif_string):
+ """
+ ParsedMotif takes in a base modification specifier motif, e.g. CG,0 or CG,0,m,
+ and parses it for easy use later.
+
+ args:
+ -motif_string: a specifier string containing sequence_context,mod_coord(,optional mod_code)
+ """
+ parts = motif_string.split(",")
+ if len(parts) == 2:
+ # If a mod code isn't specified, we use the default set
+ self.motif_seq = parts[0]
+ self.modified_pos = int(parts[1])
+ if self.modified_pos >= len(self.motif_seq):
+ raise ValueError(f"Motif {motif_string} has an out-of-range mod index.")
+ self.modified_base = self.motif_seq[self.modified_pos]
+ self.mod_codes = BASEMOD_NAMES_DICT[self.modified_base]
+ elif len(parts) == 3:
+ # If a mod code is specified, that will be the only one we look for
+ self.motif_seq = parts[0]
+ self.modified_pos = int(parts[1])
+ if self.modified_pos >= len(self.motif_seq):
+ raise ValueError(f"Motif {motif_string} has an out-of-range mod index.")
+ self.modified_base = self.motif_seq[self.modified_pos]
+ self.mod_codes = set(parts[2])
+ else:
+ # Motifs need both a sequence and an index, separated by a comma
+ raise ValueError(
+ f"Motif {motif_string} must have 2 or 3 comma-separated elements: sequence, index, and (optionally) mod code."
+ )
+
+
+def adjust_threshold(
+ thresh,
+ quiet=True,
+):
+ if thresh > 0:
+ if thresh > 1:
+ if not quiet:
+ print(
+ f"Modification threshold of {thresh} assumed to be for range 0-255. {thresh}/255={thresh / 255} will be sent to modkit."
+ )
+ thresh_scaled = thresh / 255
else:
- s = s + cols[i] + " " + d_types[i] + "," + " "
- fs = "(" + s + ")"
- c.execute("""DROP TABLE IF EXISTS """ + table_name + """;""")
- c.execute("""CREATE TABLE """ + table_name + """ """ + fs + """;""")
- conn.commit()
- c.close()
-
-
-def execute_sql_command(
- command: str, database_name: str, values, conn=None
-) -> None:
- """
- Function to execute a SQL command from Python.
- Parameters
- ----------
- command: str
- SQL command (use strings with three quotes on each side
- so that it can be a multiline string
- database_name: str
- File name of the database (e.g, “my.db”)
- Returns
- -------
- No return, executes the command
- """
- # will create if not present
- if conn is None:
- conn = sqlite3.connect(database_name, timeout=60.0)
- c = conn.cursor()
- # c.execute('BEGIN TRANSACTION')
- if len(values) == 0:
- c.execute(command)
- elif type(values) == list:
- c.executemany(command, values)
+ if not quiet:
+ print(
+ f"Modification threshold of {thresh} will be treated as coming from range 0-1."
+ )
+ thresh_scaled = thresh
+
+ return thresh_scaled
+ return thresh
+
+
+def process_chunks_from_regions_dict(
+ regions_dict: dict,
+ chunk_size: int,
+):
+ """
+ returns: a list of chunk specifier dictionaries, which contain region and subregion information. The subregion start and end
+ are always within the region. This information is sufficient for a downstream process to operate on the subregion chunk while
+ knowing where it lies within the larger region.
+ """
+ chunk_list = []
+ for chromosome, region_list in regions_dict.items():
+ for start_coord, end_coord, strand in region_list:
+ for subregion_start in range(start_coord, end_coord, chunk_size):
+ subregion_end = min(end_coord, subregion_start + chunk_size)
+ chunk_list.append(
+ {
+ "chromosome": chromosome,
+ "region_start": start_coord,
+ "region_end": end_coord,
+ "subregion_start": subregion_start,
+ "subregion_end": subregion_end,
+ "strand": strand,
+ }
+ )
+ return chunk_list
+
+
+def regions_dict_from_input(
+ regions: str | Path | list[str | Path] | None = None,
+ window_size: int | None = None,
+) -> dict:
+ """
+ Create a dictionary defining every region provided in the regions input
+ key: chromosome strings
+ value: lists of (start,end,strand)
+
+ TODO: Maybe this dict object should be a more codified type
+ """
+ # TODO: Why is this declared out here, and not within add_region_to_dict? To my eye, that method should just return the fully-loaded dict.
+ # I don't think this approach works because add_region_to_dict can be called many times; the regions parameter can be a single bed path / string OR many in a list
+ regions_dict: defaultdict[str, list] = defaultdict(list)
+
+ if window_size is not None and window_size <= 0:
+ raise ValueError(
+ "Invalid window_size. To disable windowing, set window_size to None or do not pass a value (the default is None)."
+ )
+
+ if isinstance(regions, list):
+ for region in regions:
+ add_region_to_dict(region, window_size, regions_dict)
else:
- c.execute(command, values)
- # saves the changes
- conn.commit()
- c.close()
- # conn.close()
+ add_region_to_dict(regions, window_size, regions_dict)
+ for chrom in regions_dict:
+ regions_dict[chrom].sort(key=lambda x: x[0])
+
+ return regions_dict
+
+
+def add_region_to_dict(
+ region: str | Path,
+ window_size: int | None,
+ regions_dict: dict,
+):
+ # TODO: The flow of this is very confusing, creating mypy errors, and possibly creates actual errors.
+ # mypy error: dimelo/utils.py:110: error: Item "str" of "str | Path" has no attribute "name" [union-attr]
+ # Basically, this method is confusing because the string can be a pathlike or a region string.
+ # Find a different way to check whether the string is pathlike or a region string, coerce paths to Path objects, then clean up everything else.
+
+ # Added None as a window_size option, it was already handled below so was always a valid input
+
+ # We check whether the region is a path to a .bed file by seeing if, when coerced into a Path object, it has the suffix 'bed'
+ if Path(region).suffix == ".bed":
+ with open(region) as bed_regions:
+ for line_index, line in enumerate(bed_regions):
+ fields = line.split()
+ if len(fields) > 2:
+ # Per the bed spec, the 6th column is strand
+ # https://genome.ucsc.edu/FAQ/FAQformat.html
+ if len(fields) > 5:
+ chrom, start, end, strand = (
+ fields[0],
+ int(fields[1]),
+ int(fields[2]),
+ fields[5],
+ )
+ # If strand isn't in the bed file, we set to . (neither/both)
+ else:
+ chrom, start, end, strand = (
+ fields[0],
+ int(fields[1]),
+ int(fields[2]),
+ ".",
+ )
+ if window_size is None:
+ regions_dict[chrom].append((start, end, strand))
+ else:
+ center_coord = (start + end) // 2
+ regions_dict[chrom].append(
+ (
+ center_coord - window_size,
+ center_coord + window_size,
+ strand,
+ )
+ )
+ else:
+ raise ValueError(
+ f"Invalid bed format line {line_index} of {Path(region).name}"
+ )
+ # If the region is a path but *not* to a bed file, that isn't valid
+ elif isinstance(region, Path):
+ raise ValueError(
+ f"Path object {region} is not pointing to a .bed file. regions must be provided as paths to .bed files or as strings in the format chrX:XXX-XXX,strand."
+ )
+ # If the region is a string and doesn't convert to a path to a bed file, then it must be a region string else it cannot be parsed
+ elif (
+ isinstance(region, str)
+ and len(region.split(":")) == 2
+ and 2 <= len(region.split(":")[1].split("-")) <= 3
+ ):
+ chrom, (start, end, strand) = parse_region_string(
+ region=region, window_size=window_size
+ )
+ regions_dict[chrom].append((start, end, strand))
+ else:
+ raise ValueError(
+ f"Invalid regions {type(region)}: {region}. Please use the format chrX:XXX-XXX,strand."
+ )
+
+
+def parse_region_string(
+ region: str,
+ window_size: int | None,
+) -> tuple[str, tuple[int, int, str]]:
+ """
+ Parse a region specification string into its component parts.
+
+ Args:
+ region: a region string of the format chrX:XXX-XXX or chrX:XXX-XXX,strand (+/-/.)
+ window_size: if present, returns a window of this size around the center of the given region
+
+ Returns:
+ chromosome, (start_pos, end_pos, strand)
+ """
+ try:
+ # region strings can be either chrX:XXX-XXX or chrX:XXX-XXX,strand (+/-/.)
+ region_coords = region.split(",")
+ # The default strand is ., which is neither strand
+ strand = region_coords[1] if len(region_coords) > 1 else "."
+ chrom, coords = region_coords[0].split(":")
+ start, end = map(int, coords.split("-"))
+ if window_size is None:
+ return chrom, (start, end, strand)
+ else:
+ center_coord = (start + end) // 2
+ return chrom, (
+ center_coord - window_size,
+ center_coord + window_size,
+ strand,
+ )
+ except (ValueError, AttributeError) as err:
+ raise ValueError(
+ f"Invalid region string {region}. Region strings can be either chrX:XXX-XXX or chrX:XXX-XXX,strand (+/-/.)."
+ ) from err
+
+
+def bed_from_regions_dict(
+ regions_dict: dict,
+ save_bed_path: Path,
+):
+ with open(save_bed_path, "w") as processed_bed:
+ for chrom, regions_list in regions_dict.items():
+ for start, end, _ in regions_list:
+ bed_line = (
+ "\t".join([chrom, str(start), str(end), ".", ".", "."]) + "\n"
+ )
+ processed_bed.write(bed_line)
+
+
+def bedmethyl_to_bigwig(input_bedmethyl: str | Path, output_bigwig: str | Path):
+ return 0
+
+
+def sanitize_path_args(*args) -> tuple:
+ """
+ Coerce all given arguments to Path objects, leaving Nones as Nones.
+ """
+ return tuple(Path(f) if f is not None else f for f in args)
+
+
+def check_len_equal(*args: list) -> bool:
+ """
+ Checks whether all provided lists are the same length.
+ """
+ return all(len(x) == len(args[0]) for x in args)
+
+
+def bar_plot(categories: list[str], values: np.ndarray, y_label: str, **kwargs) -> Axes:
+ """
+ Utility for producing bar plots.
+
+ Args:
+ categories: parallel with values; bar labels
+ values: parallel with categories: bar heights
+ y_label: y-axis label
+ kwargs: other keyword parameters passed through to seaborn.barplot
+
+ Returns:
+ Axes object containing the plot
+ """
+ axes = sns.barplot(x=categories, y=values, hue=categories, **kwargs)
+ axes.set(ylabel=y_label)
+ return axes
+
+
+def line_plot(
+ indep_vector: np.ndarray,
+ indep_name: str,
+ dep_vectors: list[np.ndarray],
+ dep_names: list[str],
+ y_label: str,
+ **kwargs,
+) -> Axes:
+ """
+ Utility for producing overlayed line plots for data vectors with the same x-axis values.
+
+ Takes in one independent vector and arbitrarily many dependent vectors. Plots all dependent vectors on the same axes against the same dependent vector.
+ All vectors must be of equal length.
+
+ TODO: Right now, this always generates a legend with the title "variable". I could add a parameter to specify this (by passing the var_name argument to pd.DataFrame.melt), but then that percolates upwards to other methods. How to do this cleanly?
+
+ Args:
+ indep_vector: parallel with each entry in dep_vectors; independent variable values shared across each overlayed line
+ indep_name: name of independent variable; set as x axis label
+ dep_vectors: outer list parallel with dep_names; each inner vector parallel with indep_vector; dependent variable values for each overlayed line
+ dep_names: parallel with dep_vectors; names of each overlayed line; set as legend entries
+ y_label: y-axis label
+ kwargs: other keyword parameters passed through to seaborn.lineplot
+
+ Returns:
+ Axes object containing the plot
+
+ Raises:
+ ValueError: raised if any vectors are of unequal length
+ """
+ # construct dict of {vector_name: vector}, including the x vector using dict union operations
+ data_dict = {indep_name: indep_vector} | dict(zip(dep_names, dep_vectors))
+ # construct long-form data table for plotting
+ try:
+ data_table = pd.DataFrame(data_dict).melt(
+ id_vars=indep_name, value_name=y_label
+ )
+ except ValueError as e:
+ raise ValueError(
+ "All dependent and independent vectors must be the same length"
+ ) from e
+ # plot lines
+ return sns.lineplot(
+ data=data_table, x=indep_name, y=y_label, hue="variable", **kwargs
+ )
+
+
+def hist_plot(
+ value_vectors: list[np.ndarray],
+ value_names: list[str],
+ x_label: str,
+ y_label: str,
+ integer_values: bool = False,
+ **kwargs,
+) -> Axes:
+ """
+ Utility for producing overlayed histogram plots for data vectors containing values with some distribution.
+
+ Takes arbitrarily many counts vectors and plots on same histogram.
+
+ Args:
+ value_vectors: parallel with value_names; vectors of values to plot histograms of; each vector will be a separate overlayed histogram
+ value_names: parallel with value_vectors; names of each overlayed histogram; set as legend entries
+ x_label: name of distributed values; set as x axis label
+ y_label: y-axis label
+ integer_values: True if hist bins are only at integer values, meaning bins shouldn't be auto-determined
+ kwargs: other keyword parameters passed through to seaborn.histplot
+
+ Returns:
+ Axes object containing the plot
+
+ Raises:
+ ValueError: raised if any vectors are of unequal length
+ """
+ # Flatten the vectors and assign corresponding labels
+ data_dict = {
+ x_label: np.concatenate(value_vectors),
+ y_label: np.repeat(value_names, [len(vec) for vec in value_vectors]),
+ }
+
+ # Create DataFrame
+ data_table = pd.DataFrame(data_dict)
+ if integer_values:
+ # Warn user that passed bins are being overwritten
+ if "bins" in kwargs:
+ print("Warning: bin settings overwritten by defaults")
+ kwargs["bins"] = np.arange(
+ data_table[x_label].min() - 0.5, data_table[x_label].max() + 1.5, 1
+ )
+
+ # plot histogram
+ ax = sns.histplot(
+ data=data_table,
+ x=x_label,
+ hue=y_label,
+ multiple="dodge",
+ **kwargs,
+ )
+
+ ax.set_ylabel(y_label)
+
+ return ax
+
+
+def smooth_rolling_mean(
+ vector: np.ndarray[float], window: int, min_periods: int = 1
+) -> np.ndarray:
+ """
+ Smooths the given vector, using rolling centered windows of the given size.
+ See pandas rolling documentation for details; documentation for relevant arguments copied here.
+
+ Note: Because this operation is always centered, min_periods only has an effect if it is less than half of window size.
+
+ TODO: Is pandas the most efficient implementation for this?
+ TODO: Is it reasonable for min_periods to be default 1? That makes some sense for plotting, but might make analysis misleading in the future, compared to defaulting to window size.
+
+ Args:
+ vector: the vector of values to smooth
+ window: size of the moving window
+ min_periods: minimum number of observations in window to output a value; otherwise, result is np.nan
+
+ Returns:
+ Vector of smoothed values
+ """
+ return (
+ pd.Series(vector)
+ .rolling(window=window, min_periods=min_periods, center=True)
+ .mean()
+ .values
+ )
+
+
+def random_sample(
+ array: np.ndarray,
+ n: int | None = None,
+ frac: float | None = None,
+ replace: bool = False,
+ # shuffle: bool = True,
+):
+ """
+ Utility method for generating a random sample of the elements in an array. Always defaults to sampling along the first axis.
+ This means that for 2d arrays this method will return a subsample of the rows.
+
+ Handles n/frac specification like pandas.DataFrame.sample(): https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sample.html.
+
+ Args:
+ array: Array to sample from
+ n: Number of elements to return; mutually exclusive with frac
+ frac: Fraction of elements to return; mutually exclusive with n
+ replace: When True, sample with replacement
+
+ Return: the requested random subset of the given array
+
+ NOTE: Outputs are not guaranteed to be in the same order as the original array. If ordering is required to be preserved, re-sort after.
+
+ TODO: enable non-uniform weights? Seems like too much to me...
+ TODO: shuffled outputs originally broke h5 loading, so it was turned off. However, because of the instability of the sampling order,
+ this doesn't actually matter. However, it is being left off to prevent confusion for the end user.
+ """
+ size = pd.core.sample.process_sampling_size(n, frac, replace)
+ if size is None:
+ assert frac is not None
+ size = round(frac * len(array))
+ return rng.choice(
+ a=array,
+ size=size,
+ replace=replace,
+ p=None,
+ axis=0,
+ # shuffle=shuffle
+ shuffle=False,
+ )
diff --git a/docs/.nojekyll b/docs/.nojekyll
deleted file mode 100644
index e69de29b..00000000
diff --git a/docs/Makefile b/docs/Makefile
deleted file mode 100644
index c271f08f..00000000
--- a/docs/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line, and also
-# from the environment for the first two.
-SPHINXOPTS ?=
-SPHINXBUILD ?= sphinx-build
-SOURCEDIR = .
-BUILDDIR = .
-
-# Put it first so that "make" without argument is like "make help".
-help:
- @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
- @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/auto_examples/auto_examples_jupyter.zip b/docs/auto_examples/auto_examples_jupyter.zip
deleted file mode 100644
index d970304c..00000000
Binary files a/docs/auto_examples/auto_examples_jupyter.zip and /dev/null differ
diff --git a/docs/auto_examples/auto_examples_python.zip b/docs/auto_examples/auto_examples_python.zip
deleted file mode 100644
index 4c34adc1..00000000
Binary files a/docs/auto_examples/auto_examples_python.zip and /dev/null differ
diff --git a/docs/auto_examples/browser_example.ipynb b/docs/auto_examples/browser_example.ipynb
deleted file mode 100644
index c5f158fa..00000000
--- a/docs/auto_examples/browser_example.ipynb
+++ /dev/null
@@ -1,82 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "%matplotlib inline"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n# Single-Molecule Browser Plot\n\nPlot single molecules with colored base modifications in a region of interest\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create either an interactive single-molecule HTML browser if static=False, or create a PDF if static=True.\nBase modifications are colored by type (mA vs. mCG) and probability of modification.\nExample data for producing these plots can be downloaded from SRA: https://www.ncbi.nlm.nih.gov/bioproject/752170\nThe below Python and command line options produce the same output.\n\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1. Python option\n\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "import dimelo as dm\n\nbam = \"deep_ctcf_mod_mappings_merge.sorted.bam\"\nsampleName = \"CTCF\"\noutDir = \"./out\"\ndm.plot_browser(\n bam,\n sampleName,\n \"chr11:2086423-2091187\",\n \"A+CG\",\n outDir,\n threshA=153,\n threshC=153,\n static=True,\n smooth=100,\n min_periods=10,\n)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 2. Command line option\n``dimelo-plot-browser -f deep_ctcf_mod_mappings_merge.sorted.bam -s CTCF -r chr11:2086423-2091187 -m A+CG -o ./out -A 153 -C 153 --static -t 100 -n 10``\n\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Output\n.. figure:: ../auto_examples/images/methylation_browser_chr11_2086423_2091187.png\n :align: center\n.. figure:: ../auto_examples/images/CTCF_A_sm_rolling_avg_fraction.png\n :align: center\n.. figure:: ../auto_examples/images/CTCF_A_sm_rolling_avg_total.png\n :align: center\n.. figure:: ../auto_examples/images/CTCF_CG_sm_rolling_avg_fraction.png\n :align: center\n.. figure:: ../auto_examples/images/CTCF_CG_sm_rolling_avg_total.png\n :align: center\n\n"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.11"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
\ No newline at end of file
diff --git a/docs/auto_examples/browser_example.py b/docs/auto_examples/browser_example.py
deleted file mode 100644
index dc4e0a06..00000000
--- a/docs/auto_examples/browser_example.py
+++ /dev/null
@@ -1,52 +0,0 @@
-"""
-Single-Molecule Browser Plot
-============================
-
-Plot single molecules with colored base modifications in a region of interest
-
-"""
-# %%
-# Create either an interactive single-molecule HTML browser if static=False, or create a PDF if static=True.
-# Base modifications are colored by type (mA vs. mCG) and probability of modification.
-# Example data for producing these plots can be downloaded from SRA: https://www.ncbi.nlm.nih.gov/bioproject/752170
-# The below Python and command line options produce the same output.
-
-# %%
-# 1. Python option
-# ----------------
-import dimelo as dm
-
-bam = "deep_ctcf_mod_mappings_merge.sorted.bam"
-sampleName = "CTCF"
-outDir = "./out"
-dm.plot_browser(
- bam,
- sampleName,
- "chr11:2086423-2091187",
- "A+CG",
- outDir,
- threshA=153,
- threshC=153,
- static=True,
- smooth=100,
- min_periods=10,
-)
-
-# %%
-# 2. Command line option
-# ----------------------
-# ``dimelo-plot-browser -f deep_ctcf_mod_mappings_merge.sorted.bam -s CTCF -r chr11:2086423-2091187 -m A+CG -o ./out -A 153 -C 153 --static -t 100 -n 10``
-
-# %%
-# Output
-# ----------------------
-# .. figure:: ../auto_examples/images/methylation_browser_chr11_2086423_2091187.png
-# :align: center
-# .. figure:: ../auto_examples/images/CTCF_A_sm_rolling_avg_fraction.png
-# :align: center
-# .. figure:: ../auto_examples/images/CTCF_A_sm_rolling_avg_total.png
-# :align: center
-# .. figure:: ../auto_examples/images/CTCF_CG_sm_rolling_avg_fraction.png
-# :align: center
-# .. figure:: ../auto_examples/images/CTCF_CG_sm_rolling_avg_total.png
-# :align: center
diff --git a/docs/auto_examples/browser_example.rst b/docs/auto_examples/browser_example.rst
deleted file mode 100644
index e9e84594..00000000
--- a/docs/auto_examples/browser_example.rst
+++ /dev/null
@@ -1,113 +0,0 @@
-
-.. DO NOT EDIT.
-.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
-.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
-.. "auto_examples/browser_example.py"
-.. LINE NUMBERS ARE GIVEN BELOW.
-
-.. only:: html
-
- .. note::
- :class: sphx-glr-download-link-note
-
- Click :ref:`here `
- to download the full example code
-
-.. rst-class:: sphx-glr-example-title
-
-.. _sphx_glr_auto_examples_browser_example.py:
-
-
-Single-Molecule Browser Plot
-============================
-
-Plot single molecules with colored base modifications in a region of interest
-
-.. GENERATED FROM PYTHON SOURCE LINES 9-13
-
-Create either an interactive single-molecule HTML browser if static=False, or create a PDF if static=True.
-Base modifications are colored by type (mA vs. mCG) and probability of modification.
-Example data for producing these plots can be downloaded from SRA: https://www.ncbi.nlm.nih.gov/bioproject/752170
-The below Python and command line options produce the same output.
-
-.. GENERATED FROM PYTHON SOURCE LINES 15-17
-
-1. Python option
-----------------
-
-.. GENERATED FROM PYTHON SOURCE LINES 17-35
-
-.. code-block:: default
-
- import dimelo as dm
-
- bam = "deep_ctcf_mod_mappings_merge.sorted.bam"
- sampleName = "CTCF"
- outDir = "./out"
- dm.plot_browser(
- bam,
- sampleName,
- "chr11:2086423-2091187",
- "A+CG",
- outDir,
- threshA=153,
- threshC=153,
- static=True,
- smooth=100,
- min_periods=10,
- )
-
-
-.. GENERATED FROM PYTHON SOURCE LINES 36-39
-
-2. Command line option
-----------------------
-``dimelo-plot-browser -f deep_ctcf_mod_mappings_merge.sorted.bam -s CTCF -r chr11:2086423-2091187 -m A+CG -o ./out -A 153 -C 153 --static -t 100 -n 10``
-
-.. GENERATED FROM PYTHON SOURCE LINES 41-53
-
-Output
-----------------------
-.. figure:: ../auto_examples/images/methylation_browser_chr11_2086423_2091187.png
- :align: center
-.. figure:: ../auto_examples/images/CTCF_A_sm_rolling_avg_fraction.png
- :align: center
-.. figure:: ../auto_examples/images/CTCF_A_sm_rolling_avg_total.png
- :align: center
-.. figure:: ../auto_examples/images/CTCF_CG_sm_rolling_avg_fraction.png
- :align: center
-.. figure:: ../auto_examples/images/CTCF_CG_sm_rolling_avg_total.png
- :align: center
-
-
-.. rst-class:: sphx-glr-timing
-
- **Total running time of the script:** ( 0 minutes 0.000 seconds)
-
-
-.. _sphx_glr_download_auto_examples_browser_example.py:
-
-
-.. only :: html
-
- .. container:: sphx-glr-footer
- :class: sphx-glr-footer-example
-
-
-
- .. container:: sphx-glr-download sphx-glr-download-python
-
- :download:`Download Python source code: browser_example.py `
-
-
-
- .. container:: sphx-glr-download sphx-glr-download-jupyter
-
- :download:`Download Jupyter notebook: browser_example.ipynb `
-
-
-.. only:: html
-
- .. rst-class:: sphx-glr-signature
-
- `Gallery generated by Sphinx-Gallery `_
diff --git a/docs/auto_examples/browser_example_codeobj.pickle b/docs/auto_examples/browser_example_codeobj.pickle
deleted file mode 100644
index b0ca669e..00000000
Binary files a/docs/auto_examples/browser_example_codeobj.pickle and /dev/null differ
diff --git a/docs/auto_examples/enrichment_multi_bam_example.ipynb b/docs/auto_examples/enrichment_multi_bam_example.ipynb
deleted file mode 100644
index 537b4b52..00000000
--- a/docs/auto_examples/enrichment_multi_bam_example.ipynb
+++ /dev/null
@@ -1,82 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "%matplotlib inline"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n# Enrichment Plot Comparison Across BAMs\n\nPlot overall fraction of methylated bases within regions of interest specified by bed file across multiple samples.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create barplot comparing methylation levels in bed file regions of interest across samples\n\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1. Python option\n\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "import dimelo as dm\n\nbams = [\n \"deep_ctcf_mod_mappings_merge.sorted.bam\",\n \"hia5_mod_mappings.bam\",\n \"igg_mod_mappings.bam\",\n]\nsampleNames = [\"CTCF\", \"Hia5\", \"IgG\"]\nbed = \"q10.150.slop.bed\"\noutDir = \"./out\"\ndm.plot_enrichment(bams, sampleNames, bed, \"A\", outDir, threshA=190)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 2. Command line option\n``dimelo-plot-enrichment -f deep_ctcf_mod_mappings_merge.sorted.bam hia5_mod_mappings.bam igg_mod_mappings.bam -s CTCF Hia5 IgG -b q10.150.slop.bed -m A -o ./out -A 190``\n\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Output\n.. figure:: ../auto_examples/images/region_q10.150.slop_A_enrichment_barplot.png\n\n"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.11"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
\ No newline at end of file
diff --git a/docs/auto_examples/enrichment_multi_bam_example.py b/docs/auto_examples/enrichment_multi_bam_example.py
deleted file mode 100644
index 4a65f21d..00000000
--- a/docs/auto_examples/enrichment_multi_bam_example.py
+++ /dev/null
@@ -1,34 +0,0 @@
-"""
-Enrichment Plot Comparison Across BAMs
-======================================
-
-Plot overall fraction of methylated bases within regions of interest specified by bed file across multiple samples.
-
-"""
-# %%
-# Create barplot comparing methylation levels in bed file regions of interest across samples
-
-# %%
-# 1. Python option
-# ----------------
-import dimelo as dm
-
-bams = [
- "deep_ctcf_mod_mappings_merge.sorted.bam",
- "hia5_mod_mappings.bam",
- "igg_mod_mappings.bam",
-]
-sampleNames = ["CTCF", "Hia5", "IgG"]
-bed = "q10.150.slop.bed"
-outDir = "./out"
-dm.plot_enrichment(bams, sampleNames, bed, "A", outDir, threshA=190)
-
-# %%
-# 2. Command line option
-# ----------------------
-# ``dimelo-plot-enrichment -f deep_ctcf_mod_mappings_merge.sorted.bam hia5_mod_mappings.bam igg_mod_mappings.bam -s CTCF Hia5 IgG -b q10.150.slop.bed -m A -o ./out -A 190``
-
-# %%
-# Output
-# ----------------------
-# .. figure:: ../auto_examples/images/region_q10.150.slop_A_enrichment_barplot.png
diff --git a/docs/auto_examples/enrichment_multi_bam_example.rst b/docs/auto_examples/enrichment_multi_bam_example.rst
deleted file mode 100644
index 33de12ca..00000000
--- a/docs/auto_examples/enrichment_multi_bam_example.rst
+++ /dev/null
@@ -1,95 +0,0 @@
-
-.. DO NOT EDIT.
-.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
-.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
-.. "auto_examples/enrichment_multi_bam_example.py"
-.. LINE NUMBERS ARE GIVEN BELOW.
-
-.. only:: html
-
- .. note::
- :class: sphx-glr-download-link-note
-
- Click :ref:`here `
- to download the full example code
-
-.. rst-class:: sphx-glr-example-title
-
-.. _sphx_glr_auto_examples_enrichment_multi_bam_example.py:
-
-
-Enrichment Plot Comparison Across BAMs
-======================================
-
-Plot overall fraction of methylated bases within regions of interest specified by bed file across multiple samples.
-
-.. GENERATED FROM PYTHON SOURCE LINES 9-10
-
-Create barplot comparing methylation levels in bed file regions of interest across samples
-
-.. GENERATED FROM PYTHON SOURCE LINES 12-14
-
-1. Python option
-----------------
-
-.. GENERATED FROM PYTHON SOURCE LINES 14-26
-
-.. code-block:: default
-
- import dimelo as dm
-
- bams = [
- "deep_ctcf_mod_mappings_merge.sorted.bam",
- "hia5_mod_mappings.bam",
- "igg_mod_mappings.bam",
- ]
- sampleNames = ["CTCF", "Hia5", "IgG"]
- bed = "q10.150.slop.bed"
- outDir = "./out"
- dm.plot_enrichment(bams, sampleNames, bed, "A", outDir, threshA=190)
-
-
-.. GENERATED FROM PYTHON SOURCE LINES 27-30
-
-2. Command line option
-----------------------
-``dimelo-plot-enrichment -f deep_ctcf_mod_mappings_merge.sorted.bam hia5_mod_mappings.bam igg_mod_mappings.bam -s CTCF Hia5 IgG -b q10.150.slop.bed -m A -o ./out -A 190``
-
-.. GENERATED FROM PYTHON SOURCE LINES 32-35
-
-Output
-----------------------
-.. figure:: ../auto_examples/images/region_q10.150.slop_A_enrichment_barplot.png
-
-
-.. rst-class:: sphx-glr-timing
-
- **Total running time of the script:** ( 0 minutes 0.000 seconds)
-
-
-.. _sphx_glr_download_auto_examples_enrichment_multi_bam_example.py:
-
-
-.. only :: html
-
- .. container:: sphx-glr-footer
- :class: sphx-glr-footer-example
-
-
-
- .. container:: sphx-glr-download sphx-glr-download-python
-
- :download:`Download Python source code: enrichment_multi_bam_example.py `
-
-
-
- .. container:: sphx-glr-download sphx-glr-download-jupyter
-
- :download:`Download Jupyter notebook: enrichment_multi_bam_example.ipynb `
-
-
-.. only:: html
-
- .. rst-class:: sphx-glr-signature
-
- `Gallery generated by Sphinx-Gallery `_
diff --git a/docs/auto_examples/enrichment_multi_bam_example_codeobj.pickle b/docs/auto_examples/enrichment_multi_bam_example_codeobj.pickle
deleted file mode 100644
index dc6b209b..00000000
Binary files a/docs/auto_examples/enrichment_multi_bam_example_codeobj.pickle and /dev/null differ
diff --git a/docs/auto_examples/enrichment_multi_bed_example.ipynb b/docs/auto_examples/enrichment_multi_bed_example.ipynb
deleted file mode 100644
index 6bdc04af..00000000
--- a/docs/auto_examples/enrichment_multi_bed_example.ipynb
+++ /dev/null
@@ -1,82 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "%matplotlib inline"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n# Enrichment Plot Comparison Across BEDs\n\nPlot overall fraction of methylated bases within multiple sets of regions of interest specified by bed files for a single sample.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create barplot comparing methylation levels in single sample across multiple regions of interest defined in bed files.\n\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1. Python option\n\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "import dimelo as dm\n\nbam = \"deep_ctcf_mod_mappings_merge.sorted.bam\"\nbeds = [\"q10.150.slop.bed\", \"q10nopeak.bed\"]\nsampleNames = [\"chip_peak\", \"not_chip_peak\"]\noutDir = \"./out\"\ndm.plot_enrichment(bam, sampleNames, beds, \"A\", outDir, threshA=190)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 2. Command line option\n``dimelo-plot-enrichment -f deep_ctcf_mod_mappings_merge.sorted.bam -s chip_peak not_chip_peak -b q10.150.slop.bed q10nopeak.bed -m A -o ./out -A 190``\n\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Output\n.. figure:: ../auto_examples/images/sample_deep_ctcf_mod_mappings_merge.sorted_A_enrichment_barplot.png\n\n"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.11"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
\ No newline at end of file
diff --git a/docs/auto_examples/enrichment_multi_bed_example.py b/docs/auto_examples/enrichment_multi_bed_example.py
deleted file mode 100644
index 462b3da8..00000000
--- a/docs/auto_examples/enrichment_multi_bed_example.py
+++ /dev/null
@@ -1,30 +0,0 @@
-"""
-Enrichment Plot Comparison Across BEDs
-======================================
-
-Plot overall fraction of methylated bases within multiple sets of regions of interest specified by bed files for a single sample.
-
-"""
-# %%
-# Create barplot comparing methylation levels in single sample across multiple regions of interest defined in bed files.
-
-# %%
-# 1. Python option
-# ----------------
-import dimelo as dm
-
-bam = "deep_ctcf_mod_mappings_merge.sorted.bam"
-beds = ["q10.150.slop.bed", "q10nopeak.bed"]
-sampleNames = ["chip_peak", "not_chip_peak"]
-outDir = "./out"
-dm.plot_enrichment(bam, sampleNames, beds, "A", outDir, threshA=190)
-
-# %%
-# 2. Command line option
-# ----------------------
-# ``dimelo-plot-enrichment -f deep_ctcf_mod_mappings_merge.sorted.bam -s chip_peak not_chip_peak -b q10.150.slop.bed q10nopeak.bed -m A -o ./out -A 190``
-
-# %%
-# Output
-# ----------------------
-# .. figure:: ../auto_examples/images/sample_deep_ctcf_mod_mappings_merge.sorted_A_enrichment_barplot.png
diff --git a/docs/auto_examples/enrichment_multi_bed_example.rst b/docs/auto_examples/enrichment_multi_bed_example.rst
deleted file mode 100644
index 51da39ae..00000000
--- a/docs/auto_examples/enrichment_multi_bed_example.rst
+++ /dev/null
@@ -1,91 +0,0 @@
-
-.. DO NOT EDIT.
-.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
-.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
-.. "auto_examples/enrichment_multi_bed_example.py"
-.. LINE NUMBERS ARE GIVEN BELOW.
-
-.. only:: html
-
- .. note::
- :class: sphx-glr-download-link-note
-
- Click :ref:`here `
- to download the full example code
-
-.. rst-class:: sphx-glr-example-title
-
-.. _sphx_glr_auto_examples_enrichment_multi_bed_example.py:
-
-
-Enrichment Plot Comparison Across BEDs
-======================================
-
-Plot overall fraction of methylated bases within multiple sets of regions of interest specified by bed files for a single sample.
-
-.. GENERATED FROM PYTHON SOURCE LINES 9-10
-
-Create barplot comparing methylation levels in single sample across multiple regions of interest defined in bed files.
-
-.. GENERATED FROM PYTHON SOURCE LINES 12-14
-
-1. Python option
-----------------
-
-.. GENERATED FROM PYTHON SOURCE LINES 14-22
-
-.. code-block:: default
-
- import dimelo as dm
-
- bam = "deep_ctcf_mod_mappings_merge.sorted.bam"
- beds = ["q10.150.slop.bed", "q10nopeak.bed"]
- sampleNames = ["chip_peak", "not_chip_peak"]
- outDir = "./out"
- dm.plot_enrichment(bam, sampleNames, beds, "A", outDir, threshA=190)
-
-
-.. GENERATED FROM PYTHON SOURCE LINES 23-26
-
-2. Command line option
-----------------------
-``dimelo-plot-enrichment -f deep_ctcf_mod_mappings_merge.sorted.bam -s chip_peak not_chip_peak -b q10.150.slop.bed q10nopeak.bed -m A -o ./out -A 190``
-
-.. GENERATED FROM PYTHON SOURCE LINES 28-31
-
-Output
-----------------------
-.. figure:: ../auto_examples/images/sample_deep_ctcf_mod_mappings_merge.sorted_A_enrichment_barplot.png
-
-
-.. rst-class:: sphx-glr-timing
-
- **Total running time of the script:** ( 0 minutes 0.000 seconds)
-
-
-.. _sphx_glr_download_auto_examples_enrichment_multi_bed_example.py:
-
-
-.. only :: html
-
- .. container:: sphx-glr-footer
- :class: sphx-glr-footer-example
-
-
-
- .. container:: sphx-glr-download sphx-glr-download-python
-
- :download:`Download Python source code: enrichment_multi_bed_example.py `
-
-
-
- .. container:: sphx-glr-download sphx-glr-download-jupyter
-
- :download:`Download Jupyter notebook: enrichment_multi_bed_example.ipynb `
-
-
-.. only:: html
-
- .. rst-class:: sphx-glr-signature
-
- `Gallery generated by Sphinx-Gallery `_
diff --git a/docs/auto_examples/enrichment_multi_bed_example_codeobj.pickle b/docs/auto_examples/enrichment_multi_bed_example_codeobj.pickle
deleted file mode 100644
index dc6b209b..00000000
Binary files a/docs/auto_examples/enrichment_multi_bed_example_codeobj.pickle and /dev/null differ
diff --git a/docs/auto_examples/enrichment_profile_ma_mc_example.ipynb b/docs/auto_examples/enrichment_profile_ma_mc_example.ipynb
deleted file mode 100644
index 6f858ffc..00000000
--- a/docs/auto_examples/enrichment_profile_ma_mc_example.ipynb
+++ /dev/null
@@ -1,82 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "%matplotlib inline"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n# Enrichment Profile mA & mCG\n\nAggregate and single molecule plots colored by modification and centered at regions of interest defined in bed file.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create (1) aggregate profile plots for mA/A and mCG/CG, (2) single-molecule plots for mA + mCG, and (3) base abundance plots for A and CG.\n\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1. Python option\n\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "import dimelo as dm\n\nbam = \"deep_ctcf_mod_mappings_merge.sorted.bam\"\nsampleName = \"quartile4\"\nbed = \"quart4.bed\"\noutDir = \"./out\"\ndm.plot_enrichment_profile(\n bam,\n sampleName,\n bed,\n \"A+CG\",\n outDir,\n threshA=190,\n threshC=190,\n dotsize=0.05,\n)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 2. Command line option\n``dimelo-plot-enrichment-profile -f deep_ctcf_mod_mappings_merge.sorted.bam -s quartile4 -b quart4.bed -m A+CG -o ./out -A 190 -C 190 -d 0.05``\n\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Output\n.. figure:: ../auto_examples/images/quartile4_A+CG_sm_rolling_avg.png\n :align: center\n.. figure:: ../auto_examples/images/quartile4_A+CG_sm_scatter.png\n.. figure:: ../auto_examples/images/quartile4_A_base_count.png\n.. figure:: ../auto_examples/images/quartile4_CG_base_count.png\n\n"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.11"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
\ No newline at end of file
diff --git a/docs/auto_examples/enrichment_profile_ma_mc_example.py b/docs/auto_examples/enrichment_profile_ma_mc_example.py
deleted file mode 100644
index 077d0801..00000000
--- a/docs/auto_examples/enrichment_profile_ma_mc_example.py
+++ /dev/null
@@ -1,44 +0,0 @@
-"""
-Enrichment Profile mA & mCG
-=================================
-
-Aggregate and single molecule plots colored by modification and centered at regions of interest defined in bed file.
-
-"""
-# %%
-# Create (1) aggregate profile plots for mA/A and mCG/CG, (2) single-molecule plots for mA + mCG, and (3) base abundance plots for A and CG.
-
-# %%
-# 1. Python option
-# ----------------
-import dimelo as dm
-
-bam = "deep_ctcf_mod_mappings_merge.sorted.bam"
-sampleName = "quartile4"
-bed = "quart4.bed"
-outDir = "./out"
-dm.plot_enrichment_profile(
- bam,
- sampleName,
- bed,
- "A+CG",
- outDir,
- threshA=190,
- threshC=190,
- dotsize=0.05,
-)
-
-
-# %%
-# 2. Command line option
-# ----------------------
-# ``dimelo-plot-enrichment-profile -f deep_ctcf_mod_mappings_merge.sorted.bam -s quartile4 -b quart4.bed -m A+CG -o ./out -A 190 -C 190 -d 0.05``
-
-# %%
-# Output
-# ----------------------
-# .. figure:: ../auto_examples/images/quartile4_A+CG_sm_rolling_avg.png
-# :align: center
-# .. figure:: ../auto_examples/images/quartile4_A+CG_sm_scatter.png
-# .. figure:: ../auto_examples/images/quartile4_A_base_count.png
-# .. figure:: ../auto_examples/images/quartile4_CG_base_count.png
diff --git a/docs/auto_examples/enrichment_profile_ma_mc_example.rst b/docs/auto_examples/enrichment_profile_ma_mc_example.rst
deleted file mode 100644
index 57591f14..00000000
--- a/docs/auto_examples/enrichment_profile_ma_mc_example.rst
+++ /dev/null
@@ -1,105 +0,0 @@
-
-.. DO NOT EDIT.
-.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
-.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
-.. "auto_examples/enrichment_profile_ma_mc_example.py"
-.. LINE NUMBERS ARE GIVEN BELOW.
-
-.. only:: html
-
- .. note::
- :class: sphx-glr-download-link-note
-
- Click :ref:`here `
- to download the full example code
-
-.. rst-class:: sphx-glr-example-title
-
-.. _sphx_glr_auto_examples_enrichment_profile_ma_mc_example.py:
-
-
-Enrichment Profile mA & mCG
-=================================
-
-Aggregate and single molecule plots colored by modification and centered at regions of interest defined in bed file.
-
-.. GENERATED FROM PYTHON SOURCE LINES 9-10
-
-Create (1) aggregate profile plots for mA/A and mCG/CG, (2) single-molecule plots for mA + mCG, and (3) base abundance plots for A and CG.
-
-.. GENERATED FROM PYTHON SOURCE LINES 12-14
-
-1. Python option
-----------------
-
-.. GENERATED FROM PYTHON SOURCE LINES 14-32
-
-.. code-block:: default
-
- import dimelo as dm
-
- bam = "deep_ctcf_mod_mappings_merge.sorted.bam"
- sampleName = "quartile4"
- bed = "quart4.bed"
- outDir = "./out"
- dm.plot_enrichment_profile(
- bam,
- sampleName,
- bed,
- "A+CG",
- outDir,
- threshA=190,
- threshC=190,
- dotsize=0.05,
- )
-
-
-
-.. GENERATED FROM PYTHON SOURCE LINES 33-36
-
-2. Command line option
-----------------------
-``dimelo-plot-enrichment-profile -f deep_ctcf_mod_mappings_merge.sorted.bam -s quartile4 -b quart4.bed -m A+CG -o ./out -A 190 -C 190 -d 0.05``
-
-.. GENERATED FROM PYTHON SOURCE LINES 38-45
-
-Output
-----------------------
-.. figure:: ../auto_examples/images/quartile4_A+CG_sm_rolling_avg.png
- :align: center
-.. figure:: ../auto_examples/images/quartile4_A+CG_sm_scatter.png
-.. figure:: ../auto_examples/images/quartile4_A_base_count.png
-.. figure:: ../auto_examples/images/quartile4_CG_base_count.png
-
-
-.. rst-class:: sphx-glr-timing
-
- **Total running time of the script:** ( 0 minutes 0.000 seconds)
-
-
-.. _sphx_glr_download_auto_examples_enrichment_profile_ma_mc_example.py:
-
-
-.. only :: html
-
- .. container:: sphx-glr-footer
- :class: sphx-glr-footer-example
-
-
-
- .. container:: sphx-glr-download sphx-glr-download-python
-
- :download:`Download Python source code: enrichment_profile_ma_mc_example.py `
-
-
-
- .. container:: sphx-glr-download sphx-glr-download-jupyter
-
- :download:`Download Jupyter notebook: enrichment_profile_ma_mc_example.ipynb `
-
-
-.. only:: html
-
- .. rst-class:: sphx-glr-signature
-
- `Gallery generated by Sphinx-Gallery `_
diff --git a/docs/auto_examples/enrichment_profile_ma_mc_example_codeobj.pickle b/docs/auto_examples/enrichment_profile_ma_mc_example_codeobj.pickle
deleted file mode 100644
index a6965886..00000000
Binary files a/docs/auto_examples/enrichment_profile_ma_mc_example_codeobj.pickle and /dev/null differ
diff --git a/docs/auto_examples/enrichment_profile_overlay_example.ipynb b/docs/auto_examples/enrichment_profile_overlay_example.ipynb
deleted file mode 100644
index 58ddb532..00000000
--- a/docs/auto_examples/enrichment_profile_overlay_example.ipynb
+++ /dev/null
@@ -1,82 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "%matplotlib inline"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n# Enrichment Profile Overlay\n\nAggregate fraction of methylated bases centered at regions of interest defined in bed files.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create (1) aggregate profile plots for mA/A and mCG/CG, (2) single-molecule plots for mA + mCG, and (3) base abundance plots for A and CG.\n\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1. Python option\n\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "import dimelo as dm\n\nbam = \"deep_ctcf_mod_mappings_merge.sorted.bam\"\nsampleNames = [\"q4\", \"q3\", \"q2\", \"q1\"]\nbeds = [\"quart4.bed\", \"quart3.bed\", \"quart2.bed\", \"quart1.bed\"]\noutDir = \"./out\"\ndm.plot_enrichment_profile(\n bam, sampleNames, beds, \"A\", outDir, threshA=190, dotsize=0.05\n)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 2. Command line option\n``dimelo-plot-enrichment-profile -f deep_ctcf_mod_mappings_merge.sorted.bam -s q4 q3 q2 q1 -b quart4.bed quart3.bed quart2.bed quart1.bed -m A -o ./out -A 190 -d 0.05``\n\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Output\n.. figure:: ../auto_examples/images/sample_deep_ctcf_mod_mappings_merge.sorted_A_sm_rolling_avg_overlay.png\n :align: center\n\n"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.11"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
\ No newline at end of file
diff --git a/docs/auto_examples/enrichment_profile_overlay_example.py b/docs/auto_examples/enrichment_profile_overlay_example.py
deleted file mode 100644
index 63f43666..00000000
--- a/docs/auto_examples/enrichment_profile_overlay_example.py
+++ /dev/null
@@ -1,33 +0,0 @@
-"""
-Enrichment Profile Overlay
-=================================
-
-Aggregate fraction of methylated bases centered at regions of interest defined in bed files.
-
-"""
-# %%
-# Create (1) aggregate profile plots for mA/A and mCG/CG, (2) single-molecule plots for mA + mCG, and (3) base abundance plots for A and CG.
-
-# %%
-# 1. Python option
-# ----------------
-import dimelo as dm
-
-bam = "deep_ctcf_mod_mappings_merge.sorted.bam"
-sampleNames = ["q4", "q3", "q2", "q1"]
-beds = ["quart4.bed", "quart3.bed", "quart2.bed", "quart1.bed"]
-outDir = "./out"
-dm.plot_enrichment_profile(
- bam, sampleNames, beds, "A", outDir, threshA=190, dotsize=0.05
-)
-
-# %%
-# 2. Command line option
-# ----------------------
-# ``dimelo-plot-enrichment-profile -f deep_ctcf_mod_mappings_merge.sorted.bam -s q4 q3 q2 q1 -b quart4.bed quart3.bed quart2.bed quart1.bed -m A -o ./out -A 190 -d 0.05``
-
-# %%
-# Output
-# ----------------------
-# .. figure:: ../auto_examples/images/sample_deep_ctcf_mod_mappings_merge.sorted_A_sm_rolling_avg_overlay.png
-# :align: center
diff --git a/docs/auto_examples/enrichment_profile_overlay_example.rst b/docs/auto_examples/enrichment_profile_overlay_example.rst
deleted file mode 100644
index 454b1745..00000000
--- a/docs/auto_examples/enrichment_profile_overlay_example.rst
+++ /dev/null
@@ -1,94 +0,0 @@
-
-.. DO NOT EDIT.
-.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
-.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
-.. "auto_examples/enrichment_profile_overlay_example.py"
-.. LINE NUMBERS ARE GIVEN BELOW.
-
-.. only:: html
-
- .. note::
- :class: sphx-glr-download-link-note
-
- Click :ref:`here `
- to download the full example code
-
-.. rst-class:: sphx-glr-example-title
-
-.. _sphx_glr_auto_examples_enrichment_profile_overlay_example.py:
-
-
-Enrichment Profile Overlay
-=================================
-
-Aggregate fraction of methylated bases centered at regions of interest defined in bed files.
-
-.. GENERATED FROM PYTHON SOURCE LINES 9-10
-
-Create (1) aggregate profile plots for mA/A and mCG/CG, (2) single-molecule plots for mA + mCG, and (3) base abundance plots for A and CG.
-
-.. GENERATED FROM PYTHON SOURCE LINES 12-14
-
-1. Python option
-----------------
-
-.. GENERATED FROM PYTHON SOURCE LINES 14-24
-
-.. code-block:: default
-
- import dimelo as dm
-
- bam = "deep_ctcf_mod_mappings_merge.sorted.bam"
- sampleNames = ["q4", "q3", "q2", "q1"]
- beds = ["quart4.bed", "quart3.bed", "quart2.bed", "quart1.bed"]
- outDir = "./out"
- dm.plot_enrichment_profile(
- bam, sampleNames, beds, "A", outDir, threshA=190, dotsize=0.05
- )
-
-
-.. GENERATED FROM PYTHON SOURCE LINES 25-28
-
-2. Command line option
-----------------------
-``dimelo-plot-enrichment-profile -f deep_ctcf_mod_mappings_merge.sorted.bam -s q4 q3 q2 q1 -b quart4.bed quart3.bed quart2.bed quart1.bed -m A -o ./out -A 190 -d 0.05``
-
-.. GENERATED FROM PYTHON SOURCE LINES 30-34
-
-Output
-----------------------
-.. figure:: ../auto_examples/images/sample_deep_ctcf_mod_mappings_merge.sorted_A_sm_rolling_avg_overlay.png
- :align: center
-
-
-.. rst-class:: sphx-glr-timing
-
- **Total running time of the script:** ( 0 minutes 0.000 seconds)
-
-
-.. _sphx_glr_download_auto_examples_enrichment_profile_overlay_example.py:
-
-
-.. only :: html
-
- .. container:: sphx-glr-footer
- :class: sphx-glr-footer-example
-
-
-
- .. container:: sphx-glr-download sphx-glr-download-python
-
- :download:`Download Python source code: enrichment_profile_overlay_example.py `
-
-
-
- .. container:: sphx-glr-download sphx-glr-download-jupyter
-
- :download:`Download Jupyter notebook: enrichment_profile_overlay_example.ipynb `
-
-
-.. only:: html
-
- .. rst-class:: sphx-glr-signature
-
- `Gallery generated by Sphinx-Gallery `_
diff --git a/docs/auto_examples/enrichment_profile_overlay_example_codeobj.pickle b/docs/auto_examples/enrichment_profile_overlay_example_codeobj.pickle
deleted file mode 100644
index a6965886..00000000
Binary files a/docs/auto_examples/enrichment_profile_overlay_example_codeobj.pickle and /dev/null differ
diff --git a/docs/auto_examples/enrichment_profile_single_example.ipynb b/docs/auto_examples/enrichment_profile_single_example.ipynb
deleted file mode 100644
index 4e932055..00000000
--- a/docs/auto_examples/enrichment_profile_single_example.ipynb
+++ /dev/null
@@ -1,82 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "%matplotlib inline"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n# Enrichment Profile mA only\n\nPlot single molecules centered at regions of interest defined in bed file and produce aggregate profile\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create (1) aggregate profile plots for mA/A, (2) single-molecule plots for mA, and (3) base abundance plots for A.\n\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 1. Python option\n\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "import dimelo as dm\n\nbam = \"deep_ctcf_mod_mappings_merge.sorted.bam\"\nsampleName = \"quartile4\"\nbed = \"quart4.bed\"\noutDir = \"./out\"\ndm.plot_enrichment_profile(\n bam, sampleName, bed, \"A\", outDir, threshA=190, dotsize=0.05\n)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 2. Command line option\n``dimelo-plot-enrichment-profile -f deep_ctcf_mod_mappings_merge.sorted.bam -s quartile4 -b quart4.bed -m A -o ./out -A 190 -d 0.05``\n\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Output\n.. figure:: ../auto_examples/images/quartile4_A_sm_rolling_avg.png\n :align: center\n.. figure:: ../auto_examples/images/quartile4_A_sm_scatter.png\n.. figure:: ../auto_examples/images/quartile4_A_base_count.png\n\n"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.11"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
\ No newline at end of file
diff --git a/docs/auto_examples/enrichment_profile_single_example.py b/docs/auto_examples/enrichment_profile_single_example.py
deleted file mode 100644
index 7a932845..00000000
--- a/docs/auto_examples/enrichment_profile_single_example.py
+++ /dev/null
@@ -1,35 +0,0 @@
-"""
-Enrichment Profile mA only
-=================================
-
-Plot single molecules centered at regions of interest defined in bed file and produce aggregate profile
-
-"""
-# %%
-# Create (1) aggregate profile plots for mA/A, (2) single-molecule plots for mA, and (3) base abundance plots for A.
-
-# %%
-# 1. Python option
-# ----------------
-import dimelo as dm
-
-bam = "deep_ctcf_mod_mappings_merge.sorted.bam"
-sampleName = "quartile4"
-bed = "quart4.bed"
-outDir = "./out"
-dm.plot_enrichment_profile(
- bam, sampleName, bed, "A", outDir, threshA=190, dotsize=0.05
-)
-
-# %%
-# 2. Command line option
-# ----------------------
-# ``dimelo-plot-enrichment-profile -f deep_ctcf_mod_mappings_merge.sorted.bam -s quartile4 -b quart4.bed -m A -o ./out -A 190 -d 0.05``
-
-# %%
-# Output
-# ----------------------
-# .. figure:: ../auto_examples/images/quartile4_A_sm_rolling_avg.png
-# :align: center
-# .. figure:: ../auto_examples/images/quartile4_A_sm_scatter.png
-# .. figure:: ../auto_examples/images/quartile4_A_base_count.png
diff --git a/docs/auto_examples/enrichment_profile_single_example.rst b/docs/auto_examples/enrichment_profile_single_example.rst
deleted file mode 100644
index 71ccb7df..00000000
--- a/docs/auto_examples/enrichment_profile_single_example.rst
+++ /dev/null
@@ -1,96 +0,0 @@
-
-.. DO NOT EDIT.
-.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
-.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
-.. "auto_examples/enrichment_profile_single_example.py"
-.. LINE NUMBERS ARE GIVEN BELOW.
-
-.. only:: html
-
- .. note::
- :class: sphx-glr-download-link-note
-
- Click :ref:`here `
- to download the full example code
-
-.. rst-class:: sphx-glr-example-title
-
-.. _sphx_glr_auto_examples_enrichment_profile_single_example.py:
-
-
-Enrichment Profile mA only
-=================================
-
-Plot single molecules centered at regions of interest defined in bed file and produce aggregate profile
-
-.. GENERATED FROM PYTHON SOURCE LINES 9-10
-
-Create (1) aggregate profile plots for mA/A, (2) single-molecule plots for mA, and (3) base abundance plots for A.
-
-.. GENERATED FROM PYTHON SOURCE LINES 12-14
-
-1. Python option
-----------------
-
-.. GENERATED FROM PYTHON SOURCE LINES 14-24
-
-.. code-block:: default
-
- import dimelo as dm
-
- bam = "deep_ctcf_mod_mappings_merge.sorted.bam"
- sampleName = "quartile4"
- bed = "quart4.bed"
- outDir = "./out"
- dm.plot_enrichment_profile(
- bam, sampleName, bed, "A", outDir, threshA=190, dotsize=0.05
- )
-
-
-.. GENERATED FROM PYTHON SOURCE LINES 25-28
-
-2. Command line option
-----------------------
-``dimelo-plot-enrichment-profile -f deep_ctcf_mod_mappings_merge.sorted.bam -s quartile4 -b quart4.bed -m A -o ./out -A 190 -d 0.05``
-
-.. GENERATED FROM PYTHON SOURCE LINES 30-36
-
-Output
-----------------------
-.. figure:: ../auto_examples/images/quartile4_A_sm_rolling_avg.png
- :align: center
-.. figure:: ../auto_examples/images/quartile4_A_sm_scatter.png
-.. figure:: ../auto_examples/images/quartile4_A_base_count.png
-
-
-.. rst-class:: sphx-glr-timing
-
- **Total running time of the script:** ( 0 minutes 0.000 seconds)
-
-
-.. _sphx_glr_download_auto_examples_enrichment_profile_single_example.py:
-
-
-.. only :: html
-
- .. container:: sphx-glr-footer
- :class: sphx-glr-footer-example
-
-
-
- .. container:: sphx-glr-download sphx-glr-download-python
-
- :download:`Download Python source code: enrichment_profile_single_example.py `
-
-
-
- .. container:: sphx-glr-download sphx-glr-download-jupyter
-
- :download:`Download Jupyter notebook: enrichment_profile_single_example.ipynb `
-
-
-.. only:: html
-
- .. rst-class:: sphx-glr-signature
-
- `Gallery generated by Sphinx-Gallery `_
diff --git a/docs/auto_examples/enrichment_profile_single_example_codeobj.pickle b/docs/auto_examples/enrichment_profile_single_example_codeobj.pickle
deleted file mode 100644
index a6965886..00000000
Binary files a/docs/auto_examples/enrichment_profile_single_example_codeobj.pickle and /dev/null differ
diff --git a/docs/auto_examples/images/CTCF_A_sm_rolling_avg_fraction.png b/docs/auto_examples/images/CTCF_A_sm_rolling_avg_fraction.png
deleted file mode 100644
index 3c3a6800..00000000
Binary files a/docs/auto_examples/images/CTCF_A_sm_rolling_avg_fraction.png and /dev/null differ
diff --git a/docs/auto_examples/images/CTCF_A_sm_rolling_avg_total.png b/docs/auto_examples/images/CTCF_A_sm_rolling_avg_total.png
deleted file mode 100644
index 0014a0ff..00000000
Binary files a/docs/auto_examples/images/CTCF_A_sm_rolling_avg_total.png and /dev/null differ
diff --git a/docs/auto_examples/images/CTCF_CG_sm_rolling_avg_fraction.png b/docs/auto_examples/images/CTCF_CG_sm_rolling_avg_fraction.png
deleted file mode 100644
index c90edc69..00000000
Binary files a/docs/auto_examples/images/CTCF_CG_sm_rolling_avg_fraction.png and /dev/null differ
diff --git a/docs/auto_examples/images/CTCF_CG_sm_rolling_avg_total.png b/docs/auto_examples/images/CTCF_CG_sm_rolling_avg_total.png
deleted file mode 100644
index 618d3fe6..00000000
Binary files a/docs/auto_examples/images/CTCF_CG_sm_rolling_avg_total.png and /dev/null differ
diff --git a/docs/auto_examples/images/CTCF_qc_report.png b/docs/auto_examples/images/CTCF_qc_report.png
deleted file mode 100644
index 5c266a07..00000000
Binary files a/docs/auto_examples/images/CTCF_qc_report.png and /dev/null differ
diff --git a/docs/auto_examples/images/QC_Terminal_Output.png b/docs/auto_examples/images/QC_Terminal_Output.png
deleted file mode 100644
index e240da00..00000000
Binary files a/docs/auto_examples/images/QC_Terminal_Output.png and /dev/null differ
diff --git a/docs/auto_examples/images/methylation_browser_chr11_2086423_2091187.png b/docs/auto_examples/images/methylation_browser_chr11_2086423_2091187.png
deleted file mode 100644
index c25328e9..00000000
Binary files a/docs/auto_examples/images/methylation_browser_chr11_2086423_2091187.png and /dev/null differ
diff --git a/docs/auto_examples/images/quartile4_A+CG_sm_rolling_avg.png b/docs/auto_examples/images/quartile4_A+CG_sm_rolling_avg.png
deleted file mode 100644
index ae0846c9..00000000
Binary files a/docs/auto_examples/images/quartile4_A+CG_sm_rolling_avg.png and /dev/null differ
diff --git a/docs/auto_examples/images/quartile4_A+CG_sm_scatter.png b/docs/auto_examples/images/quartile4_A+CG_sm_scatter.png
deleted file mode 100644
index d4dbf67a..00000000
Binary files a/docs/auto_examples/images/quartile4_A+CG_sm_scatter.png and /dev/null differ
diff --git a/docs/auto_examples/images/quartile4_A_base_count.png b/docs/auto_examples/images/quartile4_A_base_count.png
deleted file mode 100644
index 5cfa7526..00000000
Binary files a/docs/auto_examples/images/quartile4_A_base_count.png and /dev/null differ
diff --git a/docs/auto_examples/images/quartile4_A_sm_rolling_avg.png b/docs/auto_examples/images/quartile4_A_sm_rolling_avg.png
deleted file mode 100644
index d9c156e9..00000000
Binary files a/docs/auto_examples/images/quartile4_A_sm_rolling_avg.png and /dev/null differ
diff --git a/docs/auto_examples/images/quartile4_A_sm_scatter.png b/docs/auto_examples/images/quartile4_A_sm_scatter.png
deleted file mode 100644
index e109399e..00000000
Binary files a/docs/auto_examples/images/quartile4_A_sm_scatter.png and /dev/null differ
diff --git a/docs/auto_examples/images/quartile4_CG_base_count.png b/docs/auto_examples/images/quartile4_CG_base_count.png
deleted file mode 100644
index 106ce2d0..00000000
Binary files a/docs/auto_examples/images/quartile4_CG_base_count.png and /dev/null differ
diff --git a/docs/auto_examples/images/region_q10.150.slop_A_enrichment_barplot.png b/docs/auto_examples/images/region_q10.150.slop_A_enrichment_barplot.png
deleted file mode 100644
index f0e184de..00000000
Binary files a/docs/auto_examples/images/region_q10.150.slop_A_enrichment_barplot.png and /dev/null differ
diff --git a/docs/auto_examples/images/sample_deep_ctcf_mod_mappings_merge.sorted_A_enrichment_barplot.png b/docs/auto_examples/images/sample_deep_ctcf_mod_mappings_merge.sorted_A_enrichment_barplot.png
deleted file mode 100644
index 8b21aeaf..00000000
Binary files a/docs/auto_examples/images/sample_deep_ctcf_mod_mappings_merge.sorted_A_enrichment_barplot.png and /dev/null differ
diff --git a/docs/auto_examples/images/sample_deep_ctcf_mod_mappings_merge.sorted_A_sm_rolling_avg_overlay.png b/docs/auto_examples/images/sample_deep_ctcf_mod_mappings_merge.sorted_A_sm_rolling_avg_overlay.png
deleted file mode 100644
index 00bf6efa..00000000
Binary files a/docs/auto_examples/images/sample_deep_ctcf_mod_mappings_merge.sorted_A_sm_rolling_avg_overlay.png and /dev/null differ
diff --git a/docs/auto_examples/images/sphx_glr_plot_qc_001.png b/docs/auto_examples/images/sphx_glr_plot_qc_001.png
deleted file mode 100644
index e5cae1af..00000000
Binary files a/docs/auto_examples/images/sphx_glr_plot_qc_001.png and /dev/null differ
diff --git a/docs/auto_examples/images/sphx_glr_plot_qc_example_001.png b/docs/auto_examples/images/sphx_glr_plot_qc_example_001.png
deleted file mode 100644
index e8c0e2c0..00000000
Binary files a/docs/auto_examples/images/sphx_glr_plot_qc_example_001.png and /dev/null differ
diff --git a/docs/auto_examples/images/thumb/sphx_glr_browser_example_thumb.png b/docs/auto_examples/images/thumb/sphx_glr_browser_example_thumb.png
deleted file mode 100644
index c25328e9..00000000
Binary files a/docs/auto_examples/images/thumb/sphx_glr_browser_example_thumb.png and /dev/null differ
diff --git a/docs/auto_examples/images/thumb/sphx_glr_enrichment_multi_bam_example_thumb.png b/docs/auto_examples/images/thumb/sphx_glr_enrichment_multi_bam_example_thumb.png
deleted file mode 100644
index f0e184de..00000000
Binary files a/docs/auto_examples/images/thumb/sphx_glr_enrichment_multi_bam_example_thumb.png and /dev/null differ
diff --git a/docs/auto_examples/images/thumb/sphx_glr_enrichment_multi_bed_example_thumb.png b/docs/auto_examples/images/thumb/sphx_glr_enrichment_multi_bed_example_thumb.png
deleted file mode 100644
index 8b21aeaf..00000000
Binary files a/docs/auto_examples/images/thumb/sphx_glr_enrichment_multi_bed_example_thumb.png and /dev/null differ
diff --git a/docs/auto_examples/images/thumb/sphx_glr_enrichment_profile_ma_mc_example_thumb.png b/docs/auto_examples/images/thumb/sphx_glr_enrichment_profile_ma_mc_example_thumb.png
deleted file mode 100644
index ae0846c9..00000000
Binary files a/docs/auto_examples/images/thumb/sphx_glr_enrichment_profile_ma_mc_example_thumb.png and /dev/null differ
diff --git a/docs/auto_examples/images/thumb/sphx_glr_enrichment_profile_overlay_example_thumb.png b/docs/auto_examples/images/thumb/sphx_glr_enrichment_profile_overlay_example_thumb.png
deleted file mode 100644
index 00bf6efa..00000000
Binary files a/docs/auto_examples/images/thumb/sphx_glr_enrichment_profile_overlay_example_thumb.png and /dev/null differ
diff --git a/docs/auto_examples/images/thumb/sphx_glr_enrichment_profile_single_example_thumb.png b/docs/auto_examples/images/thumb/sphx_glr_enrichment_profile_single_example_thumb.png
deleted file mode 100644
index d9c156e9..00000000
Binary files a/docs/auto_examples/images/thumb/sphx_glr_enrichment_profile_single_example_thumb.png and /dev/null differ
diff --git a/docs/auto_examples/images/thumb/sphx_glr_plot_qc_example_thumb.png b/docs/auto_examples/images/thumb/sphx_glr_plot_qc_example_thumb.png
deleted file mode 100644
index edd36fcc..00000000
Binary files a/docs/auto_examples/images/thumb/sphx_glr_plot_qc_example_thumb.png and /dev/null differ
diff --git a/docs/auto_examples/images/thumb/sphx_glr_qc_report_example_thumb.png b/docs/auto_examples/images/thumb/sphx_glr_qc_report_example_thumb.png
deleted file mode 100644
index 537eb042..00000000
Binary files a/docs/auto_examples/images/thumb/sphx_glr_qc_report_example_thumb.png and /dev/null differ
diff --git a/docs/auto_examples/index.rst b/docs/auto_examples/index.rst
deleted file mode 100644
index 77c9162a..00000000
--- a/docs/auto_examples/index.rst
+++ /dev/null
@@ -1,184 +0,0 @@
-:orphan:
-
-
-
-.. _sphx_glr_auto_examples:
-
-Example Gallery
-=======================
-
-
-.. raw:: html
-
-
-
-.. only:: html
-
- .. figure:: /auto_examples/images/thumb/sphx_glr_enrichment_profile_single_example_thumb.png
- :alt: Enrichment Profile mA only
-
- :ref:`sphx_glr_auto_examples_enrichment_profile_single_example.py`
-
-.. raw:: html
-
-