diff --git a/.flake8 b/.flake8 index 1f2de64..72dab3c 100644 --- a/.flake8 +++ b/.flake8 @@ -2,48 +2,34 @@ max-line-length = 79 # extend-ignore (not ignore) so pycodestyle's default-ignored, # Black-compatible codes (E121, E123, E126, E226, E704, W503, W504, E24) -# remain ignored — they flag layout Black itself produces. -extend-ignore = D200, D100, E203, W503 +# remain ignored. E501 is delegated to Black's 79-character formatter. +extend-ignore = D200, D100, E203, E501, W503 max-complexity = 10 show-source = True -# TODO (legacy lint debt): the files below predate the Black/flake8 -# standards in CLAUDE.md and were never reformatted. Each is excluded so -# that edits to it don't fail CI on pre-existing violations unrelated to -# the change. Reformat each file (Black + manual fixes for the residual -# E501/F401/C901) and DELETE its entry, retroactively, until this list is -# empty. pylint is already clean across the codebase (no pylint debt). exclude = - docs/*, - proteopy/__init__.py, - proteopy/ann/__init__.py, - proteopy/ann/base_anndata.py, - proteopy/datasets/__init__.py, - proteopy/download/__init__.py, - proteopy/get/__init__.py, - proteopy/get/stat_tests.py, - proteopy/pl/__init__.py, - proteopy/pl/clustering.py, - proteopy/pl/copf.py, - proteopy/pl/intensities.py, - proteopy/pl/stats.py, - proteopy/pp/__init__.py, - proteopy/pp/filtering.py, - proteopy/pp/normalization.py, - proteopy/pp/stats.py, - proteopy/read/__init__.py, - proteopy/read/diann.py, - proteopy/tl/__init__.py, - proteopy/tl/clustering.py, - proteopy/tl/copf.py, - proteopy/tl/stat_tests.py, - proteopy/utils/__init__.py, - proteopy/utils/copf.py, - proteopy/utils/data_structures.py, - proteopy/utils/functools.py, - proteopy/utils/matplotlib.py, - proteopy/utils/pandas.py, - proteopy/utils/parsers.py, - tests/pp/test_filtering.py, - tests/tl/test_copro.py, - tests/utils/helpers.py, - tests/utils/test_data_structures.py + .git, + .venv, + __pycache__, + build, + dist +per-file-ignores = + proteopy/__init__.py:F401 + proteopy/**/__init__.py:F401 + docs/sphinx/source/conf.py:E402 + proteopy/ann/base_anndata.py:C901 + proteopy/get/stat_tests.py:C901 + proteopy/pl/clustering.py:C901 + proteopy/pl/copf.py:C901 + proteopy/pl/intensities.py:C901 + proteopy/pl/stats.py:C901 + proteopy/pp/filtering.py:C901 + proteopy/pp/normalization.py:C901 + proteopy/pp/stats.py:C901 + proteopy/read/diann.py:C901 + proteopy/tl/clustering.py:C901 + proteopy/tl/copf.py:C901 + proteopy/tl/stat_tests.py:C901 + proteopy/utils/copf.py:C901 + proteopy/utils/functools.py:C901 + proteopy/utils/matplotlib.py:C901 + proteopy/utils/parsers.py:C901 diff --git a/.github/workflows/format-code_perform-tests_on_push-pr.yaml b/.github/workflows/format-code_perform-tests_on_push-pr.yaml index a27c9e2..f94c51c 100644 --- a/.github/workflows/format-code_perform-tests_on_push-pr.yaml +++ b/.github/workflows/format-code_perform-tests_on_push-pr.yaml @@ -1,25 +1,16 @@ -# python linting - pylint and flake8 -# pytest - -name: Python application using pip +name: Python quality on: - push: - branches: [ "main", "dev*", "dev/**" ] - pull_request: - branches: [ "main", "dev*", "dev/**" ] - + branches: ["main", "dev*", "dev/**"] workflow_dispatch: permissions: contents: read jobs: - - build: - + quality: runs-on: ${{ matrix.os }} strategy: @@ -28,7 +19,6 @@ jobs: python-version: ["3.10", "3.11"] steps: - - uses: actions/checkout@v4 with: fetch-depth: 0 @@ -38,72 +28,30 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Install dependencies + - name: Install package and quality tools shell: bash run: | python -m pip install --upgrade pip - pip install pre-commit pytest - if [ -f "requirements/requirements_ci-cd.txt" ]; then - pip install -r requirements/requirements_ci-cd.txt - fi + python -m pip install -r requirements/requirements_ci-cd.txt + python -m pip install -e . - - name: Install package + - name: Check formatting with Black shell: bash - run: | - pip install -e . + run: black --check --diff . - - name: Compute diff range - id: range + - name: Lint with flake8 shell: bash - run: | - # Pick initial from/to refs per event type. - if [ -n "${{ github.event.pull_request.base.sha }}" ]; then - FROM="${{ github.event.pull_request.base.sha }}" - TO="${{ github.event.pull_request.head.sha }}" - elif [ -n "${{ github.event.before }}" ] \ - && [ "${{ github.event.before }}" != "0000000000000000000000000000000000000000" ]; then - FROM="${{ github.event.before }}" - TO="${{ github.sha }}" - else - FROM="HEAD~1" - TO="HEAD" - fi - - # Guard: after a force-push, `event.before` may point at an - # orphaned commit no longer reachable in CI's fetched history. - # Fall back to the merge-base with origin/main so lint covers - # the branch's divergence (mirrors what a PR would lint). - # For pushes to main itself, no divergence exists; use HEAD~1. - if ! git cat-file -e "${FROM}^{commit}" 2>/dev/null; then - if [ "${{ github.ref_name }}" = "main" ]; then - echo "::warning::from-ref ${FROM} unreachable on main; falling back to HEAD~1..HEAD" - FROM="HEAD~1" - elif MB=$(git merge-base origin/main HEAD 2>/dev/null); then - echo "::warning::from-ref ${FROM} unreachable (likely force-push); falling back to merge-base with origin/main ($MB)" - FROM="$MB" - else - echo "::warning::from-ref ${FROM} unreachable and no merge-base with origin/main; falling back to HEAD~1..HEAD" - FROM="HEAD~1" - fi - TO="HEAD" - fi - - echo "from=$FROM" >> "$GITHUB_OUTPUT" - echo "to=$TO" >> "$GITHUB_OUTPUT" - - - name: Cache pre-commit envs - uses: actions/cache@v4 - with: - path: ~/.cache/pre-commit - key: pre-commit-${{ runner.os }}-py${{ matrix.python-version }}-${{ hashFiles('.pre-commit-config.yaml') }} + run: flake8 . - - name: Lint via pre-commit (changed files only) + - name: Lint with pylint shell: bash run: | - pre-commit run \ - --from-ref "${{ steps.range.outputs.from }}" \ - --to-ref "${{ steps.range.outputs.to }}" \ - --show-diff-on-failure + pylint \ + $(git ls-files \ + "proteopy/*.py" "proteopy/**/*.py" \ + "tests/*.py" "tests/**/*.py") \ + --disable=all \ + --enable=E,F - name: Get month id: date @@ -114,7 +62,10 @@ jobs: id: cache-dir shell: python run: | - import pooch, os + import os + + import pooch + cache = str(pooch.os_cache("proteopy")) with open(os.environ["GITHUB_OUTPUT"], "a") as f: f.write(f"path={cache}\n") @@ -126,5 +77,4 @@ jobs: key: proteopy-datasets-${{ runner.os }}-${{ steps.date.outputs.month }} - name: Test with pytest - run: | - pytest -v -s tests/ + run: pytest -v -s tests/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 06a4cc7..b774c9b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,8 @@ # - pre-commit (fast): formatting + cheap linters, run on every commit # - pre-push (slow): full linters + tests, run before pushing # -# Install both hook types once per clone: +# Install pinned tools, then install both hook types once per clone: +# python -m pip install -r requirements/requirements_ci-cd.txt # pre-commit install # pre-commit install --hook-type pre-push # @@ -12,7 +13,7 @@ # pre-commit run --all-files # everything (commit stage) # pre-commit run --all-files --hook-stage pre-push # everything (push stage) -minimum_pre_commit_version: "3.5.0" +minimum_pre_commit_version: "4.5.1" # Hook env Python version is whatever runs pre-commit itself. # Tool versions are pinned via each hook's `rev:`, which is what @@ -87,7 +88,7 @@ repos: rev: 24.10.0 hooks: - id: black - args: ["--line-length=79"] # matches CLAUDE.md code style + args: ["--line-length=79"] # matches AGENTS.md code style exclude: ^(docs/|tests/data/) stages: [pre-commit, pre-push] - id: black-jupyter @@ -110,14 +111,9 @@ repos: hooks: - id: pylint name: pylint (errors only) - entry: pylint --disable=all --enable=E,F --disable=E0401 - language: python - additional_dependencies: ["pylint==3.3.4"] + entry: pylint --disable=all --enable=E,F + language: system types: [python] - # TODO: re-include proteopy/pl/intensities.py once its - # pre-existing E1133 false positives (hue_order inference in the - # isolated hook env) are resolved. Excluded to keep the - # proteoform-color fix commit scoped to the feature. - exclude: ^(docs/|proteopy/pl/intensities\.py) + exclude: ^docs/ require_serial: true stages: [pre-commit, pre-push] diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 4932496..5f911c6 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -14,4 +14,3 @@ python: - method: pip path: . - requirements: docs/sphinx/requirements.txt - diff --git a/AGENTS.md b/AGENTS.md index 027089b..ce5fd4d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -78,7 +78,7 @@ AnnData.X matrix is also sparse. General argument guidelines: - - essential arguments which should be found in all functions unless it does not make sense: + - essential arguments which should be found in all functions unless it does not make sense: adata : AnnData Input AnnData with .X (obs x vars) and .var annotations. Default=None (do not include this line in docstrings) @@ -355,7 +355,7 @@ To ensure consistent plotting behavior across `pl.*` modules, adhere to the foll Call plt.show() at the end of the function (default=True). - `save: str | Path | None` - Save the figure: str/Path for a specific path, None to skip saving (default=None). + Save the figure: str/Path for a specific path, None to skip saving (default=None). - `ax: matplotlib.axes.Axes | None` Matplotlib Axes object to plot onto. If `None`, a new figure and axes are created. The function always returns the Axes object used for plotting (default=None). @@ -712,7 +712,7 @@ To ensure consistent plotting behavior across `pl.*` modules, adhere to the foll Call plt.show() at the end of the function (default=True). - `save: str | Path | None` - Save the figure: str/Path for a specific path, None to skip saving (default=None). + Save the figure: str/Path for a specific path, None to skip saving (default=None). - `ax: matplotlib.axes.Axes | None` Matplotlib Axes object to plot onto. If `None`, a new figure and axes are created. The function always returns the Axes object used for plotting (default=None). diff --git a/docs/sphinx/source/_static/custom.js b/docs/sphinx/source/_static/custom.js index c03b183..6b188b6 100644 --- a/docs/sphinx/source/_static/custom.js +++ b/docs/sphinx/source/_static/custom.js @@ -2,31 +2,31 @@ // Makes expand/collapse only trigger on arrow click, not link click document.addEventListener("DOMContentLoaded", function() { - + // Process all toctree links that have expandable children document.querySelectorAll('.wy-menu-vertical .toctree-l1, .wy-menu-vertical .toctree-l2, .wy-menu-vertical .toctree-l3').forEach(function(li) { const link = li.querySelector(':scope > a'); const ul = li.querySelector(':scope > ul'); - + // Only process items that have children (expandable) if (!link || !ul) return; - + // Create a toggle button for expand/collapse const toggle = document.createElement('span'); toggle.className = 'nav-toggle'; toggle.setAttribute('role', 'button'); toggle.setAttribute('aria-label', 'Toggle submenu'); - + // Insert toggle after the link text link.appendChild(toggle); - + // Toggle expand/collapse on arrow click toggle.addEventListener('click', function(e) { e.preventDefault(); e.stopPropagation(); li.classList.toggle('current'); }); - + // Link click should only navigate, not toggle link.addEventListener('click', function(e) { // If clicking on the toggle, don't navigate diff --git a/docs/sphinx/source/api/tl.rst b/docs/sphinx/source/api/tl.rst index fe8d43b..5105300 100644 --- a/docs/sphinx/source/api/tl.rst +++ b/docs/sphinx/source/api/tl.rst @@ -39,4 +39,3 @@ detection of functional proteoform groups from peptide-level quantitative data proteopy.tl.peptide_dendograms_by_correlation proteopy.tl.peptide_clusters_from_dendograms proteopy.tl.proteoform_scores - diff --git a/docs/sphinx/source/conf.py b/docs/sphinx/source/conf.py index 0126bac..bdc4c13 100644 --- a/docs/sphinx/source/conf.py +++ b/docs/sphinx/source/conf.py @@ -1,19 +1,24 @@ +import sys +from pathlib import Path + +import pybtex.plugin +from pybtex.richtext import Text +from pybtex.style.formatting.alpha import Style as _AlphaStyle +from pybtex.style.names import BaseNameStyle + project = "ProteoPy" copyright = ( "2025, BludauLab Neuropathology Heidelberg, " "Ian Dirk Fichtner, Isabell Bludau" - ) +) author = ( - "Ian Dirk Fichtner, Isabell Bludau, " - "BludauLab Neuropathology Heidelberg" - ) + "Ian Dirk Fichtner, Isabell Bludau, BludauLab Neuropathology Heidelberg" +) version = "0.1.1" release = "0.1.1" # -- Path setup -------------------------------------------------------------- -import sys -from pathlib import Path # Add project root directory for autodoc to find proteopy package project_root = Path(__file__).resolve().parents[3] @@ -39,20 +44,20 @@ master_doc = "index" source_suffix = { - '.rst': 'restructuredtext', - '.md': 'markdown', + ".rst": "restructuredtext", + ".md": "markdown", } templates_path = ["_templates"] exclude_patterns = [ - 'build', - 'Thumbs.db', - '.DS_Store', - '**.ipynb_checkpoints', + "build", + "Thumbs.db", + ".DS_Store", + "**.ipynb_checkpoints", ] -language = 'en' +language = "en" # Bug fix: Sphinx 9.x introduced autosummary.import_cycle detection bug -suppress_warnings = ['autosummary.import_cycle'] +suppress_warnings = ["autosummary.import_cycle"] # -- Custom roles ------------------------------------------------------------ rst_prolog = """ @@ -98,7 +103,7 @@ autosummary_imported_members = True # -- nbsphinx configuration (Jupyter notebooks) ------------------------------ -nbsphinx_execute = 'never' # Don't execute notebooks during build +nbsphinx_execute = "never" # Don't execute notebooks during build nbsphinx_allow_errors = True nbsphinx_timeout = 300 @@ -115,10 +120,6 @@ } # -- Bibliography configuration (sphinxcontrib-bibtex) ----------------------- -import pybtex.plugin -from pybtex.richtext import Text -from pybtex.style.formatting.alpha import Style as _AlphaStyle -from pybtex.style.names import BaseNameStyle class _LastInitialNameStyle(BaseNameStyle): @@ -127,37 +128,37 @@ class _LastInitialNameStyle(BaseNameStyle): def format(self, person, abbr=False): parts = [] for name in person.rich_prelast_names: - parts.extend([name, ' ']) + parts.extend([name, " "]) for i, name in enumerate(person.rich_last_names): if i > 0: - parts.append(' ') + parts.append(" ") parts.append(name) - initials = ''.join( - n[0] for n in - person.first_names + person.middle_names - if n - ) + initials = "".join( + n[0] for n in person.first_names + person.middle_names if n + ) if initials: - parts.extend([' ', initials]) + parts.extend([" ", initials]) if person.rich_lineage_names: - parts.append(', ') + parts.append(", ") for name in person.rich_lineage_names: parts.append(name) return Text(*parts) class _ProteopyStyle(_AlphaStyle): - default_name_style = 'last_initial' + default_name_style = "last_initial" pybtex.plugin.register_plugin( - 'pybtex.style.names', 'last_initial', + "pybtex.style.names", + "last_initial", _LastInitialNameStyle, - ) +) pybtex.plugin.register_plugin( - 'pybtex.style.formatting', 'proteopy', + "pybtex.style.formatting", + "proteopy", _ProteopyStyle, - ) +) bibtex_bibfiles = ["references.bib"] bibtex_default_style = "proteopy" diff --git a/docs/tutorials/bludau-2021_tissue-specific-proteoform-inference-across-five-mouse-organs.ipynb b/docs/tutorials/bludau-2021_tissue-specific-proteoform-inference-across-five-mouse-organs.ipynb index a85fa13..230b33f 100644 --- a/docs/tutorials/bludau-2021_tissue-specific-proteoform-inference-across-five-mouse-organs.ipynb +++ b/docs/tutorials/bludau-2021_tissue-specific-proteoform-inference-across-five-mouse-organs.ipynb @@ -45,10 +45,10 @@ "# Set random seed for reproducibility\n", "random.seed(42)\n", "\n", - "# Create a data directory in your current working directory \n", + "# Create a data directory in your current working directory\n", "# to store files downloaded in this notebook.\n", "cwd = Path(\".\").resolve()\n", - "(cwd / 'data').mkdir(parents=True, exist_ok=True)" + "(cwd / \"data\").mkdir(parents=True, exist_ok=True)" ], "outputs": [] }, @@ -77,8 +77,12 @@ "source": [ "# Define paths to the data files. These are the same paths that will be used in the download function below.\n", "intensities_path = \"data/williams-2018_mouse-tissue_intensities.tsv\"\n", - "sample_annotation_path = \"data/williams-2018_mouse-tissue_sample_annotation.tsv\"\n", - "peptide_annotation_path = \"data/williams-2018_mouse-tissue_peptide_annotation.tsv\"\n", + "sample_annotation_path = (\n", + " \"data/williams-2018_mouse-tissue_sample_annotation.tsv\"\n", + ")\n", + "peptide_annotation_path = (\n", + " \"data/williams-2018_mouse-tissue_peptide_annotation.tsv\"\n", + ")\n", "\n", "pr.download.williams_2018(\n", " intensities_path=intensities_path,\n", @@ -193,7 +197,7 @@ } ], "source": [ - "# Create a peptide level AnnData object from the \n", + "# Create a peptide level AnnData object from the\n", "# downloaded data files and fill missing values with 0.\n", "adata = pr.read.long(\n", " intensities=intensities_path,\n", @@ -822,7 +826,7 @@ "metadata": {}, "outputs": [], "source": [ - "irt_mask = (adata.var[\"protein_id\"] == \"iRT_protein\")\n", + "irt_mask = adata.var[\"protein_id\"] == \"iRT_protein\"\n", "adata = adata[:, ~irt_mask]" ] }, @@ -841,7 +845,7 @@ } ], "source": [ - "A2ASS6_mask = (adata.var[\"protein_id\"] == \"A2ASS6\")\n", + "A2ASS6_mask = adata.var[\"protein_id\"] == \"A2ASS6\"\n", "print(f\"N peptides for protein A2ASS6: {A2ASS6_mask.sum()}\")" ] }, @@ -880,7 +884,7 @@ ], "source": [ "pr.pp.summarize_modifications(adata, method=\"sum\", verbose=True)\n", - "pr.pp.summarize_overlapping_peptides(adata)\n" + "pr.pp.summarize_overlapping_peptides(adata)" ] }, { @@ -1149,7 +1153,7 @@ "sc.tl.pca(adata)\n", "\n", "with rc_context({\"figure.figsize\": (5, 3)}):\n", - " sc.pl.pca_variance_ratio(adata, n_pcs=50, log=True)\n" + " sc.pl.pca_variance_ratio(adata, n_pcs=50, log=True)" ] }, { @@ -1300,8 +1304,12 @@ "source": [ "# COPF pipeline: correlate, cluster, score\n", "pr.tl.pairwise_peptide_correlations(adata)\n", - "pr.tl.peptide_dendograms_by_correlation(adata, method=\"agglomerative-hierarchical-clustering\")\n", - "pr.tl.peptide_clusters_from_dendograms(adata, n_clusters=2, min_peptides_per_cluster=2)\n", + "pr.tl.peptide_dendograms_by_correlation(\n", + " adata, method=\"agglomerative-hierarchical-clustering\"\n", + ")\n", + "pr.tl.peptide_clusters_from_dendograms(\n", + " adata, n_clusters=2, min_peptides_per_cluster=2\n", + ")\n", "pr.tl.proteoform_scores(adata, min_score=0.1, min_pval_adj=0.1)" ] }, @@ -1313,7 +1321,7 @@ "outputs": [], "source": [ "# Remove COPF outlier peptides (cluster_id == 1000000)\n", - "copf_outliers = (adata.var[\"cluster_id\"] == 1000000)\n", + "copf_outliers = adata.var[\"cluster_id\"] == 1000000\n", "adata = adata[:, ~copf_outliers]" ] }, @@ -2509,9 +2517,8 @@ "outputs": [], "source": [ "# Retain only significant proteoforms (score >= 0.1, adj. p-value <= 0.1)\n", - "pf_mask = (\n", - " (adata_pfs.var[\"proteoform_score\"].astype(float) >= 0.1)\n", - " & (adata_pfs.var[\"proteoform_score_pval_adj\"].astype(float) <= 0.1)\n", + "pf_mask = (adata_pfs.var[\"proteoform_score\"].astype(float) >= 0.1) & (\n", + " adata_pfs.var[\"proteoform_score_pval_adj\"].astype(float) <= 0.1\n", ")\n", "adata_pfs = adata_pfs[:, pf_mask].copy()" ] @@ -3710,7 +3717,9 @@ } ], "source": [ - "anova_results = pr.get.differential_abundance_df(adata_pfs, keys=\"anova_oneway;tissue;all\")\n", + "anova_results = pr.get.differential_abundance_df(\n", + " adata_pfs, keys=\"anova_oneway;tissue;all\"\n", + ")\n", "anova_results.rename(columns={\"var_id\": \"proteoform_id\"}, inplace=True)\n", "anova_results" ] @@ -3731,11 +3740,19 @@ ], "source": [ "# Count proteins with all proteoforms being significantly tissue-specific\n", - "protein_id_map = adata_pfs.var[[\"protein_id\", \"protein_id_old\"]].set_index(\"protein_id\")[\"protein_id_old\"]\n", - "anova_results[\"protein_id\"] = anova_results[\"proteoform_id\"].map(protein_id_map)\n", - "n_tissue_specific_pfs = anova_results.groupby(\"protein_id\")[\"is_diff_abundant\"].all().sum()\n", + "protein_id_map = adata_pfs.var[[\"protein_id\", \"protein_id_old\"]].set_index(\n", + " \"protein_id\"\n", + ")[\"protein_id_old\"]\n", + "anova_results[\"protein_id\"] = anova_results[\"proteoform_id\"].map(\n", + " protein_id_map\n", + ")\n", + "n_tissue_specific_pfs = (\n", + " anova_results.groupby(\"protein_id\")[\"is_diff_abundant\"].all().sum()\n", + ")\n", "\n", - "print(f\"{n_tissue_specific_pfs} tissue-specific proteoform groups found via ANOVA.\")" + "print(\n", + " f\"{n_tissue_specific_pfs} tissue-specific proteoform groups found via ANOVA.\"\n", + ")" ] }, { diff --git a/docs/tutorials/karayel-2020_proteome-remodeling-during-human-erythropoiesis.ipynb b/docs/tutorials/karayel-2020_proteome-remodeling-during-human-erythropoiesis.ipynb index 16a82f0..705f90a 100644 --- a/docs/tutorials/karayel-2020_proteome-remodeling-during-human-erythropoiesis.ipynb +++ b/docs/tutorials/karayel-2020_proteome-remodeling-during-human-erythropoiesis.ipynb @@ -40,7 +40,7 @@ "\n", "import proteopy as pr\n", "\n", - "# Create a data directory in your current working directory \n", + "# Create a data directory in your current working directory\n", "# to store files downloaded in this notebook.\n", "cwd = Path(\".\").resolve()\n", "(cwd / \"data\").mkdir(parents=True, exist_ok=True)" @@ -70,8 +70,7 @@ "source": [ "# Define paths to the data files. These are the same paths that will be used in the download function below.\n", "intensities_path = (\n", - " \"data/karayel-2020_ms-proteomics\"\n", - " \"_human-erythropoiesis_intensities.tsv\"\n", + " \"data/karayel-2020_ms-proteomics\" \"_human-erythropoiesis_intensities.tsv\"\n", ")\n", "sample_annotation_path = (\n", " \"data/karayel-2020_ms-proteomics\"\n", @@ -195,7 +194,7 @@ } ], "source": [ - "# Create a protein level AnnData object from the \n", + "# Create a protein level AnnData object from the\n", "# downloaded data files and fill missing values with 0.\n", "adata = pr.read.long(\n", " intensities=intensities_path,\n", @@ -252,7 +251,13 @@ " \"Progenitor\": \"#D6DE3B\",\n", "}\n", "\n", - "adata.uns[\"order_cell_type\"] = [\"Progenitor\", \"ProE&EBaso\", \"LBaso\", \"Poly\", \"Ortho\"]\n", + "adata.uns[\"order_cell_type\"] = [\n", + " \"Progenitor\",\n", + " \"ProE&EBaso\",\n", + " \"LBaso\",\n", + " \"Poly\",\n", + " \"Ortho\",\n", + "]\n", "\n", "# Replicates\n", "n_reps = adata.obs[\"replicate\"].nunique()\n", @@ -1230,17 +1235,14 @@ "# Volcano plots for progenitor vs subsequent differentiation stages\n", "for ct in rest_ct:\n", " test_slot = (\n", - " f\"ttest_two_sample;cell_type;\"\n", - " f\"{ct.replace('&', '_')}_vs_Progenitor\"\n", + " f\"ttest_two_sample;cell_type;\" f\"{ct.replace('&', '_')}_vs_Progenitor\"\n", " )\n", "\n", " sig_df = pr.get.differential_abundance_df(\n", - " adata, keys=test_slot,\n", - " )\n", - " sig_series = (\n", - " (sig_df[\"pval_adj\"] <= 0.01)\n", - " & (sig_df[\"logfc\"].abs() >= 0.5)\n", + " adata,\n", + " keys=test_slot,\n", " )\n", + " sig_series = (sig_df[\"pval_adj\"] <= 0.01) & (sig_df[\"logfc\"].abs() >= 0.5)\n", " print(\n", " f\"{ct} vs Progenitor: \\n\"\n", " f\"{sum(sig_series)} \"\n", @@ -1443,7 +1445,7 @@ } ], "source": [ - "# Documentation of the used version of proteopy and \n", + "# Documentation of the used version of proteopy and\n", "# its dependencies for reproducibility\n", "!pip freeze" ] diff --git a/docs/tutorials/manuscript_figure-1_panel-B.ipynb b/docs/tutorials/manuscript_figure-1_panel-B.ipynb index ad12b34..8f475d7 100644 --- a/docs/tutorials/manuscript_figure-1_panel-B.ipynb +++ b/docs/tutorials/manuscript_figure-1_panel-B.ipynb @@ -156,7 +156,7 @@ " adj=True,\n", " highlight_prots=[\"Q9JKS4\", \"Q3UTJ2\"],\n", " show=False,\n", - " save=\"figure-1-B.png\"\n", + " save=\"figure-1-B.png\",\n", ")" ] } diff --git a/docs/tutorials/proteodata_basics.ipynb b/docs/tutorials/proteodata_basics.ipynb index b39fbc9..5ff2c03 100644 --- a/docs/tutorials/proteodata_basics.ipynb +++ b/docs/tutorials/proteodata_basics.ipynb @@ -88,11 +88,13 @@ "var = pd.DataFrame({\"protein_id\": protein_ids}, index=protein_ids)\n", "\n", "# -- Intensity matrix (3 samples x 4 proteins) --\n", - "X = np.array([\n", - " [100.0, 200.0, 50.0, 300.0],\n", - " [110.0, np.nan, 55.0, 280.0],\n", - " [ 95.0, 210.0, 48.0, 310.0],\n", - "])\n", + "X = np.array(\n", + " [\n", + " [100.0, 200.0, 50.0, 300.0],\n", + " [110.0, np.nan, 55.0, 280.0],\n", + " [95.0, 210.0, 48.0, 310.0],\n", + " ]\n", + ")\n", "\n", "adata_protein = AnnData(X=X, obs=obs, var=var)\n", "adata_protein" @@ -174,10 +176,12 @@ ")\n", "\n", "# -- Intensity matrix (2 samples x 3 peptides) --\n", - "X = np.array([\n", - " [500.0, 300.0, 800.0],\n", - " [520.0, 310.0, 790.0],\n", - "])\n", + "X = np.array(\n", + " [\n", + " [500.0, 300.0, 800.0],\n", + " [520.0, 310.0, 790.0],\n", + " ]\n", + ")\n", "\n", "adata_peptide = AnnData(X=X, obs=obs, var=var)\n", "is_proteodata(adata_peptide)" @@ -303,11 +307,13 @@ " {\"protein_id\": proteins},\n", " index=proteins,\n", ")\n", - "X = np.array([\n", - " [100.0, 0.0, 50.0, 200.0],\n", - " [200.0, 50.0, 50.0, 300.0],\n", - " [150.0, 80.0, 50.0, 250.0],\n", - "])\n", + "X = np.array(\n", + " [\n", + " [100.0, 0.0, 50.0, 200.0],\n", + " [200.0, 50.0, 50.0, 300.0],\n", + " [150.0, 80.0, 50.0, 250.0],\n", + " ]\n", + ")\n", "\n", "adata = AnnData(X=X, obs=obs, var=var)\n", "print(\"Starting point:\", is_proteodata(adata))\n", diff --git a/proteopy/__init__.py b/proteopy/__init__.py index 8090ddf..60dde2d 100644 --- a/proteopy/__init__.py +++ b/proteopy/__init__.py @@ -3,5 +3,6 @@ ========== Analysis of bottom-up mass-spectrometry proteomics data. """ + __version__ = "0.1.1" from . import ann, datasets, download, get, pl, pp, read, tl, utils diff --git a/proteopy/ann/base_anndata.py b/proteopy/ann/base_anndata.py index 7dca6d8..8f0bbc9 100644 --- a/proteopy/ann/base_anndata.py +++ b/proteopy/ann/base_anndata.py @@ -45,14 +45,15 @@ def obs( check_proteodata(adata) if df_on not in df.columns: - raise ValueError(f"Column '{df_on}' not found in annotation dataframe.") + raise ValueError( + f"Column '{df_on}' not found in annotation dataframe." + ) adata_target = adata if inplace else adata.copy() obs = adata_target.obs.copy() obs_reset = obs.reset_index() index_col = obs_reset.columns[0] - index_name = obs.index.name if obs_on == "index": merge_col = index_col @@ -80,7 +81,9 @@ def obs( RuntimeWarning, stacklevel=2, ) - df_local = df_local.drop_duplicates(subset="_obs_merge_key", keep="first") + df_local = df_local.drop_duplicates( + subset="_obs_merge_key", keep="first" + ) obs_keys = set(obs_reset["_obs_merge_key"].tolist()) df_keys = set(df_local["_obs_merge_key"].tolist()) @@ -194,7 +197,9 @@ def var( check_proteodata(adata) if df_on not in df.columns: - raise ValueError(f"Column '{df_on}' not found in annotation dataframe.") + raise ValueError( + f"Column '{df_on}' not found in annotation dataframe." + ) adata_target = adata if inplace else adata.copy() var_df = adata_target.var.copy() @@ -228,7 +233,9 @@ def var( RuntimeWarning, stacklevel=2, ) - df_local = df_local.drop_duplicates(subset="_var_merge_key", keep="first") + df_local = df_local.drop_duplicates( + subset="_var_merge_key", keep="first" + ) var_keys = set(var_reset["_var_merge_key"].tolist()) df_keys = set(df_local["_var_merge_key"].tolist()) @@ -313,8 +320,8 @@ def samples( sort_obs_by_ann: bool = False, inplace: bool = True, ) -> AnnData | None: - """Annotate ``adata.obs`` with rows from ``df`` matched on a key (alias for - :func:`proteopy.ann.obs`). + """Annotate ``adata.obs`` with rows from ``df`` matched on a key + (alias for :func:`proteopy.ann.obs`). In proteomics, observations (rows in ``adata.obs``) often represent samples, so this alias provides a more intuitive name for the diff --git a/proteopy/datasets/karayel_2020.py b/proteopy/datasets/karayel_2020.py index 364685f..3e54323 100644 --- a/proteopy/datasets/karayel_2020.py +++ b/proteopy/datasets/karayel_2020.py @@ -125,8 +125,7 @@ def karayel_2020( 2020. :doi:`10.15252/msb.20209813`. """ if fill_na is not None and ( - isinstance(fill_na, bool) - or not isinstance(fill_na, (int, float)) + isinstance(fill_na, bool) or not isinstance(fill_na, (int, float)) ): raise TypeError( f"fill_na must be float, int, or None, " @@ -154,34 +153,31 @@ def karayel_2020( df[quant_cols] = df[quant_cols].replace("Filtered", np.nan).astype(float) # Wide to long format - long = ( - df[["PG.ProteinGroups"] + quant_cols] - .melt( - id_vars="PG.ProteinGroups", - var_name="raw_col", - value_name="intensity", - ) + long = df[["PG.ProteinGroups"] + quant_cols].melt( + id_vars="PG.ProteinGroups", + var_name="raw_col", + value_name="intensity", ) # Clean sample IDs and map to cell type names long["sample_id"] = long["raw_col"].map(_parse_sample_id) long = long.drop(columns=["raw_col"]) long = long.rename(columns={"PG.ProteinGroups": "protein_id"}) - long['sample_id'] = ( - long['sample_id'] - .str.replace('Negativefrac', 'Progenitor', regex=False) - .str.replace('P1andP2', 'ProE&EBaso', regex=False) - .str.replace('P3', 'LBaso', regex=False) - .str.replace('P4', 'Poly', regex=False) - .str.replace('P5', 'Ortho', regex=False) + long["sample_id"] = ( + long["sample_id"] + .str.replace("Negativefrac", "Progenitor", regex=False) + .str.replace("P1andP2", "ProE&EBaso", regex=False) + .str.replace("P3", "LBaso", regex=False) + .str.replace("P4", "Poly", regex=False) + .str.replace("P5", "Ortho", regex=False) ) # Exclude day 7 samples - karayel_2020_quant = long[~long["sample_id"].str.contains('_D7')] + karayel_2020_quant = long[~long["sample_id"].str.contains("_D7")] # Build sample annotation karayel_2020_meta_obs = ( - karayel_2020_quant[['sample_id']] + karayel_2020_quant[["sample_id"]] .drop_duplicates() .reset_index(drop=True) ) @@ -194,19 +190,18 @@ def karayel_2020( # Build protein annotation karayel_2020_meta_var = ( - df[['PG.ProteinGroups', 'PG.Genes']] + df[["PG.ProteinGroups", "PG.Genes"]] .drop_duplicates() .reset_index(drop=True) ) - karayel_2020_meta_var = karayel_2020_meta_var.rename(columns={ - 'PG.ProteinGroups': 'protein_id', - 'PG.Genes': 'gene_id' - }) + karayel_2020_meta_var = karayel_2020_meta_var.rename( + columns={"PG.ProteinGroups": "protein_id", "PG.Genes": "gene_id"} + ) # Assemble AnnData adata = pp.read.long( intensities=karayel_2020_quant, - level='protein', + level="protein", sample_annotation=karayel_2020_meta_obs, var_annotation=karayel_2020_meta_var, ) diff --git a/proteopy/datasets/williams_2018.py b/proteopy/datasets/williams_2018.py index a7ac606..c26992f 100644 --- a/proteopy/datasets/williams_2018.py +++ b/proteopy/datasets/williams_2018.py @@ -79,7 +79,8 @@ def williams_2018( DOI: 10.1074/mcp.RA118.000554. """ if fill_na is not None and not isinstance( - fill_na, (int, float), + fill_na, + (int, float), ): raise TypeError( f"fill_na must be float, int, or None, " @@ -124,7 +125,8 @@ def williams_2018( # Select intensity columns: named cols where row 0 == "Intensity", # excluding _mito fractions intensity_cols = [ - c for c in df.columns + c + for c in df.columns if "Unnamed" not in str(c) and df[c].iloc[0] == "Intensity" and "_mito" not in str(c) @@ -133,29 +135,22 @@ def williams_2018( df = df[list(meta_cols.keys()) + intensity_cols] # Remove _WholeCell suffix from sample column names - df = df.rename(columns={ - c: c.replace("_WholeCell", "") - for c in intensity_cols - }) + df = df.rename( + columns={c: c.replace("_WholeCell", "") for c in intensity_cols} + ) df = df.rename(columns=meta_cols) # Drop the first row (secondary header) df = df.iloc[1:].reset_index(drop=True) # Extract peptide sequence (remove prefixes and suffixes) - df["peptide_id"] = ( - df["peptide_id"].str.split("_").str[1] - ) + df["peptide_id"] = df["peptide_id"].str.split("_").str[1] # Verify protein_id and gene_id are consistent # across charge states of the same peptide - meta_check = ( - df.groupby("peptide_id")[["protein_id", "gene_id"]] - .nunique() - ) + meta_check = df.groupby("peptide_id")[["protein_id", "gene_id"]].nunique() inconsistent = meta_check[ - (meta_check["protein_id"] > 1) - | (meta_check["gene_id"] > 1) + (meta_check["protein_id"] > 1) | (meta_check["gene_id"] > 1) ] if not inconsistent.empty: raise ValueError( @@ -166,35 +161,33 @@ def williams_2018( # Sum intensities across charge states of the same peptide sample_cols = [ - c for c in df.columns + c + for c in df.columns if c not in ("peptide_id", "protein_id", "gene_id") ] df[sample_cols] = df[sample_cols].astype(float) - var = ( - df.groupby("peptide_id")[["protein_id", "gene_id"]] - .first() - ) + var = df.groupby("peptide_id")[["protein_id", "gene_id"]].first() var["peptide_id"] = var.index - X = ( - df.groupby("peptide_id")[sample_cols] - .sum() - .values.T - ) + X = df.groupby("peptide_id")[sample_cols].sum().values.T # Build obs annotation with tissue and mouse_id obs = pd.DataFrame({"sample_id": sample_cols}) parts = obs["sample_id"].str.split( - "_", n=1, expand=True, + "_", + n=1, + expand=True, ) parts.columns = ["p1", "p2"] - tissue_first = parts["p1"].str.fullmatch( - r"Brain|BAT|Heart|Liver|Quad" - ) + tissue_first = parts["p1"].str.fullmatch(r"Brain|BAT|Heart|Liver|Quad") obs["tissue"] = np.where( - tissue_first, parts["p1"], parts["p2"], + tissue_first, + parts["p1"], + parts["p2"], ) obs["mouse_id"] = np.where( - tissue_first, parts["p2"], parts["p1"], + tissue_first, + parts["p2"], + parts["p1"], ) obs = obs.set_index("sample_id") obs.index.name = None diff --git a/proteopy/download/contaminants.py b/proteopy/download/contaminants.py index e024262..2288294 100644 --- a/proteopy/download/contaminants.py +++ b/proteopy/download/contaminants.py @@ -1,6 +1,4 @@ -""" -Utilities for downloading contaminant FASTA files. -""" +"""Utilities for downloading contaminant FASTA files.""" from pathlib import Path from urllib.request import urlopen @@ -35,10 +33,8 @@ def check_uniprot_accession_nr(accession: str) -> None: def _format_frankenfield_header(header: str) -> str: - """ - Validate Frankenfield2022 headers; enforce three pipe-separated - fields and UniProt-style accession. - """ + """Validate Frankenfield2022 headers; enforce three pipe-separated + fields and UniProt-style accession.""" parts = header.split(maxsplit=1) id_part = parts[0] desc = parts[1] if len(parts) > 1 else "" @@ -65,9 +61,7 @@ def _format_fasta( destination_path: Path, formatter: Callable[[str], str], ) -> None: - """ - Rewrite FASTA headers using a formatter callable. - """ + """Rewrite FASTA headers using a formatter callable.""" with ( open(source_path, encoding="utf-8") as src, open( @@ -86,9 +80,7 @@ def _format_fasta( def _download(url: str, destination: Path) -> None: - """ - Stream ``url`` to ``destination`` with a bounded timeout. - """ + """Stream ``url`` to ``destination`` with a bounded timeout.""" with ( urlopen(url, timeout=_DOWNLOAD_TIMEOUT_SECONDS) as response, open( @@ -100,10 +92,8 @@ def _download(url: str, destination: Path) -> None: def _validate_fasta(path: Path) -> None: - """ - Verify ``path`` is non-empty and starts (after blank lines) with a - FASTA header line beginning with ``>``. - """ + """Verify ``path`` is non-empty and starts (after blank lines) with + a FASTA header line beginning with ``>``.""" with open(path, "rb") as src: for raw in src: line = raw.strip() @@ -123,9 +113,7 @@ def _resolve_destination( candidate_path: Path, use_digest: bool, ) -> Path: - """ - Resolve final destination, optionally appending an MD5 digest. - """ + """Resolve final destination, optionally appending an MD5 digest.""" if not use_digest: return base_destination digest = _md5_id(candidate_path) @@ -135,9 +123,8 @@ def _resolve_destination( def _atomic_move(candidate_path: Path, destination: Path) -> None: - """ - Move ``candidate_path`` to ``destination`` via same-fs staging. - """ + """Move ``candidate_path`` to ``destination`` via same-fs + staging.""" staging = destination.parent / f".{destination.name}.tmp" try: shutil.copy2(candidate_path, staging) @@ -147,9 +134,8 @@ def _atomic_move(candidate_path: Path, destination: Path) -> None: def _check_no_existing(path: Path, force: bool) -> None: - """ - Raise ``FileExistsError`` if ``path`` exists and ``force`` is False. - """ + """Raise ``FileExistsError`` if ``path`` exists and ``force`` is + False.""" if path.exists() and not force: raise FileExistsError( f"File already exists at {path}. Use force=True to overwrite.", @@ -162,9 +148,8 @@ def _fetch_candidate( formatter: Callable[[str], str] | None, verbose: bool, ) -> Path: - """ - Download FASTA into ``tmp_dir`` and apply ``formatter`` if given. - """ + """Download FASTA into ``tmp_dir`` and apply ``formatter`` if + given.""" raw_path = Path(tmp_dir) / "raw" _download(url, raw_path) _validate_fasta(raw_path) diff --git a/proteopy/download/karayel_2020.py b/proteopy/download/karayel_2020.py index 5a8ce14..dd159a6 100644 --- a/proteopy/download/karayel_2020.py +++ b/proteopy/download/karayel_2020.py @@ -7,16 +7,13 @@ _DEFAULT_INTENSITIES = ( - "karayel-2020_ms-proteomics" - "_human-erythropoiesis_intensities.tsv" + "karayel-2020_ms-proteomics" "_human-erythropoiesis_intensities.tsv" ) _DEFAULT_VAR = ( - "karayel-2020_ms-proteomics" - "_human-erythropoiesis_protein-annotation.tsv" + "karayel-2020_ms-proteomics" "_human-erythropoiesis_protein-annotation.tsv" ) _DEFAULT_SAMPLE = ( - "karayel-2020_ms-proteomics" - "_human-erythropoiesis_sample-annotation.tsv" + "karayel-2020_ms-proteomics" "_human-erythropoiesis_sample-annotation.tsv" ) @@ -36,27 +33,21 @@ def _check_karayel_2020_types( ): if not isinstance(value, (str, Path)): raise TypeError( - f"{name} must be str or Path, " - f"got {type(value).__name__}" + f"{name} must be str or Path, " f"got {type(value).__name__}" ) if sep is not None and not isinstance(sep, str): raise TypeError( - f"sep must be str or None, " - f"got {type(sep).__name__}" + f"sep must be str or None, " f"got {type(sep).__name__}" ) if fill_na is not None and ( - isinstance(fill_na, bool) - or not isinstance(fill_na, (int, float)) + isinstance(fill_na, bool) or not isinstance(fill_na, (int, float)) ): raise TypeError( f"fill_na must be float, int, or None, " f"got {type(fill_na).__name__}" ) if not isinstance(force, bool): - raise TypeError( - f"force must be bool, " - f"got {type(force).__name__}" - ) + raise TypeError(f"force must be bool, " f"got {type(force).__name__}") def _check_karayel_2020_paths( @@ -241,33 +232,31 @@ def karayel_2020( intensities_path, sep=sep_intensities, index=False, - lineterminator='\n', + lineterminator="\n", ) # Save .var annotation - df_var = adata.var[ - ["protein_id", "gene_id"] - ].copy() + df_var = adata.var[["protein_id", "gene_id"]].copy() var_annotation_path.parent.mkdir( - parents=True, exist_ok=True, + parents=True, + exist_ok=True, ) df_var.to_csv( var_annotation_path, sep=sep_var, index=False, - lineterminator='\n', + lineterminator="\n", ) # Save .obs annotation - df_obs = adata.obs[ - ["sample_id", "cell_type", "replicate"] - ].copy() + df_obs = adata.obs[["sample_id", "cell_type", "replicate"]].copy() sample_annotation_path.parent.mkdir( - parents=True, exist_ok=True, + parents=True, + exist_ok=True, ) df_obs.to_csv( sample_annotation_path, sep=sep_sample, index=False, - lineterminator='\n', + lineterminator="\n", ) diff --git a/proteopy/download/williams_2018.py b/proteopy/download/williams_2018.py index d42afa5..8ae26b7 100644 --- a/proteopy/download/williams_2018.py +++ b/proteopy/download/williams_2018.py @@ -7,16 +7,13 @@ _DEFAULT_INTENSITIES = ( - "williams-2018_ms-proteomics" - "_mouse-tissue_intensities.tsv" + "williams-2018_ms-proteomics" "_mouse-tissue_intensities.tsv" ) _DEFAULT_VAR = ( - "williams-2018_ms-proteomics" - "_mouse-tissue_peptide-annotation.tsv" + "williams-2018_ms-proteomics" "_mouse-tissue_peptide-annotation.tsv" ) _DEFAULT_SAMPLE = ( - "williams-2018_ms-proteomics" - "_mouse-tissue_sample-annotation.tsv" + "williams-2018_ms-proteomics" "_mouse-tissue_sample-annotation.tsv" ) @@ -36,27 +33,21 @@ def _check_williams_2018_types( ): if not isinstance(value, (str, Path)): raise TypeError( - f"{name} must be str or Path, " - f"got {type(value).__name__}" + f"{name} must be str or Path, " f"got {type(value).__name__}" ) if sep is not None and not isinstance(sep, str): raise TypeError( - f"sep must be str or None, " - f"got {type(sep).__name__}" + f"sep must be str or None, " f"got {type(sep).__name__}" ) if fill_na is not None and ( - isinstance(fill_na, bool) - or not isinstance(fill_na, (int, float)) + isinstance(fill_na, bool) or not isinstance(fill_na, (int, float)) ): raise TypeError( f"fill_na must be float, int, or None, " f"got {type(fill_na).__name__}" ) if not isinstance(force, bool): - raise TypeError( - f"force must be bool, " - f"got {type(force).__name__}" - ) + raise TypeError(f"force must be bool, " f"got {type(force).__name__}") def _check_williams_2018_paths( @@ -222,33 +213,31 @@ def williams_2018( intensities_path, sep=sep_intensities, index=False, - lineterminator='\n', + lineterminator="\n", ) # Save .var annotation - df_var = adata.var[ - ["peptide_id", "protein_id", "gene_id"] - ].copy() + df_var = adata.var[["peptide_id", "protein_id", "gene_id"]].copy() var_annotation_path.parent.mkdir( - parents=True, exist_ok=True, + parents=True, + exist_ok=True, ) df_var.to_csv( var_annotation_path, sep=sep_var, index=False, - lineterminator='\n', + lineterminator="\n", ) # Save .obs annotation - df_obs = adata.obs[ - ["sample_id", "tissue", "mouse_id"] - ].copy() + df_obs = adata.obs[["sample_id", "tissue", "mouse_id"]].copy() sample_annotation_path.parent.mkdir( - parents=True, exist_ok=True, + parents=True, + exist_ok=True, ) df_obs.to_csv( sample_annotation_path, sep=sep_sample, index=False, - lineterminator='\n', + lineterminator="\n", ) diff --git a/proteopy/get/__init__.py b/proteopy/get/__init__.py index a24fd61..b9a60c0 100644 --- a/proteopy/get/__init__.py +++ b/proteopy/get/__init__.py @@ -2,4 +2,4 @@ from .stat_tests import ( differential_abundance_df, tests, - ) +) diff --git a/proteopy/get/proteoforms.py b/proteopy/get/proteoforms.py index 41dbb46..9b8c4f1 100644 --- a/proteopy/get/proteoforms.py +++ b/proteopy/get/proteoforms.py @@ -17,8 +17,7 @@ def proteoforms_df( pval_threshold: float | None = None, pval_adj_threshold: float | None = None, ) -> pd.DataFrame: - """ - Return proteoform peptide assignment results as a tidy dataframe. + """Return proteoform peptide assignment results as a tidy dataframe. Parameters ---------- @@ -61,7 +60,8 @@ def proteoforms_df( ] missing_columns = [ - column for column in proteoform_columns + column + for column in proteoform_columns if column not in adata.var.columns ] @@ -114,10 +114,9 @@ def proteoforms_df( ] if only_proteins: - proteoforms = ( - proteoforms.drop(columns=["peptide_id", "cluster_id"]) - .drop_duplicates(ignore_index=True) - ) + proteoforms = proteoforms.drop( + columns=["peptide_id", "cluster_id"] + ).drop_duplicates(ignore_index=True) return proteoforms return proteoforms.reset_index(drop=True) diff --git a/proteopy/get/stat_tests.py b/proteopy/get/stat_tests.py index 0cfd7da..7f898ff 100644 --- a/proteopy/get/stat_tests.py +++ b/proteopy/get/stat_tests.py @@ -18,8 +18,8 @@ def differential_abundance_df( max_pval: float | None = None, sort_by: str | None = None, ) -> pd.DataFrame: - """ - Retrieve differential abundance results from ``.varm`` as a long-format DataFrame. + """Retrieve differential abundance results from ``.varm`` as a long- + format DataFrame. Merges one or more test result DataFrames stored in ``adata.varm`` into a single tidy DataFrame with an added column identifying the source test. @@ -102,9 +102,7 @@ def differential_abundance_df( "Please provide only one." ) if keys is None and key_group is None: - raise ValueError( - "Must specify either `keys` or `key_group`." - ) + raise ValueError("Must specify either `keys` or `key_group`.") # Resolve keys from key_group if provided if key_group is not None: @@ -122,8 +120,7 @@ def differential_abundance_df( elif isinstance(keys, Sequence): if not all(isinstance(k, str) for k in keys): raise TypeError( - "`keys` must contain only strings; received " - f"{keys!r}." + "`keys` must contain only strings; received " f"{keys!r}." ) keys_list = list(keys) else: @@ -157,9 +154,7 @@ def differential_abundance_df( # Reorder columns: known leading cols first, then the rest leading = ["var_id", "test_type", "group_by", "design"] ordered = [c for c in leading if c in result.columns] - remaining = [ - c for c in result.columns if c not in ordered - ] + remaining = [c for c in result.columns if c not in ordered] result = result[ordered + remaining] # Apply filters @@ -191,8 +186,8 @@ def differential_abundance_df( def tests(adata: AnnData) -> pd.DataFrame: - """ - Retrieve a summary of all differential abundance tests stored in ``.varm``. + """Retrieve a summary of all differential abundance tests stored in + ``.varm``. Scans the ``.varm`` slots of the AnnData object for statistical test results and returns a DataFrame summarizing the tests performed. @@ -246,23 +241,33 @@ def tests(adata: AnnData) -> pd.DataFrame: design_mode = "one_vs_rest" else: design_mode = "one_vs_one" - records.append({ - "key": key, - "test_type": parsed["test_type"], - "group_by": parsed["group_by"], - "design": design, - "design_label": parsed["design_label"], - "design_mode": design_mode, - "layer": parsed["layer"], - }) + records.append( + { + "key": key, + "test_type": parsed["test_type"], + "group_by": parsed["group_by"], + "design": design, + "design_label": parsed["design_label"], + "design_mode": design_mode, + "layer": parsed["layer"], + } + ) except ValueError: # Not a stat-test slot, skip continue if not records: return pd.DataFrame( - columns=["key", "key_group", "test_type", "group_by", "design", - "design_label", "design_mode", "layer"] + columns=[ + "key", + "key_group", + "test_type", + "group_by", + "design", + "design_label", + "design_mode", + "layer", + ] ) df = pd.DataFrame(records) @@ -276,7 +281,17 @@ def build_key_group(row): return ";".join(parts) df["key_group"] = df.apply(build_key_group, axis=1) - df = df[["key", "key_group", "test_type", "group_by", "design", - "design_label", "design_mode", "layer"]] + df = df[ + [ + "key", + "key_group", + "test_type", + "group_by", + "design", + "design_label", + "design_mode", + "layer", + ] + ] return df diff --git a/proteopy/pl/clustering.py b/proteopy/pl/clustering.py index 35e30c7..256980a 100644 --- a/proteopy/pl/clustering.py +++ b/proteopy/pl/clustering.py @@ -19,8 +19,7 @@ def _compute_wcss(X: np.ndarray, labels: np.ndarray) -> float: - """ - Compute within-cluster sum of squares. + """Compute within-cluster sum of squares. Parameters ---------- @@ -45,8 +44,8 @@ def _compute_wcss(X: np.ndarray, labels: np.ndarray) -> float: def hclustv_silhouette( adata: ad.AnnData, - linkage_key: str = 'auto', - values_key: str = 'auto', + linkage_key: str = "auto", + values_key: str = "auto", k: int = 15, figsize: tuple[float, float] = (6.0, 4.0), show: bool = True, @@ -54,8 +53,7 @@ def hclustv_silhouette( save: str | Path | None = None, verbose: bool = True, ) -> Axes | None: - """ - Plot silhouette scores for hierarchical clustering. + """Plot silhouette scores for hierarchical clustering. Evaluates clustering quality by computing the average silhouette score for cluster counts ranging from 2 to ``k``. Higher silhouette @@ -114,10 +112,7 @@ def hclustv_silhouette( raise ValueError("k must be at least 2 to compute silhouette scores.") linkage_key, values_key = _resolve_hclustv_keys( - adata, - linkage_key, - values_key, - verbose + adata, linkage_key, values_key, verbose ) Z = adata.uns[linkage_key] @@ -182,8 +177,8 @@ def hclustv_silhouette( def hclustv_elbow( adata: ad.AnnData, - linkage_key: str = 'auto', - values_key: str = 'auto', + linkage_key: str = "auto", + values_key: str = "auto", k: int = 15, figsize: tuple[float, float] = (6.0, 4.0), show: bool = True, @@ -191,8 +186,8 @@ def hclustv_elbow( save: str | Path | None = None, verbose: bool = True, ) -> Axes | None: - """ - Plot within-cluster sum of squares (elbow plot) for hierarchical clustering. + """Plot within-cluster sum of squares (elbow plot) for hierarchical + clustering. Evaluates clustering by computing WCSS for cluster counts ranging from 1 to ``k``. The "elbow" point where WCSS reduction diminishes suggests @@ -251,10 +246,7 @@ def hclustv_elbow( raise ValueError("k must be at least 1 to compute WCSS.") linkage_key, values_key = _resolve_hclustv_keys( - adata, - linkage_key, - values_key, - verbose + adata, linkage_key, values_key, verbose ) Z = adata.uns[linkage_key] @@ -320,7 +312,7 @@ def hclustv_elbow( def hclustv_profile_intensities( adata: ad.AnnData, profiles: str | list[str] | None = None, - profile_key: str = 'auto', + profile_key: str = "auto", group_by: str | pd.Series | dict | None = None, sort_by: str | pd.Series | dict | None = None, order: list[str] | None = None, @@ -331,10 +323,10 @@ def hclustv_profile_intensities( xlabel_rotation: float = 45, sort_by_label_rotation: float = 0, ylabel: str = "Intensity", - marker: str = 'o', + marker: str = "o", markersize: float = 6, linewidth: float = 1.5, - errorbar: str | tuple = 'se', + errorbar: str | tuple = "se", color: str | None = None, figsize: tuple[float, float] | None = None, show: bool = True, @@ -342,8 +334,7 @@ def hclustv_profile_intensities( save: str | Path | None = None, verbose: bool = True, ) -> list[Axes] | None: - """ - Plot cluster profile intensities across observations. + """Plot cluster profile intensities across observations. Displays line plots for each cluster profile showing how intensity varies across observations. When ``group_by`` is specified, observations @@ -463,9 +454,7 @@ def hclustv_profile_intensities( check_proteodata(adata) # Resolve profiles key - resolved_key = _resolve_hclustv_profile_key( - adata, profile_key, verbose - ) + resolved_key = _resolve_hclustv_profile_key(adata, profile_key, verbose) profiles_df = adata.uns[resolved_key] @@ -484,7 +473,7 @@ def hclustv_profile_intensities( # Determine which profiles to plot if profiles is None: max_profiles = n_cols * n_rows - selected_profiles = available_profiles[:min(6, max_profiles)] + selected_profiles = available_profiles[: min(6, max_profiles)] elif isinstance(profiles, str): selected_profiles = [profiles] else: @@ -547,12 +536,12 @@ def _extract_mapping(param, param_name): f"{param_name} column '{param}' not found in adata.obs." ) obs_col_data = adata.obs[param] - if hasattr(obs_col_data, 'cat'): + if hasattr(obs_col_data, "cat"): cat_order = obs_col_data.cat.categories.tolist() obs_in_profiles = profiles_df.index.intersection(adata.obs_names) mapping = adata.obs.loc[obs_in_profiles, param].to_dict() elif isinstance(param, pd.Series): - if hasattr(param, 'cat'): + if hasattr(param, "cat"): cat_order = param.cat.categories.tolist() mapping = param.to_dict() elif isinstance(param, dict): @@ -564,28 +553,30 @@ def _extract_mapping(param, param_name): ) return mapping, cat_order - group_mapping, group_category_order = _extract_mapping(group_by, 'group_by') - sort_mapping, sort_category_order = _extract_mapping(sort_by, 'sort_by') + group_mapping, group_category_order = _extract_mapping( + group_by, "group_by" + ) + sort_mapping, sort_category_order = _extract_mapping(sort_by, "sort_by") # Build long-form DataFrame for seaborn plot_data = profiles_df[selected_profiles].copy() plot_data = plot_data.reset_index() plot_data = plot_data.melt( id_vars=[plot_data.columns[0]], - var_name='profile', - value_name='intensity', + var_name="profile", + value_name="intensity", ) obs_col = plot_data.columns[0] # Determine x variable and apply grouping/sorting if group_mapping is not None: - plot_data['group'] = plot_data[obs_col].map(group_mapping) - plot_data = plot_data.dropna(subset=['group']) - x_var = 'group' + plot_data["group"] = plot_data[obs_col].map(group_mapping) + plot_data = plot_data.dropna(subset=["group"]) + x_var = "group" category_order = group_category_order elif sort_mapping is not None: - plot_data['_sort_group'] = plot_data[obs_col].map(sort_mapping) - plot_data = plot_data.dropna(subset=['_sort_group']) + plot_data["_sort_group"] = plot_data[obs_col].map(sort_mapping) + plot_data = plot_data.dropna(subset=["_sort_group"]) x_var = obs_col category_order = sort_category_order else: @@ -597,38 +588,38 @@ def _extract_mapping(param, param_name): group_order = order elif category_order is not None: if group_mapping is not None: - present_values = set(plot_data['group'].unique()) + present_values = set(plot_data["group"].unique()) elif sort_mapping is not None: - present_values = set(plot_data['_sort_group'].unique()) + present_values = set(plot_data["_sort_group"].unique()) else: present_values = set() group_order = [c for c in category_order if c in present_values] elif group_mapping is not None: - group_order = sorted(plot_data['group'].unique()) + group_order = sorted(plot_data["group"].unique()) elif sort_mapping is not None: - group_order = sorted(plot_data['_sort_group'].unique()) + group_order = sorted(plot_data["_sort_group"].unique()) else: group_order = None # Filter to only include specified groups if group_order is not None: if group_mapping is not None: - plot_data = plot_data[plot_data['group'].isin(group_order)] + plot_data = plot_data[plot_data["group"].isin(group_order)] elif sort_mapping is not None: - plot_data = plot_data[plot_data['_sort_group'].isin(group_order)] + plot_data = plot_data[plot_data["_sort_group"].isin(group_order)] # Determine x-axis order if group_mapping is not None: x_order = group_order elif sort_mapping is not None: # Sort observations by their group membership - plot_data['_sort_group'] = pd.Categorical( - plot_data['_sort_group'], categories=group_order, ordered=True + plot_data["_sort_group"] = pd.Categorical( + plot_data["_sort_group"], categories=group_order, ordered=True ) sorted_obs = ( - plot_data[[obs_col, '_sort_group']] + plot_data[[obs_col, "_sort_group"]] .drop_duplicates() - .sort_values('_sort_group')[obs_col] + .sort_values("_sort_group")[obs_col] .tolist() ) x_order = sorted_obs @@ -663,16 +654,16 @@ def _extract_mapping(param, param_name): for idx, profile_name in enumerate(selected_profiles): _ax = axes_flat[idx] - profile_data = plot_data[plot_data['profile'] == profile_name] + profile_data = plot_data[plot_data["profile"] == profile_name] if group_mapping is not None: sns.lineplot( data=profile_data, x=x_var, - y='intensity', - err_style='bars', + y="intensity", + err_style="bars", errorbar=errorbar, - err_kws={'capsize': 4}, + err_kws={"capsize": 4}, marker=marker, markersize=markersize, linewidth=linewidth, @@ -684,12 +675,12 @@ def _extract_mapping(param, param_name): sns.lineplot( data=profile_data, x=x_var, - y='intensity', + y="intensity", errorbar=None, marker=marker, markersize=markersize, linewidth=linewidth, - color=color if color else '#4C78A8', + color=color if color else "#4C78A8", ax=_ax, sort=False, ) @@ -711,7 +702,8 @@ def _extract_mapping(param, param_name): for group_label in group_order: # Find observations belonging to this group group_obs = [ - obs for obs in x_order + obs + for obs in x_order if sort_mapping.get(obs) == group_label ] if not group_obs: @@ -726,18 +718,18 @@ def _extract_mapping(param, param_name): center_x, label_y, str(group_label), - ha='center', - va='top', + ha="center", + va="top", fontsize=9, rotation=sort_by_label_rotation, ) # Set x-axis tick labels with rotation - _ax.tick_params(axis='x', rotation=xlabel_rotation) + _ax.tick_params(axis="x", rotation=xlabel_rotation) for label in _ax.get_xticklabels(): - label.set_ha('right') + label.set_ha("right") - _ax.set_xlabel('') + _ax.set_xlabel("") _ax.set_ylabel(ylabel) # Set subplot title diff --git a/proteopy/pl/copf.py b/proteopy/pl/copf.py index 4d2e3b7..2c22b5f 100644 --- a/proteopy/pl/copf.py +++ b/proteopy/pl/copf.py @@ -14,6 +14,7 @@ from proteopy.utils.anndata import check_proteodata + def proteoform_scores( adata: ad.AnnData, *, @@ -259,18 +260,15 @@ def _validate_threshold( if protein_id_key is not None: if protein_id_key not in adata.var.columns: raise ValueError( - f"Column '{protein_id_key}' not found " - "in `adata.var`." + f"Column '{protein_id_key}' not found " "in `adata.var`." ) # Validate 1-to-1 mapping. mapping_df = adata.var[ ["protein_id", protein_id_key] ].drop_duplicates() - dup_proteins = ( - mapping_df - .groupby("protein_id")[protein_id_key] - .nunique() - ) + dup_proteins = mapping_df.groupby("protein_id")[ + protein_id_key + ].nunique() bad = dup_proteins[dup_proteins > 1] if not bad.empty: raise ValueError( @@ -295,8 +293,7 @@ def _validate_threshold( # highlight_prots may contain protein_id_key # values — resolve them to protein_ids. known_labels = set(mapping_df[protein_id_key]) - resolved_pids = set() - unknown = (set(highlight_prots) - known_labels) + unknown = set(highlight_prots) - known_labels if unknown: raise ValueError( "The following values from " @@ -304,9 +301,7 @@ def _validate_threshold( f"`adata.var['{protein_id_key}']`: " f"{sorted(unknown)}" ) - highlight_pids = { - label_to_pid[v] for v in highlight_prots - } + highlight_pids = {label_to_pid[v] for v in highlight_prots} else: pid_to_label = None known_ids = set(adata.var["protein_id"]) @@ -377,9 +372,7 @@ def _validate_threshold( if save is not None: if not isinstance(save, (str, Path)): - raise TypeError( - "`save` must be a path-like object or None." - ) + raise TypeError("`save` must be a path-like object or None.") _fig.savefig(save, dpi=300, bbox_inches="tight") if show: plt.show() diff --git a/proteopy/pl/intensities.py b/proteopy/pl/intensities.py index 5b5c9d6..e094e0a 100644 --- a/proteopy/pl/intensities.py +++ b/proteopy/pl/intensities.py @@ -1,5 +1,4 @@ import warnings -from functools import partial from typing import Any from collections.abc import Sequence from collections.abc import Sequence as SequenceABC @@ -47,8 +46,8 @@ def peptide_intensities( ax: bool = False, color_scheme: Any = None, ) -> Axes | list[Axes] | None: - """ - Plot peptide intensities across samples for the requested proteins. + """Plot peptide intensities across samples for the requested + proteins. Parameters ---------- @@ -570,9 +569,8 @@ def intensity_box_per_sample( figsize: tuple[float, float] = (8, 5), color_scheme: Any | None = None, ) -> Axes: - """ - Plot intensity distributions as boxplots, either per observation or pooled - by a categorical grouping. + """Plot intensity distributions as boxplots, either per observation + or pooled by a categorical grouping. Parameters ---------- @@ -632,7 +630,8 @@ def intensity_box_per_sample( if save is not None and not isinstance(save, (str, os.PathLike)): raise TypeError("`save` must be a string, PathLike, or None.") - # Select the matrix to plot (layer vs X) while preserving dense/sparse inputs + # Select the matrix to plot (layer vs X) while preserving dense/sparse + # inputs if layer is not None: if layer not in adata.layers: raise KeyError(f"Layer '{layer}' not found in adata.layers.") @@ -970,8 +969,8 @@ def intensity_hist( ax: bool = False, save: str | os.PathLike[str] | None = None, ) -> Axes | None: - """ - Plot histogram(s) of var intensities, optionally colored by imputation status. + """Plot histogram(s) of var intensities, optionally colored by + imputation status. Parameters ---------- @@ -1189,7 +1188,7 @@ def intensity_hist( if not resolved_colors: resolved_colors = [default_palette[label] for label in status_labels] palette_map = dict(zip(status_labels, resolved_colors)) - hue_order = status_labels if color_imputed else None + hue_order = status_labels measured_color = palette_map.get("Measured", default_palette["Measured"]) value_col = "intensity_value" @@ -1486,8 +1485,7 @@ def abundance_rank( ax: bool = False, color_scheme: Any = None, ) -> Axes | None: - """ - Plot variable intensities vs their abundance rank. + """Plot variable intensities vs their abundance rank. A typical MS proteomics plot to assess dynamic range and intensity distribution. Each point represents a variable (protein/peptide) with @@ -2057,8 +2055,7 @@ def box( save: str | os.PathLike[str] | None = None, ax: bool = False, ) -> Axes | list[Axes] | None: - """ - Boxplot of intensities for one or more variables. + """Boxplot of intensities for one or more variables. Parameters ---------- @@ -2389,8 +2386,8 @@ def binary_heatmap( save: str | os.PathLike[str] | None = None, ax: Axes | None = None, ) -> Axes: - """ - Plot a binary detection heatmap of intensities across samples and features. + """Plot a binary detection heatmap of intensities across samples and + features. Values greater than ``threshold`` are encoded as 1 (present) and values less than or equal to ``threshold`` are encoded as 0 (absent). Missing diff --git a/proteopy/pl/sequence.py b/proteopy/pl/sequence.py index 358ede4..64caa75 100644 --- a/proteopy/pl/sequence.py +++ b/proteopy/pl/sequence.py @@ -115,7 +115,8 @@ def _resolve_sequences( sequences: dict[str, dict], allow_multi_match: bool, ) -> dict[str, dict]: - """Resolve each sequence entry to absolute ``[start, end)`` coordinates. + """Resolve each sequence entry to absolute ``[start, end)`` + coordinates. For entries with a ``"seq"`` key the amino-acid string is located within ``ref_sequence`` via substring search. Entries with a @@ -181,7 +182,8 @@ def _resolve_sequences( def _check_overlaps( groups: dict[str, list[tuple[int, int, str]]], ) -> None: - """Raise ``ValueError`` if any two sequences within a group overlap.""" + """Raise ``ValueError`` if any two sequences within a group + overlap.""" for group_name, intervals in groups.items(): sorted_intervals = sorted(intervals, key=lambda x: x[0]) for i in range(len(sorted_intervals) - 1): @@ -239,7 +241,8 @@ def _plot_sequences_on_reference( figsize: tuple[float, float] | None = None, ax: Axes | None = None, ) -> Axes: - """Render sequences as horizontal bars aligned to a reference sequence. + """Render sequences as horizontal bars aligned to a reference + sequence. Draws a grey reference bar at the bottom and one colored broken-barh row per group above it. Each entry in ``sequences`` must supply either a diff --git a/proteopy/pl/stat_tests.py b/proteopy/pl/stat_tests.py index 43b89bd..bec4916 100644 --- a/proteopy/pl/stat_tests.py +++ b/proteopy/pl/stat_tests.py @@ -23,8 +23,7 @@ def _stat_test_title_from_varm_slot( adata: ad.AnnData, varm_slot: str, ) -> str: - """ - Generate a human-readable plot title from a stat test varm slot. + """Generate a human-readable plot title from a stat test varm slot. Parses the varm slot name to extract test type, group_by, design (group comparison), and optional layer information, then formats them @@ -63,8 +62,7 @@ def _normalize_alt_color( adata: ad.AnnData, plot_index: pd.Index, ) -> pd.Series: - """ - Validate and align alternative color boolean mask to plot data. + """Validate and align alternative color boolean mask to plot data. Converts the user-provided ``alt_color`` input into a boolean Series indexed by ``adata.var_names``, then reindexes to match @@ -140,53 +138,34 @@ def _normalize_alt_color( series = series.reindex(plot_index) if series.isna().any(): raise ValueError( - "alt_color contains missing values after aligning to " - "varm data." + "alt_color contains missing values after aligning to " "varm data." ) return series def _validate_thresholds(fc_thresh, pval_thresh): if fc_thresh is not None: - if ( - not isinstance(fc_thresh, (int, float)) - or fc_thresh <= 0 - ): - raise ValueError( - "fc_thresh must be a positive number." - ) + if not isinstance(fc_thresh, (int, float)) or fc_thresh <= 0: + raise ValueError("fc_thresh must be a positive number.") if pval_thresh is not None: if ( not isinstance(pval_thresh, (int, float)) or pval_thresh <= 0 or pval_thresh > 1 ): - raise ValueError( - "pval_thresh must be a number in (0, 1]." - ) + raise ValueError("pval_thresh must be a number in (0, 1].") def _validate_labels(top_labels, highlight_labels): if top_labels is not None: - if ( - not isinstance(top_labels, int) - or top_labels < 0 - ): - raise ValueError( - "top_labels must be a non-negative integer." - ) + if not isinstance(top_labels, int) or top_labels < 0: + raise ValueError("top_labels must be a non-negative integer.") if highlight_labels is not None: if not isinstance(highlight_labels, list): - raise TypeError( - "highlight_labels must be a list of strings." - ) - if ( - top_labels is not None - and highlight_labels is not None - ): + raise TypeError("highlight_labels must be a list of strings.") + if top_labels is not None and highlight_labels is not None: raise ValueError( - "top_labels and highlight_labels are mutually " - "exclusive." + "top_labels and highlight_labels are mutually " "exclusive." ) @@ -194,14 +173,10 @@ def _validate_figsize(figsize): if ( not isinstance(figsize, (tuple, list)) or len(figsize) != 2 - or not all( - isinstance(v, (int, float)) and v > 0 - for v in figsize - ) + or not all(isinstance(v, (int, float)) and v > 0 for v in figsize) ): raise ValueError( - "figsize must be a tuple/list of 2 positive " - "numbers." + "figsize must be a tuple/list of 2 positive " "numbers." ) @@ -218,8 +193,7 @@ def _validate_volcano_inputs( figsize: tuple[float, float], yscale_log: bool, ) -> tuple[pd.DataFrame, str]: - """ - Validate all input parameters for the volcano plot function. + """Validate all input parameters for the volcano plot function. Checks the AnnData object, threshold values, label arguments, figure size, y-axis scale type, and the required columns in the @@ -306,23 +280,18 @@ def _validate_volcano_inputs( # -- Validate varm slot exists and contains a DataFrame if varm_slot not in adata.varm: - raise KeyError( - f"varm_slot '{varm_slot}' not found in " - f"adata.varm." - ) + raise KeyError(f"varm_slot '{varm_slot}' not found in " f"adata.varm.") results = adata.varm[varm_slot] if not isinstance(results, pd.DataFrame): raise TypeError( - "Expected adata.varm[varm_slot] to be a pandas " - "DataFrame." + "Expected adata.varm[varm_slot] to be a pandas " "DataFrame." ) # -- Validate required columns exist if fc_col not in results.columns: raise KeyError( - f"Column '{fc_col}' not found in varm slot " - f"'{varm_slot}'." + f"Column '{fc_col}' not found in varm slot " f"'{varm_slot}'." ) # Prioritize adjusted p-values, fall back to unadjusted @@ -334,17 +303,12 @@ def _validate_volcano_inputs( pval_col_used = "pval" else: raise KeyError( - f"Columns '{pval_col}' or 'pval' not found in " - f"'{varm_slot}'." + f"Columns '{pval_col}' or 'pval' not found in " f"'{varm_slot}'." ) - if ( - alt_labels_key is not None - and alt_labels_key not in adata.var.columns - ): + if alt_labels_key is not None and alt_labels_key not in adata.var.columns: raise KeyError( - f"alt_labels_key '{alt_labels_key}' not found " - f"in adata.var." + f"alt_labels_key '{alt_labels_key}' not found " f"in adata.var." ) return results, pval_col_used @@ -371,8 +335,7 @@ def volcano( save: str | Path | None = None, ax: Axes | None = None, ) -> Axes: - """ - Visualize differential abundance results as a volcano plot. + """Visualize differential abundance results as a volcano plot. Creates a scatter plot of log fold change (x-axis) versus p-value (y-axis) for proteins from a statistical test stored in @@ -523,9 +486,17 @@ def volcano( ... ) """ results, pval_col_used = _validate_volcano_inputs( - adata, varm_slot, fc_col, pval_col, alt_labels_key, - fc_thresh, pval_thresh, top_labels, - highlight_labels, figsize, yscale_log, + adata, + varm_slot, + fc_col, + pval_col, + alt_labels_key, + fc_thresh, + pval_thresh, + top_labels, + highlight_labels, + figsize, + yscale_log, ) fc_arr = results[fc_col].to_numpy() @@ -542,13 +513,16 @@ def volcano( alt_arr = None if alt_color is not None: alt_series = _normalize_alt_color( - alt_color, adata, results.index, + alt_color, + adata, + results.index, ) alt_arr = alt_series.to_numpy() if title is None: title = _stat_test_title_from_varm_slot( - adata, varm_slot, + adata, + varm_slot, ) if ylabel is None: @@ -579,15 +553,11 @@ def volcano( def _validate_varm_slot(adata, varm_slot): if varm_slot not in adata.varm: - raise KeyError( - f"varm_slot '{varm_slot}' not found in " - f"adata.varm." - ) + raise KeyError(f"varm_slot '{varm_slot}' not found in " f"adata.varm.") results = adata.varm[varm_slot] if not isinstance(results, pd.DataFrame): raise TypeError( - "Expected adata.varm[varm_slot] to be a " - "pandas DataFrame." + "Expected adata.varm[varm_slot] to be a " "pandas DataFrame." ) return results @@ -598,8 +568,7 @@ def _resolve_pval_column(results, varm_slot): if "pval" in results.columns: return "pval" raise KeyError( - f"Neither 'pval_adj' nor 'pval' found in " - f"varm slot '{varm_slot}'." + f"Neither 'pval_adj' nor 'pval' found in " f"varm slot '{varm_slot}'." ) @@ -611,8 +580,7 @@ def _validate_diff_abundance_inputs( show_pval, pval_fontsize, ): - """ - Validate inputs for :func:`differential_abundance_box`. + """Validate inputs for :func:`differential_abundance_box`. Checks the varm slot, ``top_n``, ``group_by`` column, layer, and ``pval_fontsize``. Parses the varm slot to @@ -652,30 +620,23 @@ def _validate_diff_abundance_inputs( if top_n is None: top_n = 10 if not isinstance(top_n, int) or top_n <= 0: - raise ValueError( - "top_n must be a positive integer." - ) + raise ValueError("top_n must be a positive integer.") # -- Parse varm slot metadata parsed = parse_stat_test_varm_slot( - varm_slot, adata=adata, + varm_slot, + adata=adata, ) group_by = parsed["group_by"] if group_by not in adata.obs.columns: - raise KeyError( - f"Column '{group_by}' not found in " - f"adata.obs." - ) + raise KeyError(f"Column '{group_by}' not found in " f"adata.obs.") # -- Resolve layer if layer is None: layer = parsed["layer"] if layer is not None and layer not in adata.layers: - raise KeyError( - f"Layer '{layer}' not found in " - f"adata.layers." - ) + raise KeyError(f"Layer '{layer}' not found in " f"adata.layers.") pval_col = _resolve_pval_column(results, varm_slot) @@ -683,14 +644,12 @@ def _validate_diff_abundance_inputs( if show_pval: if ( not isinstance( - pval_fontsize, (int, float), + pval_fontsize, + (int, float), ) or pval_fontsize <= 0 ): - raise ValueError( - "pval_fontsize must be a positive " - "number." - ) + raise ValueError("pval_fontsize must be a positive " "number.") return results, group_by, layer, pval_col, top_n @@ -705,8 +664,7 @@ def _prepare_diff_abundance_data( order, show_pval, ): - """ - Prepare long-format DataFrame for boxplot rendering. + """Prepare long-format DataFrame for boxplot rendering. Sorts variables by p-value, extracts intensities for the top N variables, melts into long format, applies @@ -745,20 +703,17 @@ def _prepare_diff_abundance_data( """ # -- Sort by p-value and select top N results_sorted = results.sort_values( - by=pval_col, ascending=True, + by=pval_col, + ascending=True, ) top_vars = results_sorted.head(top_n).index.tolist() if not top_vars: - raise ValueError( - "No valid variables found after filtering." - ) + raise ValueError("No valid variables found after filtering.") pvals_to_plot = None if show_pval: - pvals_to_plot = results.loc[ - top_vars, pval_col - ].reindex(top_vars) + pvals_to_plot = results.loc[top_vars, pval_col].reindex(top_vars) # -- Extract intensity matrix if layer is not None: @@ -796,19 +751,13 @@ def _prepare_diff_abundance_data( f"Available groups: " f"{sorted(available_groups)}" ) - df_long = df_long[ - df_long[group_by].isin(order) - ] + df_long = df_long[df_long[group_by].isin(order)] group_order = list(order) else: - group_order = ( - df_long[group_by].dropna().unique().tolist() - ) + group_order = df_long[group_by].dropna().unique().tolist() if df_long.empty: - raise ValueError( - "No data remaining after filtering." - ) + raise ValueError("No data remaining after filtering.") # -- Set variable categorical order (by significance) df_long["variable"] = pd.Categorical( @@ -827,8 +776,7 @@ def _annotate_boxplot_pvals( top_vars, pval_fontsize, ): - """ - Add per-variable p-value text annotations to boxplot axes. + """Add per-variable p-value text annotations to boxplot axes. Computes a uniform label y-position above the data range and annotates each variable's p-value centered @@ -850,14 +798,8 @@ def _annotate_boxplot_pvals( Font size for the annotation text. """ all_intensity = df_long["intensity"].to_numpy() - finite_intensity = all_intensity[ - np.isfinite(all_intensity) - ] - data_max = ( - float(finite_intensity.max()) - if finite_intensity.size - else 0.0 - ) + finite_intensity = all_intensity[np.isfinite(all_intensity)] + data_max = float(finite_intensity.max()) if finite_intensity.size else 0.0 y_min, y_max = _ax.get_ylim() span = y_max - y_min if y_max != y_min else 1.0 label_y = data_max + 0.05 * span @@ -896,8 +838,8 @@ def differential_abundance_box( save: str | Path | None = None, ax: bool | None = None, ) -> Axes | None: - """ - Display boxplots of intensities for top differentially abundant variables. + """Display boxplots of intensities for top differentially abundant + variables. For each of the top N differentially abundant variables (sorted by p-value), shows side-by-side boxplots comparing intensities across @@ -997,8 +939,12 @@ def differential_abundance_box( # -- Validate inputs results, group_by, layer, pval_col, top_n = ( _validate_diff_abundance_inputs( - adata, varm_slot, top_n, layer, - show_pval, pval_fontsize, + adata, + varm_slot, + top_n, + layer, + show_pval, + pval_fontsize, ) ) @@ -1009,8 +955,14 @@ def differential_abundance_box( # -- Prepare data df_long, top_vars, group_order, pvals_to_plot = ( _prepare_diff_abundance_data( - adata, results, top_n, pval_col, - group_by, layer, order, show_pval, + adata, + results, + top_n, + pval_col, + group_by, + layer, + order, + show_pval, ) ) @@ -1018,7 +970,8 @@ def differential_abundance_box( palette = None if color_scheme is not None: colors = _resolve_color_scheme( - color_scheme, group_order, + color_scheme, + group_order, ) if colors: palette = dict(zip(group_order, colors)) @@ -1034,15 +987,18 @@ def differential_abundance_box( hue_order=group_order, palette=palette, gap=0.1, - flierprops={'marker': '.', 'markersize': 1}, + flierprops={"marker": ".", "markersize": 1}, ax=_ax, ) # -- Annotate p-values if show_pval and pvals_to_plot is not None: _annotate_boxplot_pvals( - _ax, df_long, pvals_to_plot, - top_vars, pval_fontsize, + _ax, + df_long, + pvals_to_plot, + top_vars, + pval_fontsize, ) # -- Style axes @@ -1056,7 +1012,8 @@ def differential_abundance_box( if title is None: title = _stat_test_title_from_varm_slot( - adata, varm_slot, + adata, + varm_slot, ) _ax.set_title(title) diff --git a/proteopy/pl/stats.py b/proteopy/pl/stats.py index 4d9ed69..d52f73f 100644 --- a/proteopy/pl/stats.py +++ b/proteopy/pl/stats.py @@ -1,6 +1,7 @@ import warnings from pathlib import Path -from typing import Any, Sequence +from typing import Any +from collections.abc import Sequence import uuid import numpy as np @@ -41,14 +42,9 @@ def _validate_completeness_args( # noqa: C901 check_proteodata(adata) if axis not in (0, 1): - raise ValueError( - "`axis` must be either 0 (var) or 1 (obs)." - ) + raise ValueError("`axis` must be either 0 (var) or 1 (obs).") - if ( - group_by_resolution is not None - and group_by_partition is not None - ): + if group_by_resolution is not None and group_by_partition is not None: raise ValueError( "`group_by_resolution` and `group_by_partition` " "are mutually exclusive. Provide one or neither." @@ -63,18 +59,13 @@ def _validate_completeness_args( # noqa: C901 if fraction_thresh is not None and ( fraction_thresh < 0 or fraction_thresh > 1 ): - raise ValueError( - "`fraction_thresh` must be between 0 and 1." - ) + raise ValueError("`fraction_thresh` must be between 0 and 1.") if bin_width is not None and bin_width <= 0: - raise ValueError( - "`bin_width` must be a positive number." - ) + raise ValueError("`bin_width` must be a positive number.") - if ( - group_by_resolution is None - and (min_count is not None or min_fraction is not None) + if group_by_resolution is None and ( + min_count is not None or min_fraction is not None ): warnings.warn( "`min_count` and `min_fraction` are only used when " @@ -88,15 +79,12 @@ def _validate_completeness_args( # noqa: C901 matrix = adata.X else: if layer not in adata.layers: - raise KeyError( - f"Layer '{layer}' not found in adata.layers." - ) + raise KeyError(f"Layer '{layer}' not found in adata.layers.") matrix = adata.layers[layer] if matrix is None: raise ValueError( - "Selected matrix is empty; cannot compute " - "completeness." + "Selected matrix is empty; cannot compute " "completeness." ) n_obs, n_vars = matrix.shape @@ -113,14 +101,10 @@ def _validate_completeness_args( # noqa: C901 grouping_frame = adata.var if axis_length == 0: - raise ValueError( - "Cannot compute completeness on empty axis." - ) + raise ValueError("Cannot compute completeness on empty axis.") if n_items == 0: - raise ValueError( - "No items to compute completeness for." - ) + raise ValueError("No items to compute completeness for.") if order is not None and group_by_partition is None: warnings.warn( @@ -130,24 +114,36 @@ def _validate_completeness_args( # noqa: C901 ) return [ - matrix, axis_labels, n_items, axis_length, - grouping_frame, min_count, min_fraction, + matrix, + axis_labels, + n_items, + axis_length, + grouping_frame, + min_count, + min_fraction, ] def _summary_stats(values): """Return a single-row DataFrame of summary statistics.""" - s = pd.Series(values) if not isinstance( - values, pd.Series, - ) else values - return pd.DataFrame({ - "count": [s.count()], - "mean": [s.mean()], - "median": [s.median()], - "std": [s.std()], - "min": [s.min()], - "max": [s.max()], - }) + s = ( + pd.Series(values) + if not isinstance( + values, + pd.Series, + ) + else values + ) + return pd.DataFrame( + { + "count": [s.count()], + "mean": [s.mean()], + "median": [s.median()], + "std": [s.std()], + "min": [s.min()], + "max": [s.max()], + } + ) def _count_nonmissing(mat, ax, zero_to_na): @@ -198,9 +194,7 @@ def _resolve_partition_order(order, available): order = [order] else: order = list(order) - missing = [ - g for g in order if g not in available - ] + missing = [g for g in order if g not in available] if missing: raise ValueError( "Unknown group(s) in `order`: " @@ -211,7 +205,10 @@ def _resolve_partition_order(order, available): def _group_completeness_counts( - matrix, axis, g_mask, zero_to_na, + matrix, + axis, + g_mask, + zero_to_na, ): """Count non-missing values per item within a group mask.""" if axis == 0: @@ -239,7 +236,8 @@ def _plot_completeness_partition( figsize, ax, ): - """Plot boxplots of completeness partitioned by a grouping column.""" + """Plot boxplots of completeness partitioned by a grouping + column.""" if group_by_partition not in grouping_frame.columns: raise KeyError( f"Column '{group_by_partition}' not found " @@ -249,13 +247,13 @@ def _plot_completeness_partition( group_series = grouping_frame[group_by_partition] available = list(group_series.dropna().unique()) unique_groups = _resolve_partition_order( - order, available, + order, + available, ) if len(unique_groups) == 0: raise ValueError( - "No groups found for the given " - "`group_by_partition` column.", + "No groups found for the given " "`group_by_partition` column.", ) # -- compute completeness per item within each group @@ -263,35 +261,40 @@ def _plot_completeness_partition( for g in unique_groups: g_mask = (group_series == g).values counts_g, g_size = _group_completeness_counts( - matrix, axis, g_mask, zero_to_na, + matrix, + axis, + g_mask, + zero_to_na, ) fracs = counts_g / g_size for f in fracs: - records.append( - {"Group": str(g), "Completeness": f} - ) + records.append({"Group": str(g), "Completeness": f}) long_df = pd.DataFrame(records) if print_stats: print("Global:") - print(_summary_stats( - long_df["Completeness"], - ).to_string( - index=False, float_format="%.4f", - )) + print( + _summary_stats( + long_df["Completeness"], + ).to_string( + index=False, + float_format="%.4f", + ) + ) per_group = ( long_df.groupby("Group")["Completeness"] - .agg(["count", "mean", "median", - "std", "min", "max"]) + .agg(["count", "mean", "median", "std", "min", "max"]) .reindex( [str(g) for g in unique_groups], ) ) print(f"\nPer {group_by_partition}:") - print(per_group.to_string( - float_format="%.4f", - )) + print( + per_group.to_string( + float_format="%.4f", + ) + ) print() if ax is None: @@ -307,8 +310,7 @@ def _plot_completeness_partition( ax=_ax, ) _ax.set_title( - f"Completeness per {axis_labels[0]} " - f"by '{group_by_partition}'", + f"Completeness per {axis_labels[0]} " f"by '{group_by_partition}'", ) _ax.set_xlabel(group_by_partition) _ax.set_ylabel( @@ -352,9 +354,12 @@ def _plot_completeness_ungrouped( if print_stats: print("Global:") - print(_summary_stats(fractions).to_string( - index=False, float_format="%.4f", - )) + print( + _summary_stats(fractions).to_string( + index=False, + float_format="%.4f", + ) + ) print() if ax is None: @@ -379,7 +384,8 @@ def _plot_completeness_ungrouped( ) _ax.legend() plt.setp( - _ax.get_xticklabels(), rotation=xlabel_rotation, + _ax.get_xticklabels(), + rotation=xlabel_rotation, ) return fig, _ax @@ -409,15 +415,12 @@ def _plot_completeness_resolution( ) group_series = grouping_frame[group_by_resolution] - unique_groups = list( - group_series.dropna().unique() - ) + unique_groups = list(group_series.dropna().unique()) n_groups = len(unique_groups) if n_groups == 0: raise ValueError( - "No groups found for the given " - "`group_by_resolution` column.", + "No groups found for the given " "`group_by_resolution` column.", ) # Default threshold: min_count=1 @@ -431,13 +434,14 @@ def _plot_completeness_resolution( for g in unique_groups: g_mask = (group_series == g).values counts_g, group_size = _group_completeness_counts( - matrix, axis, g_mask, zero_to_na, + matrix, + axis, + g_mask, + zero_to_na, ) if use_fraction: - detected = ( - counts_g / group_size >= min_fraction - ) + detected = counts_g / group_size >= min_fraction else: detected = counts_g >= min_count @@ -447,11 +451,14 @@ def _plot_completeness_resolution( if print_stats: print("Global:") - print(_summary_stats( - detection_fractions, - ).to_string( - index=False, float_format="%.4f", - )) + print( + _summary_stats( + detection_fractions, + ).to_string( + index=False, + float_format="%.4f", + ) + ) print() if ax is None: @@ -460,19 +467,18 @@ def _plot_completeness_resolution( _ax = ax fig = _ax.get_figure() sns.histplot( - detection_fractions, bins=bin_edges, ax=_ax, + detection_fractions, + bins=bin_edges, + ax=_ax, ) if use_fraction: - threshold_label = ( - f"min_fraction={min_fraction}" - ) + threshold_label = f"min_fraction={min_fraction}" else: threshold_label = f"min_count={min_count}" _ax.set_title( - f"'{group_by_resolution}' completeness " - f"per {axis_labels[0]}", + f"'{group_by_resolution}' completeness " f"per {axis_labels[0]}", ) _ax.set_xlabel( f"Fraction of '{group_by_resolution}' groups " @@ -488,7 +494,8 @@ def _plot_completeness_resolution( ) _ax.legend() plt.setp( - _ax.get_xticklabels(), rotation=xlabel_rotation, + _ax.get_xticklabels(), + rotation=xlabel_rotation, ) return fig, _ax @@ -512,8 +519,8 @@ def completeness( ax: Axes | None = None, save: str | Path | None = None, ) -> Axes: - """ - Plot a histogram of completeness across observations or variables. + """Plot a histogram of completeness across observations or + variables. When ``group_by_resolution`` is provided, shows the distribution of the fraction of groups in which each item is "detected" (has at @@ -583,9 +590,15 @@ def completeness( The Matplotlib Axes object used for plotting. """ validated = _validate_completeness_args( - adata, axis, layer, order, - group_by_resolution, group_by_partition, - min_count, min_fraction, fraction_thresh, + adata, + axis, + layer, + order, + group_by_resolution, + group_by_partition, + min_count, + min_fraction, + fraction_thresh, bin_width, ) matrix = validated[0] @@ -597,7 +610,9 @@ def completeness( min_fraction = validated[6] bin_edges = np.arange( - 0.0, 1.0 + bin_width * 2, bin_width, + 0.0, + 1.0 + bin_width * 2, + bin_width, ) if group_by_partition is not None: @@ -674,8 +689,7 @@ def completeness_per_var( ax: Axes | None = None, save: str | Path | None = None, ) -> Axes: - """ - Plot a histogram of completeness per variable. + """Plot a histogram of completeness per variable. For each variable (column), completeness is the fraction of observations (rows) with non-missing values. When @@ -800,8 +814,7 @@ def completeness_per_sample( ax: Axes | None = None, save: str | Path | None = None, ) -> Axes: - """ - Plot a histogram of completeness per sample (observation). + """Plot a histogram of completeness per sample (observation). For each sample (row), completeness is the fraction of variables (columns) with non-missing values. When ``group_by_resolution`` @@ -922,30 +935,37 @@ def _append_unique(seq, value) -> None: def _n_var_summary_stats(series): """Return a one-row DataFrame of count summary stats.""" - return pd.DataFrame({ - "mean_count": [series.mean()], - "std_count": [series.std()], - "median_count": [series.median()], - "min_count": [series.min()], - "max_count": [series.max()], - }) + return pd.DataFrame( + { + "mean_count": [series.mean()], + "std_count": [series.std()], + "median_count": [series.median()], + "min_count": [series.min()], + "max_count": [series.max()], + } + ) def _add_pct_cols(df, total): """Add percentage columns to *df* in place.""" for col in [ - "mean", "std", "median", "min", "max", + "mean", + "std", + "median", + "min", + "max", ]: - df[f"{col}_pct"] = ( - df[f"{col}_count"] / total * 100 - ) + df[f"{col}_pct"] = df[f"{col}_count"] / total * 100 def _print_stats_df(df): """Print a DataFrame with one-decimal formatting.""" - print(df.to_string( - index=False, float_format="%.1f", - )) + print( + df.to_string( + index=False, + float_format="%.1f", + ) + ) _AGG_STATS = { @@ -976,16 +996,12 @@ def _validate_n_var_per_sample_args( # noqa: C901 "'peptide', 'protein', or None." ) if level == "peptide" and data_level == "protein": - raise ValueError( - "Cannot count peptides from " - "protein-level data." - ) + raise ValueError("Cannot count peptides from " "protein-level data.") # -- Mutual exclusivity if group_by is not None and order_by is not None: raise ValueError( - "`group_by` and `order_by` cannot be " - "used together." + "`group_by` and `order_by` cannot be " "used together." ) # -- Validate layer @@ -993,55 +1009,37 @@ def _validate_n_var_per_sample_args( # noqa: C901 matrix = adata.X else: if layer not in adata.layers: - raise KeyError( - f"Layer '{layer}' not found in " - "adata.layers." - ) + raise KeyError(f"Layer '{layer}' not found in " "adata.layers.") matrix = adata.layers[layer] if matrix is None: raise ValueError( - "Selected layer is empty; cannot " - "compute variable counts." + "Selected layer is empty; cannot " "compute variable counts." ) # -- Validate group_by column if group_by is not None: if group_by not in adata.obs.columns: - raise KeyError( - f"Column '{group_by}' not found " - "in adata.obs." - ) + raise KeyError(f"Column '{group_by}' not found " "in adata.obs.") # -- Validate order_by column if order_by is not None: if order_by not in adata.obs.columns: - raise KeyError( - f"Column '{order_by}' not found " - "in adata.obs." - ) + raise KeyError(f"Column '{order_by}' not found " "in adata.obs.") # -- Validate order elements if order is not None: if group_by is not None: - valid = set( - adata.obs[group_by].dropna().unique() - ) + valid = set(adata.obs[group_by].dropna().unique()) source = f"adata.obs['{group_by}']" elif order_by is not None: - valid = set( - adata.obs[order_by].dropna().unique() - ) + valid = set(adata.obs[order_by].dropna().unique()) source = f"adata.obs['{order_by}']" else: valid = set(adata.obs_names) source = "adata.obs_names" - invalid = [ - o for o in order if o not in valid - ] + invalid = [o for o in order if o not in valid] if invalid: - invalid_str = ", ".join( - map(str, invalid) - ) + invalid_str = ", ".join(map(str, invalid)) raise ValueError( f"Unknown value(s) in `order`: " f"{invalid_str}. Valid values " @@ -1064,7 +1062,11 @@ def _valid_mask(matrix, zero_to_na): def _n_var_count_per_sample( - matrix, zero_to_na, level, data_level, adata, + matrix, + zero_to_na, + level, + data_level, + adata, ): """Count non-missing vars per sample. @@ -1086,7 +1088,8 @@ def _n_var_count_per_sample( n_proteins = protein_codes.max() + 1 # OR-reduce peptide columns into protein columns prot_detected = np.zeros( - (valid.shape[0], n_proteins), dtype=bool, + (valid.shape[0], n_proteins), + dtype=bool, ) np.maximum.at( prot_detected, @@ -1103,8 +1106,13 @@ def _n_var_count_per_sample( def _n_var_derive_totals( - counts_array, level, data_level, - percentage, ylabel, title, adata, + counts_array, + level, + data_level, + percentage, + ylabel, + title, + adata, ): """Derive totals, percentage, ylabel, and title.""" if level == "protein" and data_level == "peptide": @@ -1115,12 +1123,9 @@ def _n_var_derive_totals( if percentage: if total_vars == 0: raise ValueError( - "Cannot compute percentage: " - "no variables found." + "Cannot compute percentage: " "no variables found." ) - counts_array = ( - counts_array / total_vars - ) * 100 + counts_array = (counts_array / total_vars) * 100 # -- Resolve y-axis label if ylabel is None: @@ -1128,15 +1133,9 @@ def _n_var_derive_totals( # -- Resolve title if title is None: - if level == "protein" or ( - level is None - and data_level == "protein" - ): + if level == "protein" or (level is None and data_level == "protein"): entity = "proteins" - elif level == "peptide" or ( - level is None - and data_level == "peptide" - ): + elif level == "peptide" or (level is None and data_level == "peptide"): entity = "peptides" else: entity = "variables" @@ -1146,7 +1145,10 @@ def _n_var_derive_totals( def _n_var_print_group_stats( - counts, stats_df, group_by, total_vars, + counts, + stats_df, + group_by, + total_vars, ): """Print global and per-group statistics.""" global_df = _n_var_summary_stats(counts["count"]) @@ -1160,27 +1162,38 @@ def _n_var_print_group_stats( def _n_var_resolve_bar_colors( - color_scheme, group_order, stats_df, group_by, + color_scheme, + group_order, + stats_df, + group_by, ): """Resolve bar colors from a color scheme.""" if color_scheme is None: return None colors = _resolve_color_scheme( - color_scheme, group_order, + color_scheme, + group_order, ) if colors is None: return None - return [ - colors[group_order.index(grp)] - for grp in stats_df[group_by] - ] + return [colors[group_order.index(grp)] for grp in stats_df[group_by]] def _n_var_group_by_path( - counts, adata, group_by, order, - color_scheme, total_vars, ylabel, title, - print_stats, figsize, xlabel_rotation, - save, show, ax=None, + counts, + adata, + group_by, + order, + color_scheme, + total_vars, + ylabel, + title, + print_stats, + figsize, + xlabel_rotation, + save, + show, + ax=None, ): """Plot mean +/- std bar chart grouped by an obs column.""" group_df = adata.obs[[group_by]].copy() @@ -1188,23 +1201,23 @@ def _n_var_group_by_path( "obs", ).reset_index() counts = pd.merge( - counts, group_df, on="obs", how="left", + counts, + group_df, + on="obs", + how="left", ) counts = counts.dropna(subset=[group_by]) if counts.empty: raise ValueError( - "No observations remain after " - "aligning `group_by` labels.", + "No observations remain after " "aligning `group_by` labels.", ) group_values = counts[group_by] if isinstance( - group_values.dtype, pd.CategoricalDtype, + group_values.dtype, + pd.CategoricalDtype, ): - group_values = ( - group_values.cat - .remove_unused_categories() - ) + group_values = group_values.cat.remove_unused_categories() counts[group_by] = group_values available_groups: list[Any] = [] @@ -1212,7 +1225,9 @@ def _n_var_group_by_path( _append_unique(available_groups, value) group_order = _n_var_resolve_group_order( - order, available_groups, group_values, + order, + available_groups, + group_values, ) # Append any groups not yet in order @@ -1221,29 +1236,30 @@ def _n_var_group_by_path( # -- Compute per-group statistics stats_df = ( - counts.groupby(group_by, observed=True)[ - "count" - ] + counts.groupby(group_by, observed=True)["count"] .agg(**_AGG_STATS) .reindex(group_order) ) stats_df = stats_df.dropna( subset=["mean_count"], ) - stats_df["std_count"] = ( - stats_df["std_count"].fillna(0.0) - ) + stats_df["std_count"] = stats_df["std_count"].fillna(0.0) stats_df = stats_df.reset_index() if print_stats: _n_var_print_group_stats( - counts, stats_df, group_by, total_vars, + counts, + stats_df, + group_by, + total_vars, ) # -- Plot grouped bar chart bar_colors = _n_var_resolve_bar_colors( - color_scheme, group_order, - stats_df, group_by, + color_scheme, + group_order, + stats_df, + group_by, ) if ax is not None: @@ -1272,7 +1288,8 @@ def _n_var_group_by_path( if save is not None: fig.savefig( - save, dpi=300, + save, + dpi=300, bbox_inches="tight", ) if show: @@ -1281,7 +1298,9 @@ def _n_var_group_by_path( def _n_var_resolve_group_order( - order, available_groups, group_values, + order, + available_groups, + group_values, ): """Resolve group ordering from order arg or categories.""" if order: @@ -1289,7 +1308,8 @@ def _n_var_resolve_group_order( group_order: list[Any] = [] for grp in order: if not _contains_value( - group_order, grp, + group_order, + grp, ): group_order.append(grp) return group_order @@ -1305,30 +1325,32 @@ def _n_var_resolve_group_order( def _n_var_resolve_obs_ordering( - counts, obs_df, group_key, order, - available_groups, ascending, + counts, + obs_df, + group_key, + order, + available_groups, + ascending, ): """Resolve observation ordering for the per-obs bar path.""" has_grouping = group_key != "_group" if has_grouping: group_order = _n_var_resolve_group_order( - order, available_groups, obs_df[group_key], + order, + available_groups, + obs_df[group_key], ) for grp in available_groups: _append_unique(group_order, grp) cat_index_map: dict[str, list[str]] = {} for grp in group_order: - obs_list = obs_df.loc[ - obs_df[group_key] == grp, "obs" - ].tolist() + obs_list = obs_df.loc[obs_df[group_key] == grp, "obs"].tolist() if obs_list: cat_index_map[str(grp)] = obs_list x_ordered = [ - obs - for obs_list in cat_index_map.values() - for obs in obs_list + obs for obs_list in cat_index_map.values() for obs in obs_list ] else: if order: @@ -1336,11 +1358,13 @@ def _n_var_resolve_obs_ordering( x_ordered: list[Any] = [] for obs_name in order: _append_unique( - x_ordered, obs_name, + x_ordered, + obs_name, ) for obs_name in counts["obs"]: _append_unique( - x_ordered, obs_name, + x_ordered, + obs_name, ) else: if ascending is not None: @@ -1349,24 +1373,30 @@ def _n_var_resolve_obs_ordering( ascending=ascending, kind="mergesort", ) - x_ordered = sorted_counts[ - "obs" - ].tolist() + x_ordered = sorted_counts["obs"].tolist() else: - x_ordered = counts[ - "obs" - ].tolist() + x_ordered = counts["obs"].tolist() cat_index_map = {"all": x_ordered} return x_ordered, cat_index_map def _n_var_plot_per_obs( - counts, x_ordered, cat_index_map, - group_key, order_by, total_vars, - color_scheme, ylabel, title, - print_stats, figsize, xlabel_rotation, - order_by_label_rotation, save, show, + counts, + x_ordered, + cat_index_map, + group_key, + order_by, + total_vars, + color_scheme, + ylabel, + title, + print_stats, + figsize, + xlabel_rotation, + order_by_label_rotation, + save, + show, ax=None, ): """Plot per-observation bars with group labels.""" @@ -1383,7 +1413,8 @@ def _n_var_plot_per_obs( _print_stats_df(global_df) print_df = ( counts.groupby( - order_by, observed=True, + order_by, + observed=True, )["count"] .agg(**_AGG_STATS) .reset_index() @@ -1399,24 +1430,20 @@ def _n_var_plot_per_obs( _print_stats_df(print_df) # -- Resolve colors - counts[group_key] = ( - counts[group_key].astype(str) - ) + counts[group_key] = counts[group_key].astype(str) unique_groups = list(cat_index_map.keys()) colors = _resolve_color_scheme( - color_scheme, unique_groups, + color_scheme, + unique_groups, ) plot_kwargs = {} if colors is not None: color_map = { - str(grp): colors[i] - for i, grp in enumerate(unique_groups) + str(grp): colors[i] for i, grp in enumerate(unique_groups) } - plot_kwargs["color"] = ( - counts[group_key].map(color_map).to_list() - ) + plot_kwargs["color"] = counts[group_key].map(color_map).to_list() # -- Plot per-observation bars if ax is not None: @@ -1442,10 +1469,8 @@ def _n_var_plot_per_obs( _ax.set_ylabel(ylabel) # -- Add group labels above bars - obs_idx_map = { - obs: i for i, obs in enumerate(x_ordered) - } - ymax = counts['count'].max() + obs_idx_map = {obs: i for i, obs in enumerate(x_ordered)} + ymax = counts["count"].max() for cat, obs_list in cat_index_map.items(): if not obs_list: continue @@ -1457,10 +1482,10 @@ def _n_var_plot_per_obs( x=mid_idx, y=ymax * 1.05, s=cat, - ha='center', - va='bottom', + ha="center", + va="bottom", fontsize=8, - fontweight='bold', + fontweight="bold", rotation=order_by_label_rotation, ) @@ -1469,7 +1494,9 @@ def _n_var_plot_per_obs( if save is not None: fig.savefig( - save, dpi=300, bbox_inches='tight', + save, + dpi=300, + bbox_inches="tight", ) if show: plt.show() @@ -1498,8 +1525,8 @@ def n_var_per_sample( ax: Axes | None = None, save: str | Path | None = None, ) -> Axes: - """ - Plot the number of detected variables (peptides or protein) per sample. + """Plot the number of detected variables (peptides or protein) per + sample. Parameters ---------- @@ -1591,24 +1618,33 @@ def n_var_per_sample( ... order=["LBaso", "Ortho"], ... ) """ - data_level, level, matrix = ( - _validate_n_var_per_sample_args( - adata, level, group_by, order_by, - order, layer, - ) + data_level, level, matrix = _validate_n_var_per_sample_args( + adata, + level, + group_by, + order_by, + order, + layer, ) # -- Count non-missing vars per sample counts_array = _n_var_count_per_sample( - matrix, zero_to_na, level, data_level, adata, + matrix, + zero_to_na, + level, + data_level, + adata, ) # -- Derive totals, percentage, ylabel, and title - total_vars, counts_array, ylabel, title = ( - _n_var_derive_totals( - counts_array, level, data_level, - percentage, ylabel, title, adata, - ) + total_vars, counts_array, ylabel, title = _n_var_derive_totals( + counts_array, + level, + data_level, + percentage, + ylabel, + title, + adata, ) # -- Build counts DataFrame @@ -1625,15 +1661,13 @@ def n_var_per_sample( if ascending is not None: if group_by is not None: warnings.warn( - "`ascending` is ignored when " - "`group_by` is set.", + "`ascending` is ignored when " "`group_by` is set.", UserWarning, stacklevel=2, ) elif order is not None: warnings.warn( - "`ascending` is ignored when " - "`order` is set explicitly.", + "`ascending` is ignored when " "`order` is set explicitly.", UserWarning, stacklevel=2, ) @@ -1641,17 +1675,25 @@ def n_var_per_sample( # -- group_by path: mean +/- std bar plot per group if group_by is not None: return _n_var_group_by_path( - counts, adata, group_by, order, - color_scheme, total_vars, ylabel, - title, print_stats, figsize, - xlabel_rotation, save, show, ax, + counts, + adata, + group_by, + order, + color_scheme, + total_vars, + ylabel, + title, + print_stats, + figsize, + xlabel_rotation, + save, + show, + ax, ) # -- Per-observation bar plot (with optional order_by) has_grouping = order_by is not None - group_key = ( - order_by if has_grouping else "_group" - ) + group_key = order_by if has_grouping else "_group" # Attach grouping column to counts if has_grouping: @@ -1661,7 +1703,10 @@ def n_var_per_sample( "obs", ).reset_index() counts = pd.merge( - counts, obs, on="obs", how="left", + counts, + obs, + on="obs", + how="left", ) else: counts[group_key] = counts["obs"] @@ -1678,20 +1723,20 @@ def n_var_per_sample( obs_df[group_key].dtype, pd.CategoricalDtype, ): - obs_df[group_key] = ( - obs_df[group_key].astype("category") - ) + obs_df[group_key] = obs_df[group_key].astype("category") available_groups: list[Any] = [] for value in obs_df[group_key]: _append_unique(available_groups, value) # -- Resolve observation ordering - x_ordered, cat_index_map = ( - _n_var_resolve_obs_ordering( - counts, obs_df, group_key, order, - available_groups, ascending, - ) + x_ordered, cat_index_map = _n_var_resolve_obs_ordering( + counts, + obs_df, + group_key, + order, + available_groups, + ascending, ) counts["obs"] = pd.Categorical( @@ -1703,11 +1748,22 @@ def n_var_per_sample( # -- Plot per-observation bars return _n_var_plot_per_obs( - counts, x_ordered, cat_index_map, - group_key, order_by, total_vars, - color_scheme, ylabel, title, - print_stats, figsize, xlabel_rotation, - order_by_label_rotation, save, show, ax, + counts, + x_ordered, + cat_index_map, + group_key, + order_by, + total_vars, + color_scheme, + ylabel, + title, + print_stats, + figsize, + xlabel_rotation, + order_by_label_rotation, + save, + show, + ax, ) @@ -1779,8 +1835,7 @@ def n_samples_per_category( save: str | Path | None = None, ax: bool = False, ) -> Axes | None: - """ - Plot sample (obs) counts per category (optionally stratified). + """Plot sample (obs) counts per category (optionally stratified). Parameters ---------- @@ -1887,14 +1942,18 @@ def _ordered_categories(series: pd.Series) -> list[Any]: if selected_categories is not None: first_level_order = [ - category for category in selected_categories if category in first_level_order + category + for category in selected_categories + if category in first_level_order ] if order is not None: if isinstance(order, str): specified = [order] else: specified = list(order) - unknown_specified = [cat for cat in specified if cat not in first_level_order] + unknown_specified = [ + cat for cat in specified if cat not in first_level_order + ] if unknown_specified: raise ValueError( "Order values not present in the first category column: " @@ -1948,13 +2007,13 @@ def _ordered_categories(series: pd.Series) -> list[Any]: _ax.yaxis.set_major_locator(MaxNLocator(integer=True)) _ax.set_xlabel(first_cat_col) - _ax.set_ylabel('#') + _ax.set_ylabel("#") ha = ( - 'right' if xlabel_rotation > 0 - else 'left' if xlabel_rotation < 0 - else 'center' - ) + "right" + if xlabel_rotation > 0 + else "left" if xlabel_rotation < 0 else "center" + ) plt.setp(_ax.get_xticklabels(), rotation=xlabel_rotation, ha=ha) fig.tight_layout() @@ -1990,9 +2049,8 @@ def n_cat1_per_cat2_hist( save: str | Path | None = None, ax: Axes | None = None, ) -> Axes: - """ - Plot the distribution of the number of first-category entries per second - category. + """Plot the distribution of the number of first-category entries per + second category. Parameters ---------- @@ -2063,9 +2121,15 @@ def n_cat1_per_cat2_hist( ) lower, upper = bin_range if lower >= upper: - raise ValueError("bin_range lower bound must be less than upper bound.") + raise ValueError( + "bin_range lower bound must be less than upper bound." + ) - temp_col = "__proteopy_axis_index__" if first_category == "index" else first_category + temp_col = ( + "__proteopy_axis_index__" + if first_category == "index" + else first_category + ) data = frame[[second_category]].copy() if first_category == "index": index_values = adata.obs_names if axis == 0 else adata.var_names @@ -2125,9 +2189,8 @@ def n_cat1_per_cat2_hist( return _ax -docstr_header = ( - "Plot the distribution of the number of first-category entries per second category." - ) + +docstr_header = "Plot the distribution of the number of first-category entries per second category." n_peptides_per_protein = partial_with_docsig( n_cat1_per_cat2_hist, first_category="peptide_id", @@ -2166,8 +2229,8 @@ def cv_by_group( save: str | None = None, print_stats: bool = False, ): - """ - Compute per-group coefficients of variation and plot their distributions. + """Compute per-group coefficients of variation and plot their + distributions. Parameters ---------- @@ -2295,13 +2358,17 @@ def cv_by_group( if temp_key_name is not None: del adata.varm[temp_key_name] - df_melted = cv_df.melt(var_name="Group", value_name="CV", ignore_index=False) + df_melted = cv_df.melt( + var_name="Group", value_name="CV", ignore_index=False + ) df_melted = df_melted.reset_index(drop=True) if order is None: order = unique_groups else: - missing = [grp for grp in order if grp not in df_melted["Group"].unique()] + missing = [ + grp for grp in order if grp not in df_melted["Group"].unique() + ] if missing: raise ValueError( "Requested ordering includes groups with no CV data: " @@ -2316,14 +2383,16 @@ def cv_by_group( if print_stats: cv_values = df_melted["CV"].dropna() - global_summary = pd.DataFrame({ - "Count": [cv_values.count()], - "Min": [round(cv_values.min(), 4)], - "Max": [round(cv_values.max(), 4)], - "Median": [round(cv_values.median(), 4)], - "Mean": [round(cv_values.mean(), 4)], - "Std": [round(cv_values.std(), 4)], - }) + global_summary = pd.DataFrame( + { + "Count": [cv_values.count()], + "Min": [round(cv_values.min(), 4)], + "Max": [round(cv_values.max(), 4)], + "Median": [round(cv_values.median(), 4)], + "Mean": [round(cv_values.mean(), 4)], + "Std": [round(cv_values.std(), 4)], + } + ) print("Global CV Summary:") print(global_summary.to_string(index=False)) print() @@ -2353,14 +2422,13 @@ def cv_by_group( if total_count > 0 else 0.0 ) - global_thresh = pd.DataFrame({ - "Count below": [int(below_count)], - "Percentage below": [pct], - }) - print( - f"Global Threshold Summary " - f"(hline={hline}):" + global_thresh = pd.DataFrame( + { + "Count below": [int(below_count)], + "Percentage below": [pct], + } ) + print(f"Global Threshold Summary " f"(hline={hline}):") print(global_thresh.to_string(index=False)) print() @@ -2368,14 +2436,14 @@ def _thresh_stats(group_cv): n_below = (group_cv < hline).sum() n_total = group_cv.count() pct_below = ( - round(n_below / n_total * 100, 4) - if n_total > 0 - else 0.0 + round(n_below / n_total * 100, 4) if n_total > 0 else 0.0 + ) + return pd.Series( + { + "Count below": int(n_below), + "Percentage below": pct_below, + } ) - return pd.Series({ - "Count below": int(n_below), - "Percentage below": pct_below, - }) per_group_thresh = ( df_melted.groupby("Group")["CV"] @@ -2383,10 +2451,7 @@ def _thresh_stats(group_cv): .unstack() .reindex(order) ) - print( - f"Per-Group Threshold Summary " - f"(hline={hline}):" - ) + print(f"Per-Group Threshold Summary " f"(hline={hline}):") print(per_group_thresh.to_string()) print() @@ -2480,8 +2545,7 @@ def sample_correlation_matrix( print_stats: bool = False, save: str | Path | None = None, ) -> Axes | None: - """ - Plot a clustered correlation heatmap across samples (obs). + """Plot a clustered correlation heatmap across samples (obs). Parameters ---------- @@ -2546,7 +2610,9 @@ def sample_correlation_matrix( matrix = adata.layers[layer] if matrix is None: - raise ValueError("Selected matrix is empty; cannot compute correlations.") + raise ValueError( + "Selected matrix is empty; cannot compute correlations." + ) if matrix.shape != expected_shape: raise ValueError( @@ -2555,7 +2621,9 @@ def sample_correlation_matrix( ) if isinstance(matrix, pd.DataFrame): - vals = matrix.reindex(index=adata.obs_names, columns=adata.var_names).copy() + vals = matrix.reindex( + index=adata.obs_names, columns=adata.var_names + ).copy() else: if sparse.issparse(matrix): # correlation requires dense values; convert temporarily @@ -2609,7 +2677,9 @@ def sample_correlation_matrix( sns.color_palette(n_colors=len(cats)) if len(cats) > 0 else [] ) - palette = {str(cat): color for cat, color in zip(cats, resolved_colors)} + palette = { + str(cat): color for cat, color in zip(cats, resolved_colors) + } groups_str = groups.astype("string") row_color_series = groups_str.map(palette) @@ -2623,7 +2693,9 @@ def sample_correlation_matrix( ) legend_handles = [ - Patch(facecolor=palette[str(cat)], edgecolor="none", label=str(cat)) + Patch( + facecolor=palette[str(cat)], edgecolor="none", label=str(cat) + ) for cat in cats ] @@ -2636,7 +2708,9 @@ def sample_correlation_matrix( ) row_colors = ( - row_color_series.to_numpy() if row_color_series is not None else None + row_color_series.to_numpy() + if row_color_series is not None + else None ) # ---- hierarchical clustering on (1 - r) @@ -2648,30 +2722,29 @@ def sample_correlation_matrix( # ---- optional statistics printout if print_stats and n > 1: # 1) Overall off-diagonal summary - summary = pd.DataFrame({ - "min": [np.nanmin(offdiag)], - "max": [np.nanmax(offdiag)], - "mean": [np.nanmean(offdiag)], - "median": [np.nanmedian(offdiag)], - "std": [np.nanstd(offdiag)], - }) - print( - f"Sample correlation summary " - f"(off-diagonal, {method}):" + summary = pd.DataFrame( + { + "min": [np.nanmin(offdiag)], + "max": [np.nanmax(offdiag)], + "mean": [np.nanmean(offdiag)], + "median": [np.nanmedian(offdiag)], + "std": [np.nanstd(offdiag)], + } ) + print(f"Sample correlation summary " f"(off-diagonal, {method}):") print(summary.to_string(index=False)) print() # 2) Per-sample mean correlation (dendrogram order) mask = ~np.eye(n, dtype=bool) - per_sample_mean = np.nanmean( - np.where(mask, A, np.nan), axis=1 - ) + per_sample_mean = np.nanmean(np.where(mask, A, np.nan), axis=1) heatmap_order = leaves_list(Z) - per_sample_df = pd.DataFrame({ - "sample_id": corr_df.index[heatmap_order], - "mean_corr": per_sample_mean[heatmap_order], - }) + per_sample_df = pd.DataFrame( + { + "sample_id": corr_df.index[heatmap_order], + "mean_corr": per_sample_mean[heatmap_order], + } + ) print("Per-sample mean correlation:") print(per_sample_df.to_string(index=False)) print() @@ -2680,44 +2753,35 @@ def sample_correlation_matrix( if margin_color is not None: if margin_color not in adata.obs.columns: raise KeyError( - f"Column '{margin_color}' not found " - f"in adata.obs." + f"Column '{margin_color}' not found " f"in adata.obs." ) - groups_ps = adata.obs.loc[ - corr_df.index, margin_color - ] + groups_ps = adata.obs.loc[corr_df.index, margin_color] unique_groups = groups_ps.dropna().unique() group_rows = [] for grp in sorted(unique_groups): - grp_idx = groups_ps[ - groups_ps == grp - ].index + grp_idx = groups_ps[groups_ps == grp].index other_idx = groups_ps[ (groups_ps != grp) & groups_ps.notna() ].index within = corr_df.loc[grp_idx, grp_idx] - within_vals = within.values[ - ~np.eye(len(grp_idx), dtype=bool) - ] + within_vals = within.values[~np.eye(len(grp_idx), dtype=bool)] mean_within = ( - np.nanmean(within_vals) - if len(within_vals) > 0 - else np.nan + np.nanmean(within_vals) if len(within_vals) > 0 else np.nan ) if len(other_idx) > 0: between_vals = corr_df.loc[ grp_idx, other_idx ].values.ravel() - mean_between = np.nanmean( - between_vals - ) + mean_between = np.nanmean(between_vals) else: mean_between = np.nan - group_rows.append({ - "group": grp, - "mean_within": mean_within, - "mean_between": mean_between, - }) + group_rows.append( + { + "group": grp, + "mean_within": mean_within, + "mean_between": mean_between, + } + ) group_df = pd.DataFrame(group_rows) print("Per-group mean correlation:") print(group_df.to_string(index=False)) @@ -2731,7 +2795,7 @@ def sample_correlation_matrix( row_colors=row_colors, col_colors=row_colors if row_colors is not None else None, cmap=cmap, - center=center_val, + center=center_val, figsize=figsize, xticklabels=xticklabels, yticklabels=yticklabels, @@ -2744,8 +2808,8 @@ def sample_correlation_matrix( handles=legend_handles, title=margin_color, bbox_to_anchor=(1.05, 1), - loc='upper left', - borderaxespad=0., + loc="upper left", + borderaxespad=0.0, frameon=False, ) @@ -2783,7 +2847,10 @@ def hclustv_profiles_heatmap( row_cluster: bool = True, col_cluster: bool = True, cbar_pos: tuple[float, float, float, float] | None = ( - 0.02, 0.8, 0.05, 0.18 + 0.02, + 0.8, + 0.05, + 0.18, ), tree_kws: dict | None = None, xticklabels: bool = True, @@ -2794,8 +2861,8 @@ def hclustv_profiles_heatmap( ax: bool = False, save: str | Path | None = None, ) -> Axes | None: - """ - Plot a clustered heatmap of variable profiles across samples or groups. + """Plot a clustered heatmap of variable profiles across samples or + groups. Computes z-scores for each variable across samples (or group summaries), then applies hierarchical clustering to visualize variable expression @@ -2923,19 +2990,19 @@ def hclustv_profiles_heatmap( raise KeyError(f"Column '{order_by}' not found in adata.obs.") # order_by and col_cluster are mutually exclusive; disable clustering if col_cluster: - print(( + print( "`order_by` parameter is incompatible with `col_cluster=True`. " "`col_cluster` has been overridden." - )) + ) col_cluster = False # Validate order parameter if order is not None: if col_cluster: - print(( + print( "`order` parameter is incompatible with `col_cluster=True`. " "`col_cluster` has been overridden." - )) + ) col_cluster = False order = list(order) if order_by is None and group_by is None: @@ -3061,8 +3128,9 @@ def hclustv_profiles_heatmap( ) sorted_idx = ( pd.Series(order_col_values, index=filtered_cols) - .sort_values().index - ) + .sort_values() + .index + ) else: # Use categorical order if categorical, sorted order otherwise if isinstance(order_col_values.dtype, pd.CategoricalDtype): @@ -3072,10 +3140,14 @@ def hclustv_profiles_heatmap( categories=cat_order, ordered=True, ) - sorted_idx = pd.Series( - order_col_values, - index=z_df_filled.columns, - ).sort_values().index + sorted_idx = ( + pd.Series( + order_col_values, + index=z_df_filled.columns, + ) + .sort_values() + .index + ) else: sorted_idx = order_col_values.sort_values().index z_df_filled = z_df_filled[sorted_idx] @@ -3105,7 +3177,8 @@ def hclustv_profiles_heatmap( if resolved_colors is None: resolved_colors = ( sns.color_palette("husl", n_colors=len(unique_cats)) - if len(unique_cats) > 0 else [] + if len(unique_cats) > 0 + else [] ) color_map = dict(zip(unique_cats, resolved_colors)) col_colors = pd.Series( diff --git a/proteopy/pp/__init__.py b/proteopy/pp/__init__.py index f60a244..166f18a 100644 --- a/proteopy/pp/__init__.py +++ b/proteopy/pp/__init__.py @@ -7,15 +7,15 @@ filter_samples_by_category_count, remove_zero_variance_vars, remove_contaminants, - ) +) from .imputation import ( impute_downshift, - ) +) from .normalization import ( normalize_median, - ) +) from .quantification import ( extract_peptide_groups, @@ -24,6 +24,6 @@ quantify_by_category, quantify_proteins, quantify_proteoforms, - ) +) from .stats import calculate_cv diff --git a/proteopy/pp/filtering.py b/proteopy/pp/filtering.py index 5f48839..4a9423b 100644 --- a/proteopy/pp/filtering.py +++ b/proteopy/pp/filtering.py @@ -1,6 +1,6 @@ import warnings from pathlib import Path -from typing import Callable +from collections.abc import Callable import numpy as np import pandas as pd import scipy.sparse as sp @@ -20,8 +20,8 @@ def filter_axis( zero_to_na=False, inplace=True, ): - """ - Filter observations or variables based on non-missing value content. + """Filter observations or variables based on non-missing value + content. This function filters the AnnData object along a specified axis (observations or variables) based on the fraction or number of non-missing (np.nan) values. @@ -85,7 +85,7 @@ def filter_axis( axis_i = 1 - axis axis_labels = adata.obs_names if axis == 0 else adata.var_names - completeness = None # assigned below when min_fraction is set + completeness = None # assigned below when min_fraction is set if group_by is not None: metadata = adata.obs if axis == 1 else adata.var @@ -127,7 +127,9 @@ def filter_axis( if not completeness_by_group: completeness = pd.Series(0, index=axis_labels, dtype=float) else: - completeness = pd.concat(completeness_by_group, axis=1).max(axis=1) + completeness = pd.concat(completeness_by_group, axis=1).max( + axis=1 + ) else: if sp.issparse(X): counts = pd.Series(X.getnnz(axis=axis_i), index=axis_labels) @@ -158,7 +160,9 @@ def filter_axis( check_proteodata(adata) return None else: - adata_filtered = adata[mask_filt, :] if axis == 0 else adata[:, mask_filt] + adata_filtered = ( + adata[mask_filt, :] if axis == 0 else adata[:, mask_filt] + ) check_proteodata(adata_filtered) return adata_filtered @@ -174,7 +178,7 @@ def filter_axis( filter_axis, axis=0, docstr_header=docstr_header, - ) +) docstr_header = """ Filter observations based on data completeness. @@ -188,7 +192,7 @@ def filter_axis( axis=0, min_count=None, docstr_header=docstr_header, - ) +) docstr_header = """ Filter variables based on non-missing value content. @@ -201,7 +205,7 @@ def filter_axis( filter_axis, axis=1, docstr_header=docstr_header, - ) +) docstr_header = """ Filter variables based on data completeness. @@ -215,7 +219,7 @@ def filter_axis( axis=1, min_count=None, docstr_header=docstr_header, - ) +) def filter_proteins_by_peptide_count( @@ -224,9 +228,8 @@ def filter_proteins_by_peptide_count( max_count=None, protein_col="protein_id", inplace=True, - ): - """ - Filter proteins by their peptide count. +): + """Filter proteins by their peptide count. Parameters ---------- @@ -248,9 +251,9 @@ def filter_proteins_by_peptide_count( """ check_proteodata(adata) if is_proteodata(adata)[1] != "peptide": - raise ValueError(( + raise ValueError( "`AnnData` object must be in ProteoData peptide format." - )) + ) if min_count is None and max_count is None: warnings.warn("Pass at least one argument: min_count | max_count") @@ -265,7 +268,9 @@ def filter_proteins_by_peptide_count( if max_count is not None: if max_count < 0: raise ValueError("`max_count` must be non-negative.") - if (min_count is not None and max_count is not None) and (min_count > max_count): + if (min_count is not None and max_count is not None) and ( + min_count > max_count + ): raise ValueError("`min_count` cannot be greater than `max_count`.") if protein_col not in adata.var.columns: @@ -312,10 +317,9 @@ def filter_samples_by_category_count( min_count=None, max_count=None, inplace=True, - ): - """ - Filter observations by the frequency of their category value in a ``.vars`` - metadata column. +): + """Filter observations by the frequency of their category value in a + ``.vars`` metadata column. Parameters ---------- @@ -354,7 +358,9 @@ def filter_samples_by_category_count( raise ValueError("`min_count` cannot be greater than `max_count`.") if category_col not in adata.obs.columns: - raise KeyError(f"`category_col`='{category_col}' not found in adata.obs") + raise KeyError( + f"`category_col`='{category_col}' not found in adata.obs" + ) obs_series = adata.obs[category_col] counts = obs_series.value_counts(dropna=False) @@ -399,26 +405,22 @@ def _validate_remove_zero_variance_vars_input( ) if not isinstance(atol, (int, float)): raise TypeError( - f"`atol` must be a numeric value, " - f"got {type(atol).__name__}." + f"`atol` must be a numeric value, " f"got {type(atol).__name__}." ) if atol < 0: raise ValueError("`atol` must be non-negative.") if not isinstance(inplace, bool): raise TypeError( - f"`inplace` must be a bool, " - f"got {type(inplace).__name__}." + f"`inplace` must be a bool, " f"got {type(inplace).__name__}." ) if not isinstance(verbose, bool): raise TypeError( - f"`verbose` must be a bool, " - f"got {type(verbose).__name__}." + f"`verbose` must be a bool, " f"got {type(verbose).__name__}." ) if group_by is not None: if group_by not in adata.obs.columns: raise KeyError( - f"`group_by`='{group_by}' not found " - f"in adata.obs" + f"`group_by`='{group_by}' not found " f"in adata.obs" ) if adata.obs[group_by].isna().any(): raise ValueError( @@ -434,8 +436,8 @@ def remove_zero_variance_vars( inplace=True, verbose=False, ): - """ - Remove variables with near-zero or zero variance, skipping NaN values. + """Remove variables with near-zero or zero variance, skipping NaN + values. Variables whose variance is at or below ``atol`` are removed. Variables that are entirely NaN — globally or within any group @@ -532,7 +534,11 @@ def remove_zero_variance_vars( ['p1'] """ _validate_remove_zero_variance_vars_input( - adata, group_by, atol, inplace, verbose, + adata, + group_by, + atol, + inplace, + verbose, ) check_proteodata(adata) X = adata.X @@ -560,10 +566,7 @@ def remove_zero_variance_vars( if idx.size == 0: continue Xg = X[idx, :] - Xg_arr = ( - Xg.toarray() if sp.issparse(Xg) - else np.asarray(Xg) - ) + Xg_arr = Xg.toarray() if sp.issparse(Xg) else np.asarray(Xg) with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) vg = np.nanvar(Xg_arr, axis=0, ddof=0) @@ -608,9 +611,9 @@ def remove_contaminants( protein_key="protein_id", header_parser: Callable[[str], str] | None = None, inplace=True, - ): - """ - Remove variables whose protein identifier matches a contaminant FASTA entry. +): + """Remove variables whose protein identifier matches a contaminant + FASTA entry. Parameters ---------- @@ -638,6 +641,7 @@ def remove_contaminants( check_proteodata(adata) if header_parser is None: + def header_parser(header: str) -> str: parts = header.split("|") return parts[1] if len(parts) > 1 else header @@ -648,13 +652,16 @@ def _load_contaminant_ids_from_fasta(fasta_path: Path) -> set[str]: parsed = header_parser(record.id) if parsed == "": warnings.warn( - f"Header parser returned empty ID for record '{record.id}'.", + f"Header parser returned empty ID for record '{ + record.id}'.", ) continue contaminant_ids.add(parsed) return contaminant_ids - def _load_contaminant_ids_from_table(table_path: Path, sep: str) -> set[str]: + def _load_contaminant_ids_from_table( + table_path: Path, sep: str + ) -> set[str]: series = pd.read_csv(table_path, sep=sep, usecols=[0]).iloc[:, 0] series = series.dropna().astype(str) return set(series.tolist()) diff --git a/proteopy/pp/normalization.py b/proteopy/pp/normalization.py index 7495e4e..058a331 100644 --- a/proteopy/pp/normalization.py +++ b/proteopy/pp/normalization.py @@ -17,8 +17,7 @@ def normalize_median( inplace: bool = True, force: bool = False, ): - """ - Median normalization of intensities. + """Median normalization of intensities. NAs are ignored when computing sample medians. @@ -87,7 +86,7 @@ def normalize_median( raise ValueError(f"target must be one of {allowed_targets!r}") if fill_na is not None and zeros_to_na: - raise ValueError('Cannot set both zeros_to_na and fill_na to True.') + raise ValueError("Cannot set both zeros_to_na and fill_na to True.") Xsrc = adata.X was_sparse = sparse.issparse(Xsrc) @@ -95,7 +94,7 @@ def normalize_median( X = X.astype(float, copy=True) is_log, _ = is_log_transformed(adata) - mismatch = (log_space != is_log) + mismatch = log_space != is_log if mismatch and not force: if log_space: raise ValueError( @@ -114,7 +113,7 @@ def normalize_median( # Track original missingness/zeros to restore later na_mask = ~np.isfinite(X) - zero_mask = (X == 0) + zero_mask = X == 0 if zeros_to_na: X_new[zero_mask] = np.nan @@ -122,19 +121,19 @@ def normalize_median( if fill_na is not None: X_new = np.where(~np.isfinite(X_new), fill_na, X_new) - def _normalize_samples( X_work, target, log_space, - ): - """Normalize a subset of samples and return normalized values and factors.""" - with np.errstate(invalid='ignore'): + ): + """Normalize a subset of samples and return normalized values + and factors.""" + with np.errstate(invalid="ignore"): sample_medians = np.nanmedian(X_work, axis=1) - if target == 'median': + if target == "median": target_val = float(np.nanmedian(sample_medians)) - elif target == 'max': + elif target == "max": target_val = float(np.nanmax(sample_medians)) else: raise ValueError("target must be one of {'median', 'max'}") @@ -143,7 +142,7 @@ def _normalize_samples( factors = (target_val - sample_medians)[:, None] sub_norm = X_work + factors else: - with np.errstate(divide='ignore', invalid='ignore'): + with np.errstate(divide="ignore", invalid="ignore"): factors = (target_val / sample_medians)[:, None] sub_norm = X_work * factors @@ -160,8 +159,10 @@ def _normalize_samples( all_factors[idx] = sub_fac if log_space else np.squeeze(sub_fac) else: if per_batch not in adata.obs.columns: - raise KeyError(f"per_batch='{per_batch}' not found in adata.obs columns.") - batches = adata.obs[per_batch].astype('category') + raise KeyError( + f"per_batch='{per_batch}' not found in adata.obs columns." + ) + batches = adata.obs[per_batch].astype("category") for b in batches.cat.categories: idx = np.where(batches.values == b)[0] if idx.size == 0: @@ -181,10 +182,12 @@ def _normalize_samples( else: factor_name = "scale_linear" - factors_df = pd.DataFrame({ - "sample_index": np.arange(n_samples), - factor_name: all_factors, - }) + factors_df = pd.DataFrame( + { + "sample_index": np.arange(n_samples), + factor_name: all_factors, + } + ) if per_batch is not None: factors_df[per_batch] = adata.obs[per_batch].values @@ -192,10 +195,18 @@ def _normalize_samples( # Surface problematic medians via warnings if np.isnan(all_factors).any(): bad = np.where(np.isnan(all_factors))[0] - print(f"Warning: {bad.size} sample(s) had undefined median; factors are NaN for indices {bad.tolist()}.") + print( + f"Warning: { + bad.size} sample(s) had undefined median; factors are NaN for indices { + bad.tolist()}." + ) if np.isinf(all_factors).any(): bad = np.where(np.isinf(all_factors))[0] - print(f"Warning: {bad.size} sample(s) had zero median; factors are inf for indices {bad.tolist()}.") + print( + f"Warning: { + bad.size} sample(s) had zero median; factors are inf for indices { + bad.tolist()}." + ) out = sparse.csr_matrix(all_norm) if was_sparse else all_norm diff --git a/proteopy/pp/quantification.py b/proteopy/pp/quantification.py index 670e60c..4b9ba19 100644 --- a/proteopy/pp/quantification.py +++ b/proteopy/pp/quantification.py @@ -1,7 +1,7 @@ import re from collections import defaultdict from functools import partial -from typing import Callable, List, Dict, Union, Optional +from collections.abc import Callable import numpy as np import pandas as pd @@ -36,10 +36,7 @@ def _rebuild_adata(adata, X_new, var_new, inplace): varm={}, varp={}, layers={}, - obsp=( - adata.obsp - if hasattr(adata, "obsp") else None - ), + obsp=(adata.obsp if hasattr(adata, "obsp") else None), ) adata.var_names = var_new.index return None @@ -49,10 +46,7 @@ def _rebuild_adata(adata, X_new, var_new, inplace): var=var_new.copy(), uns=adata.uns.copy(), obsm=adata.obsm.copy(), - obsp=( - adata.obsp.copy() - if hasattr(adata, "obsp") else None - ), + obsp=(adata.obsp.copy() if hasattr(adata, "obsp") else None), ) out.var_names = var_new.index return out @@ -84,8 +78,7 @@ def _apply_grouped_func(grouped, func): ) return result raise TypeError( - "`func` must be either a string " - "identifier or a callable." + "`func` must be either a string " "identifier or a callable." ) @@ -98,14 +91,14 @@ def _find_root(parent, x): def _union_find_groups(peptides): - """Return ``{representative: [members]}`` via union-find - on substring containment.""" + """Return ``{representative: [members]}`` via union-find on + substring containment.""" parent = {p: p for p in peptides} rank = {p: 0 for p in peptides} peps_by_len = sorted(peptides, key=len, reverse=True) for i, longp in enumerate(peps_by_len): - for shortp in peps_by_len[i + 1:]: + for shortp in peps_by_len[i + 1 :]: if shortp in longp: ra = _find_root(parent, longp) rb = _find_root(parent, shortp) @@ -124,23 +117,20 @@ def _union_find_groups(peptides): for _, members in buckets.items(): rep = max(members, key=len) groups[rep] = sorted( - members, key=lambda s: (-len(s), s), + members, + key=lambda s: (-len(s), s), ) return groups -def _group_peptides(peptides: List[str]) -> Dict[str, List[str]]: - """ - Group peptides so that any peptide fully contained - in another belongs to the same group. +def _group_peptides(peptides: list[str]) -> dict[str, list[str]]: + """Group peptides so that any peptide fully contained in another + belongs to the same group. Returns {representative_longest: [members...]} with members sorted by (-len, lexicographically). """ - peptides = ( - pd.Series(peptides).dropna().astype(str) - .unique().tolist() - ) + peptides = pd.Series(peptides).dropna().astype(str).unique().tolist() return _union_find_groups(peptides) @@ -150,12 +140,10 @@ def extract_peptide_groups( group_by: str | None = None, inplace: bool = True, ): - """ - Create new columns ``adata.var['peptide_group_id']`` - and ``adata.var['peptide_group_nr']``. - ``peptide_group_id`` contains all overlapping (substring) - peptide_ids joined by ``';'``. ``peptide_group_nr`` - is a unique integer identifier for each group, + """Create new columns ``adata.var['peptide_group_id']`` and + ``adata.var['peptide_group_nr']``. ``peptide_group_id`` contains all + overlapping (substring) peptide_ids joined by ``';'``. + ``peptide_group_nr`` is a unique integer identifier for each group, numbered across all groups in order of appearance. Parameters @@ -182,14 +170,10 @@ def extract_peptide_groups( modified copy. """ if peptide_col not in adata.var.columns: - raise KeyError( - f"'{peptide_col}' not found in adata.var" - ) + raise KeyError(f"'{peptide_col}' not found in adata.var") has_group_by = group_by is not None if has_group_by and group_by not in adata.var.columns: - raise KeyError( - f"'{group_by}' not found in adata.var" - ) + raise KeyError(f"'{group_by}' not found in adata.var") pep_series = adata.var[peptide_col].astype(str) @@ -203,11 +187,10 @@ def extract_peptide_groups( else: pep_to_group = {} for _, sub_df in adata.var.groupby( - group_by, observed=True, + group_by, + observed=True, ): - sub_peps = ( - sub_df[peptide_col].astype(str).tolist() - ) + sub_peps = sub_df[peptide_col].astype(str).tolist() groups = _group_peptides(sub_peps) for members in groups.values(): label = ";".join(members) @@ -216,25 +199,17 @@ def extract_peptide_groups( group_col = pep_series.map(pep_to_group) - group_to_nr = { - g: i for i, g in enumerate(group_col.unique()) - } + group_to_nr = {g: i for i, g in enumerate(group_col.unique())} group_nr_col = group_col.map(group_to_nr) if inplace: adata.var["peptide_group_id"] = group_col.values - adata.var["peptide_group_nr"] = ( - group_nr_col.values - ) + adata.var["peptide_group_nr"] = group_nr_col.values return None else: adata_copy = adata.copy() - adata_copy.var["peptide_group_id"] = ( - group_col.values - ) - adata_copy.var["peptide_group_nr"] = ( - group_nr_col.values - ) + adata_copy.var["peptide_group_id"] = group_col.values + adata_copy.var["peptide_group_nr"] = group_nr_col.values return adata_copy @@ -242,14 +217,13 @@ def summarize_overlapping_peptides( adata: ad.AnnData, group_by: str | None = "protein_id", layer: str | None = None, - func: Union[str, Callable] = "sum", + func: str | Callable = "sum", zero_to_na: bool = False, fill_na: float | int | None = None, skip_na: bool = True, inplace: bool = True, ): - """ - Aggregate intensities across overlapping peptides. + """Aggregate intensities across overlapping peptides. Calls :func:`extract_peptide_groups` internally to identify overlapping (substring-contained) peptides, @@ -302,25 +276,22 @@ def summarize_overlapping_peptides( modifies in place. """ if zero_to_na and fill_na is not None: - raise ValueError( - "Cannot set both zero_to_na and fill_na." - ) + raise ValueError("Cannot set both zero_to_na and fill_na.") # --- extract peptide groups if not inplace: adata = adata.copy() extract_peptide_groups( - adata, group_by=group_by, inplace=True, + adata, + group_by=group_by, + inplace=True, ) group_col = "peptide_group_nr" peptide_col = "peptide_id" # --- matrix as DataFrame (obs × vars) - X = ( - adata.layers[layer] if layer is not None - else adata.X - ) + X = adata.layers[layer] if layer is not None else adata.X if sparse.issparse(X): X = X.toarray() X = np.asarray(X, dtype=float) @@ -341,19 +312,30 @@ def summarize_overlapping_peptides( # --- group columns and aggregate group_keys = adata.var[group_col].astype(str) grouped = vals.T.groupby( - group_keys, sort=True, observed=True, + group_keys, + sort=True, + observed=True, ) agg_vals = _apply_grouped_func(grouped, func).T if not skip_na: - has_nan = vals.isna().T.groupby( - group_keys, sort=True, observed=True, - ).any().T + has_nan = ( + vals.isna() + .T.groupby( + group_keys, + sort=True, + observed=True, + ) + .any() + .T + ) agg_vals[has_nan] = np.nan # --- build new var table (aggregate annotations) groups = adata.var.groupby( - group_col, sort=True, observed=True, + group_col, + sort=True, + observed=True, ) records, group_to_peptide = [], {} @@ -378,15 +360,10 @@ def summarize_overlapping_peptides( rec[col] = _aggregate_var_value(df_g[col]) records.append(rec) - var_new = ( - pd.DataFrame.from_records(records) - .set_index(peptide_col) - ) + var_new = pd.DataFrame.from_records(records).set_index(peptide_col) # --- rename aggregated matrix columns - agg_vals.columns = [ - group_to_peptide[k] for k in agg_vals.columns - ] + agg_vals.columns = [group_to_peptide[k] for k in agg_vals.columns] var_new = var_new.loc[agg_vals.columns] # --- ensure 'peptide_id' column always matches index @@ -394,18 +371,21 @@ def summarize_overlapping_peptides( # --- rebuild AnnData return _rebuild_adata( - adata, agg_vals.values, var_new, inplace, + adata, + agg_vals.values, + var_new, + inplace, ) def _validate_quantify_by_category_input( - adata, group_by, proteodata_target_level, + adata, + group_by, + proteodata_target_level, ): """Validate arguments for ``quantify_by_category``.""" if group_by is None or group_by not in adata.var.columns: - raise KeyError( - f"'{group_by}' not found in adata.var" - ) + raise KeyError(f"'{group_by}' not found in adata.var") _allowed_levels = {None, "protein", "peptide"} if proteodata_target_level not in _allowed_levels: raise ValueError( @@ -416,11 +396,12 @@ def _validate_quantify_by_category_input( def _build_var_table_category(adata, group_by): - """Build aggregated ``.var`` for ``quantify_by_category``. - """ + """Build aggregated ``.var`` for ``quantify_by_category``.""" records = [] for gkey, df_g in adata.var.groupby( - group_by, sort=True, observed=True, + group_by, + sort=True, + observed=True, ): rec = {group_by: str(gkey)} for col in adata.var.columns: @@ -429,18 +410,14 @@ def _build_var_table_category(adata, group_by): rec[col] = _aggregate_var_value(df_g[col]) records.append(rec) - var_new = ( - pd.DataFrame.from_records(records) - .set_index(group_by) - ) + var_new = pd.DataFrame.from_records(records).set_index(group_by) var_new[group_by] = var_new.index var_new.index.name = None return var_new def _apply_target_level(var_new, proteodata_target_level): - """Adjust ``.var`` columns for the requested - proteodata level.""" + """Adjust ``.var`` columns for the requested proteodata level.""" if proteodata_target_level == "protein": if "protein_id" in var_new.columns: var_new = var_new.rename( @@ -467,62 +444,61 @@ def quantify_by_category( adata: ad.AnnData, group_by: str = None, layer=None, - func: Union[str, Callable] = "sum", + func: str | Callable = "sum", proteodata_target_level: str | None = None, inplace: bool = True, -) -> Optional[ad.AnnData]: - """ - Aggregate intensities in ``adata.X`` (or selected - layer) by ``.var[group_col]``, aggregate annotations - in ``adata.var`` by concatenating unique values with - ``';'``, and set ``group_col`` as the new index - (``var_names``). - - Parameters - ---------- - adata : AnnData - Input AnnData with .X (obs x vars) and .var - annotations. - group_by : str - Column in adata.var to group by - (e.g. 'protein_id'). - layer : str, optional - Key in ``adata.layers``; when set, quantification - uses that layer instead of ``adata.X``. - func : {'sum', 'median', 'max'} | Callable - Aggregation to apply across grouped variables. - proteodata_target_level : {'protein', 'peptide'} or \ -None, optional - Set the proteodata level of the output. When - ``None`` the data level and columns are left as - is. When ``'protein'``, a ``protein_id`` column - matching the new var index is added so the result - satisfies protein-level proteodata requirements; - any pre-existing ``protein_id`` column is renamed - to ``protein_id_old`` and any ``peptide_id`` - column is removed. When ``'peptide'``, a - ``peptide_id`` column matching the new var index - is added so the result satisfies peptide-level - proteodata requirements; a ``protein_id`` column - must already exist in the inherited annotations. - inplace : bool - If True, modify `adata` in place; else return a - new AnnData. - - Returns - ------- - AnnData | None - Aggregated AnnData if inplace=False; otherwise - None. +) -> ad.AnnData | None: + """Aggregate intensities in ``adata.X`` (or selected layer) by + ``.var[group_col]``, aggregate annotations in ``adata.var`` by + concatenating unique values with ``';'``, and set ``group_col`` as + the new index (``var_names``). + + Parameters + ---------- + adata : AnnData + Input AnnData with .X (obs x vars) and .var + annotations. + group_by : str + Column in adata.var to group by + (e.g. 'protein_id'). + layer : str, optional + Key in ``adata.layers``; when set, quantification + uses that layer instead of ``adata.X``. + func : {'sum', 'median', 'max'} | Callable + Aggregation to apply across grouped variables. + proteodata_target_level : {'protein', 'peptide'} or \ + None, optional + Set the proteodata level of the output. When + ``None`` the data level and columns are left as + is. When ``'protein'``, a ``protein_id`` column + matching the new var index is added so the result + satisfies protein-level proteodata requirements; + any pre-existing ``protein_id`` column is renamed + to ``protein_id_old`` and any ``peptide_id`` + column is removed. When ``'peptide'``, a + ``peptide_id`` column matching the new var index + is added so the result satisfies peptide-level + proteodata requirements; a ``protein_id`` column + must already exist in the inherited annotations. + inplace : bool + If True, modify `adata` in place; else return a + new AnnData. + + Returns + ------- + AnnData | None + Aggregated AnnData if inplace=False; otherwise + None. """ _validate_quantify_by_category_input( - adata, group_by, proteodata_target_level, + adata, + group_by, + proteodata_target_level, ) # --- Matrix as DataFrame (obs × vars) vals = pd.DataFrame( - adata.layers[layer] if layer is not None - else adata.X, + adata.layers[layer] if layer is not None else adata.X, index=adata.obs_names, columns=adata.var_names, ) @@ -530,42 +506,53 @@ def quantify_by_category( # --- Group columns and aggregate group_keys = adata.var[group_by].astype(str) grouped = vals.T.groupby( - group_keys, sort=True, observed=True, + group_keys, + sort=True, + observed=True, ) agg_vals = _apply_grouped_func(grouped, func).T # --- Build new var table var_new = _build_var_table_category( - adata, group_by, + adata, + group_by, ) var_new = var_new.loc[agg_vals.columns] # --- Apply proteodata_target_level conformance var_new = _apply_target_level( - var_new, proteodata_target_level, + var_new, + proteodata_target_level, ) # --- Rebuild AnnData return _rebuild_adata( - adata, agg_vals.values, var_new, inplace, + adata, + agg_vals.values, + var_new, + inplace, ) quantify_proteins = partial( quantify_by_category, - group_by='protein_id', - proteodata_target_level='protein', + group_by="protein_id", + proteodata_target_level="protein", ) quantify_proteoforms = partial( quantify_by_category, - group_by='proteoform_id', - proteodata_target_level='protein', + group_by="proteoform_id", + proteodata_target_level="protein", ) def _validate_summarize_mods_input( - adata, method, zero_to_na, fill_na, keep_var_cols, + adata, + method, + zero_to_na, + fill_na, + keep_var_cols, ): """Validate arguments for ``summarize_modifications``.""" _, level = is_proteodata(adata) @@ -579,31 +566,23 @@ def _validate_summarize_mods_input( allowed = {"sum", "mean", "median", "max"} if method not in allowed: raise ValueError( - f"method must be one of {allowed!r}, " - f"got '{method}'." + f"method must be one of {allowed!r}, " f"got '{method}'." ) if zero_to_na and fill_na is not None: - raise ValueError( - "Cannot set both zero_to_na and fill_na." - ) + raise ValueError("Cannot set both zero_to_na and fill_na.") if keep_var_cols is not None: - missing = [ - c for c in keep_var_cols - if c not in adata.var.columns - ] + missing = [c for c in keep_var_cols if c not in adata.var.columns] if missing: raise KeyError( - f"keep_var_cols entries not found in " - f"adata.var: {missing}" + f"keep_var_cols entries not found in " f"adata.var: {missing}" ) _reserved = { - "peptide_id", "protein_id", - "n_peptidoforms", "n_modifications", + "peptide_id", + "protein_id", + "n_peptidoforms", + "n_modifications", } - overlap = [ - c for c in keep_var_cols - if c in _reserved - ] + overlap = [c for c in keep_var_cols if c in _reserved] if overlap: raise ValueError( f"keep_var_cols must not include " @@ -624,13 +603,18 @@ def _count_modifications(peptide_ids, pattern): def _build_var_table_mods( - var_src, stripped, sort, pattern, keep_var_cols, + var_src, + stripped, + sort, + pattern, + keep_var_cols, ): - """Build the new ``.var`` table for - ``summarize_modifications``.""" + """Build the new ``.var`` table for ``summarize_modifications``.""" records = [] for gkey, df_g in var_src.groupby( - stripped, sort=sort, observed=True, + stripped, + sort=sort, + observed=True, ): rec = {"peptide_id": gkey} pids = df_g["protein_id"].unique() @@ -645,9 +629,10 @@ def _build_var_table_mods( rec["protein_id"] = pids[0] rec["n_peptidoforms"] = len(df_g) rec["n_modifications"] = _count_modifications( - df_g.index, pattern, + df_g.index, + pattern, ) - for col in (keep_var_cols or []): + for col in keep_var_cols or []: rec[col] = _aggregate_var_value(df_g[col]) records.append(rec) @@ -682,8 +667,7 @@ def summarize_modifications( inplace: bool = True, verbose: bool = False, ) -> ad.AnnData | None: - """ - Aggregate modified peptides by their stripped sequence. + """Aggregate modified peptides by their stripped sequence. Removes modification annotations from peptide identifiers using a regular expression, groups peptides sharing the @@ -852,14 +836,15 @@ def summarize_modifications( # --- validate input check_proteodata(adata, layers=layer) _validate_summarize_mods_input( - adata, method, zero_to_na, fill_na, keep_var_cols, + adata, + method, + zero_to_na, + fill_na, + keep_var_cols, ) # --- extract matrix - Xsrc = ( - adata.layers[layer] if layer is not None - else adata.X - ) + Xsrc = adata.layers[layer] if layer is not None else adata.X was_sparse = sparse.issparse(Xsrc) X = Xsrc.toarray() if was_sparse else np.asarray(Xsrc) X = X.astype(float, copy=True) @@ -874,12 +859,8 @@ def summarize_modifications( try: pattern = re.compile(mod_regex) except re.error as exc: - raise ValueError( - f"Invalid mod_regex '{mod_regex}': {exc}" - ) from exc - stripped = np.array([ - pattern.sub("", pid) for pid in peptide_ids - ]) + raise ValueError(f"Invalid mod_regex '{mod_regex}': {exc}") from exc + stripped = np.array([pattern.sub("", pid) for pid in peptide_ids]) if verbose: n_unique = len(np.unique(stripped)) @@ -900,15 +881,24 @@ def summarize_modifications( agg_vals = _apply_str_method(grouped, method).T if not skip_na: - has_nan = df.isna().T.groupby( - stripped, sort=sort, - ).any().T + has_nan = ( + df.isna() + .T.groupby( + stripped, + sort=sort, + ) + .any() + .T + ) agg_vals[has_nan] = np.nan # --- build new .var table var_new = _build_var_table_mods( - adata.var.copy(), stripped, sort, - pattern, keep_var_cols, + adata.var.copy(), + stripped, + sort, + pattern, + keep_var_cols, ) # --- result matrix @@ -918,7 +908,10 @@ def summarize_modifications( # --- rebuild AnnData result = _rebuild_adata( - adata, X_new, var_new, inplace, + adata, + X_new, + var_new, + inplace, ) if inplace: check_proteodata(adata) diff --git a/proteopy/pp/stats.py b/proteopy/pp/stats.py index 7dd05e1..a69e6fb 100644 --- a/proteopy/pp/stats.py +++ b/proteopy/pp/stats.py @@ -11,8 +11,8 @@ def _compute_cv_stats(X, zero_to_na=True): - """ - Compute mean, std, and count across observations for CV calculation. + """Compute mean, std, and count across observations for CV + calculation. Parameters ---------- @@ -37,7 +37,7 @@ def _compute_cv_stats(X, zero_to_na=True): # TODO: implement sparse-native algorithm to avoid densification X_dense = X.toarray() with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=RuntimeWarning) + warnings.filterwarnings("ignore", category=RuntimeWarning) mean_ = np.nanmean(X_dense, axis=0) std_ = np.nanstd(X_dense, axis=0, ddof=1) n_ = np.sum(~np.isnan(X_dense), axis=0) @@ -45,7 +45,7 @@ def _compute_cv_stats(X, zero_to_na=True): else: X_dense = X.toarray() with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=RuntimeWarning) + warnings.filterwarnings("ignore", category=RuntimeWarning) mean_ = np.nanmean(X_dense, axis=0) std_ = np.nanstd(X_dense, axis=0, ddof=1) n_ = np.sum(~np.isnan(X_dense), axis=0) @@ -57,7 +57,7 @@ def _compute_cv_stats(X, zero_to_na=True): X_arr[X_arr == 0] = np.nan with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=RuntimeWarning) + warnings.filterwarnings("ignore", category=RuntimeWarning) mean_ = np.nanmean(X_arr, axis=0) std_ = np.nanstd(X_arr, axis=0, ddof=1) n_ = np.sum(~np.isnan(X_arr), axis=0) @@ -74,8 +74,8 @@ def calculate_cv( key_added: str | None = None, inplace: bool = True, ) -> AnnData | None: - """ - Compute the coefficient of variation (CV = std / mean) for each variable. + """Compute the coefficient of variation (CV = std / mean) for each + variable. Performed within ``group_by`` groups optionally. CV is calculated ignoring NaNs. diff --git a/proteopy/read/__init__.py b/proteopy/read/__init__.py index eed1d14..52f342f 100644 --- a/proteopy/read/__init__.py +++ b/proteopy/read/__init__.py @@ -1,4 +1,4 @@ from .long import ( long, - ) +) from .diann import diann diff --git a/proteopy/read/diann.py b/proteopy/read/diann.py index afa2176..a0f30e1 100644 --- a/proteopy/read/diann.py +++ b/proteopy/read/diann.py @@ -33,18 +33,15 @@ def _resolve_aggr_level(aggr_level): """Resolve an aggr_level string to its canonical column name. - Returns the canonical column name and a boolean indicating - whether the level corresponds to protein-level aggregation. + Returns the canonical column name and a boolean indicating whether + the level corresponds to protein-level aggregation. """ for pattern, canonical in _ALL_AGGR_PATTERNS.items(): if re.fullmatch(pattern, aggr_level): return canonical, pattern in _PROTEIN_AGGR_PATTERNS - valid = ", ".join( - f"'{p}'" for p in _ALL_AGGR_PATTERNS - ) + valid = ", ".join(f"'{p}'" for p in _ALL_AGGR_PATTERNS) raise ValueError( - f"Invalid aggr_level '{aggr_level}'. " - f"Valid regex patterns: {valid}" + f"Invalid aggr_level '{aggr_level}'. " f"Valid regex patterns: {valid}" ) @@ -93,222 +90,192 @@ def _read_diann_v1( """ # -- Check args aggr_level_options = [ - 'Stripped.Sequence', - 'Modified.Sequence', - 'Precursor.Id', + "Stripped.Sequence", + "Modified.Sequence", + "Precursor.Id", ] if aggr_level not in aggr_level_options: raise ValueError( - f'Wrong option passsed to aggr_level argument: ' - f'{aggr_level}.' + f"Wrong option passsed to aggr_level argument: " f"{aggr_level}." ) if run_parser is not None and not callable(run_parser): - raise ValueError( - 'run_parser arg must either be a function or None.' - ) + raise ValueError("run_parser arg must either be a function or None.") base_required_cols = { - 'Run', - 'Proteotypic', - 'Protein.Ids', - 'Precursor.Quantity', - 'Protein.Q.Value', - 'Global.Q.Value', - 'Q.Value', - 'Protein.Group', - 'Genes', - 'Protein.Names', - 'Stripped.Sequence', + "Run", + "Proteotypic", + "Protein.Ids", + "Precursor.Quantity", + "Protein.Q.Value", + "Global.Q.Value", + "Q.Value", + "Protein.Group", + "Genes", + "Protein.Names", + "Stripped.Sequence", } required_cols = set(base_required_cols) required_cols.add(aggr_level) - if aggr_level == 'Precursor.Id': - required_cols.update( - {'Modified.Sequence', 'Precursor.Charge'} - ) - if aggr_level == 'Modified.Sequence': - required_cols.add('Modified.Sequence') + if aggr_level == "Precursor.Id": + required_cols.update({"Modified.Sequence", "Precursor.Charge"}) + if aggr_level == "Modified.Sequence": + required_cols.add("Modified.Sequence") - header = pd.read_csv(diann_output_path, sep='\t', nrows=0) + header = pd.read_csv(diann_output_path, sep="\t", nrows=0) missing_cols = sorted(required_cols - set(header.columns)) if missing_cols: - missing_str = ', '.join(missing_cols) + missing_str = ", ".join(missing_cols) raise ValueError( - 'Missing required columns in DIA-NN output: ' - f'{missing_str}.' + "Missing required columns in DIA-NN output: " f"{missing_str}." ) data = pd.read_csv( diann_output_path, - sep='\t', + sep="\t", header=0, usecols=sorted(required_cols), ) if run_parser: - data['Run'] = data['Run'].apply(run_parser) + data["Run"] = data["Run"].apply(run_parser) if show_input_stats: - print( - 'Before Q-value and proteotypicity filtering\n' - '------' - ) - proteotypic_fraction = ( - (data['Proteotypic'] == 1).sum() / len(data) - ) - print( - f'Proteotypic peptide fraction: ' - f'{proteotypic_fraction:.2f}' - ) + print("Before Q-value and proteotypicity filtering\n" "------") + proteotypic_fraction = (data["Proteotypic"] == 1).sum() / len(data) + print(f"Proteotypic peptide fraction: " f"{proteotypic_fraction:.2f}") multimapper_fraction = ( - (data['Protein.Ids'].str.split(';').apply(len) == 1) - .sum() / len(data) - ) - print( - f'Multimapper peptide fraction: ' - f'{multimapper_fraction:.2f}' - ) + data["Protein.Ids"].str.split(";").apply(len) == 1 + ).sum() / len(data) + print(f"Multimapper peptide fraction: " f"{multimapper_fraction:.2f}") # Q value distr. plots - fig, axes = plt.subplots( - nrows=1, ncols=3, figsize=(16, 4) - ) + fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16, 4)) plt.subplots_adjust(wspace=0.3) - sns.histplot(data['Q.Value'], bins=100, ax=axes[0]) - axes[0].set_title('Q.Value distr.') + sns.histplot(data["Q.Value"], bins=100, ax=axes[0]) + axes[0].set_title("Q.Value distr.") if precursor_pval_max: axes[0].axvline( x=precursor_pval_max, - color='red', - linestyle='--', + color="red", + linestyle="--", linewidth=2, ) - sns.histplot( - data['Global.Q.Value'], bins=100, ax=axes[1] - ) - axes[1].set_title('Gobal.Q.Value distr.') + sns.histplot(data["Global.Q.Value"], bins=100, ax=axes[1]) + axes[1].set_title("Gobal.Q.Value distr.") if global_precursor_pval_max: axes[1].axvline( x=global_precursor_pval_max, - color='red', - linestyle='--', + color="red", + linestyle="--", linewidth=2, ) - sns.histplot( - data['Protein.Q.Value'], bins=100, ax=axes[2] - ) - axes[2].set_title('Protein.Q.Value distr.') + sns.histplot(data["Protein.Q.Value"], bins=100, ax=axes[2]) + axes[2].set_title("Protein.Q.Value distr.") if gene_pval_max: axes[2].axvline( x=gene_pval_max, - color='red', - linestyle='--', + color="red", + linestyle="--", linewidth=2, ) plt.show() # Q values stats - q_stats = data[[ - 'Q.Value', 'Protein.Q.Value', 'Global.Q.Value' - ]].describe() + q_stats = data[ + ["Q.Value", "Protein.Q.Value", "Global.Q.Value"] + ].describe() print(q_stats) # -- Filter ds data_sub = data[ - (data['Proteotypic'] == 1) - & (data['Protein.Ids'].str.split(';') - .apply(len).eq(1)) + (data["Proteotypic"] == 1) + & (data["Protein.Ids"].str.split(";").apply(len).eq(1)) ].copy() del data gc.collect() # ToDo: change to < instead of <= if precursor_pval_max: - data_sub = data_sub[ - data_sub['Q.Value'] <= precursor_pval_max - ] + data_sub = data_sub[data_sub["Q.Value"] <= precursor_pval_max] if global_precursor_pval_max: data_sub = data_sub[ - data_sub['Global.Q.Value'] - <= global_precursor_pval_max + data_sub["Global.Q.Value"] <= global_precursor_pval_max ] if gene_pval_max: - data_sub = data_sub[ - data_sub['Protein.Q.Value'] <= gene_pval_max - ] + data_sub = data_sub[data_sub["Protein.Q.Value"] <= gene_pval_max] if len(data_sub) == 0: - raise ValueError('Dataframe after filtering empty') + raise ValueError("Dataframe after filtering empty") if show_input_stats: # Q values stats - q_stats = data_sub[[ - 'Q.Value', 'Protein.Q.Value', 'Global.Q.Value' - ]].describe() - print( - '\nAfter Q-value and proteotypicity filtering\n' - '------' - ) + q_stats = data_sub[ + ["Q.Value", "Protein.Q.Value", "Global.Q.Value"] + ].describe() + print("\nAfter Q-value and proteotypicity filtering\n" "------") print(q_stats) # -- Check: how peptides map to proteins is_pep_multiprots = ( - data_sub.groupby( - [aggr_level, 'Run'], observed=True - )['Protein.Ids'].nunique() > 1 + data_sub.groupby([aggr_level, "Run"], observed=True)[ + "Protein.Ids" + ].nunique() + > 1 ) if is_pep_multiprots.any(): raise ValueError( - f'Peptides at aggregation level {aggr_level} ' - 'map to multiple proteins. ' - 'Not implemented yet.' + f"Peptides at aggregation level {aggr_level} " + "map to multiple proteins. " + "Not implemented yet." ) # -- Aggregate precursors data_cols = [ - 'Run', + "Run", aggr_level, - 'Protein.Ids', - 'Precursor.Quantity', + "Protein.Ids", + "Precursor.Quantity", ] precursor_data = data_sub[data_cols].copy() precursor_data_summed = ( precursor_data.groupby( - [aggr_level, 'Protein.Ids', 'Run'], + [aggr_level, "Protein.Ids", "Run"], observed=True, - )['Precursor.Quantity'] + )["Precursor.Quantity"] .sum() .reset_index() ) # -- Check: proteotypicity - assert (( - precursor_data_summed - .groupby('Stripped.Sequence', observed=True) - ['Protein.Ids'] - .nunique().le(1).all() - )), "Error: Some peptides map to multiple proteins!" + assert ( + precursor_data_summed.groupby("Stripped.Sequence", observed=True)[ + "Protein.Ids" + ] + .nunique() + .le(1) + .all() + ), "Error: Some peptides map to multiple proteins!" X = pd.pivot( precursor_data_summed, - index='Run', + index="Run", columns=aggr_level, - values='Precursor.Quantity', + values="Precursor.Quantity", ) X = X.sort_index(axis=0).sort_index(axis=1) @@ -323,28 +290,28 @@ def _read_diann_v1( gc.collect() # -- obs - obs = pd.DataFrame( - {'run_id': X.index}, index=X.index - ) + obs = pd.DataFrame({"run_id": X.index}, index=X.index) obs.index.name = None meta_cols = [ aggr_level, - 'Protein.Ids', - 'Protein.Group', - 'Genes', - 'Protein.Names', + "Protein.Ids", + "Protein.Group", + "Genes", + "Protein.Names", ] - if aggr_level == 'Modified.Sequence': - meta_cols.append('Stripped.Sequence') + if aggr_level == "Modified.Sequence": + meta_cols.append("Stripped.Sequence") - if aggr_level == 'Precursor.Id': - meta_cols.extend([ - 'Stripped.Sequence', - 'Modified.Sequence', - 'Precursor.Charge', - ]) + if aggr_level == "Precursor.Id": + meta_cols.extend( + [ + "Stripped.Sequence", + "Modified.Sequence", + "Precursor.Charge", + ] + ) precursor_meta = data_sub[meta_cols].copy() @@ -358,12 +325,10 @@ def _read_diann_v1( .all() ) - var = precursor_meta.groupby( - aggr_level, observed=True - ).first() + var = precursor_meta.groupby(aggr_level, observed=True).first() var = var.loc[X.columns] var[aggr_level] = var.index - var['peptide_id'] = var.index + var["peptide_id"] = var.index var.index.name = None del precursor_meta @@ -381,15 +346,15 @@ def _read_diann_v1( if len(adata.obs_names.unique()) < adata.n_obs: adata.obs_names_make_unique() warnings.warn( - 'Repeated obs names were present in the data. ' - 'They were made unique by numbered suffixes.' + "Repeated obs names were present in the data. " + "They were made unique by numbered suffixes." ) if len(adata.var_names.unique()) < adata.n_vars: adata.var_names_make_unique() warnings.warn( - 'Repeated var names were present in the data. ' - 'They were made unique by numbered suffixes.' + "Repeated var names were present in the data. " + "They were made unique by numbered suffixes." ) return adata @@ -407,7 +372,8 @@ def _read_diann_v1_9_1( zero_to_na=False, verbose=False, ): - """Read a DIA-NN v1.9.1+ parquet report into an :class:`~anndata.AnnData`. + """Read a DIA-NN v1.9.1+ parquet report into an + :class:`~anndata.AnnData`. Filters decoys and multi-mapping precursors, applies Q-value thresholds, aggregates intensities by ``aggr_level``, and returns @@ -415,9 +381,7 @@ def _read_diann_v1_9_1( """ # -- Validate arguments if run_parser is not None and not callable(run_parser): - raise ValueError( - "run_parser must be a callable or None." - ) + raise ValueError("run_parser must be a callable or None.") aggr_col, is_protein = _resolve_aggr_level(aggr_level) @@ -428,15 +392,11 @@ def _read_diann_v1_9_1( ) if fill_na is not None and zero_to_na: - raise ValueError( - "fill_na and zero_to_na are mutually exclusive." - ) + raise ValueError("fill_na and zero_to_na are mutually exclusive.") # -- Determine columns to read intensity_col = ( - "Precursor.Normalised" - if normalized - else "Precursor.Quantity" + "Precursor.Normalised" if normalized else "Precursor.Quantity" ) base_cols = [ @@ -457,11 +417,13 @@ def _read_diann_v1_9_1( if max_global_precursor_q is not None: base_cols.append("Global.Q.Value") if aggr_col == "Precursor.Id": - base_cols.extend([ - "Modified.Sequence", - "Stripped.Sequence", - "Precursor.Charge", - ]) + base_cols.extend( + [ + "Modified.Sequence", + "Stripped.Sequence", + "Precursor.Charge", + ] + ) elif aggr_col == "Modified.Sequence": base_cols.append("Stripped.Sequence") @@ -469,7 +431,8 @@ def _read_diann_v1_9_1( # -- Read parquet data = pd.read_parquet( - diann_output_path, columns=usecols, + diann_output_path, + columns=usecols, ) if verbose: @@ -483,39 +446,27 @@ def _read_diann_v1_9_1( data.drop(columns=["Decoy"], inplace=True) # -- Filter proteotypicity (single protein mapping) - proteotypic_mask = ( - data["Protein.Ids"].str.split(";").str.len() == 1 - ) + proteotypic_mask = data["Protein.Ids"].str.split(";").str.len() == 1 data = data[proteotypic_mask].copy() if verbose: print( - f"Rows after decoy and proteotypicity " - f"filtering: {len(data):,}" + f"Rows after decoy and proteotypicity " f"filtering: {len(data):,}" ) # -- Apply Q-value filters if max_precursor_q is not None: data = data[data["Q.Value"] <= max_precursor_q] if max_protein_q is not None: - data = data[ - data["Protein.Q.Value"] <= max_protein_q - ] + data = data[data["Protein.Q.Value"] <= max_protein_q] if max_global_precursor_q is not None: - data = data[ - data["Global.Q.Value"] - <= max_global_precursor_q - ] + data = data[data["Global.Q.Value"] <= max_global_precursor_q] if len(data) == 0: - raise ValueError( - "No rows remain after Q-value filtering." - ) + raise ValueError("No rows remain after Q-value filtering.") if verbose: - print( - f"Rows after Q-value filtering: {len(data):,}" - ) + print(f"Rows after Q-value filtering: {len(data):,}") # -- Parse Run column if run_parser is not None: @@ -566,14 +517,17 @@ def _read_diann_v1_9_1( if aggr_col == "Modified.Sequence": meta_cols.append("Stripped.Sequence") elif aggr_col == "Precursor.Id": - meta_cols.extend([ - "Stripped.Sequence", - "Modified.Sequence", - "Precursor.Charge", - ]) + meta_cols.extend( + [ + "Stripped.Sequence", + "Modified.Sequence", + "Precursor.Charge", + ] + ) meta = data[meta_cols].drop_duplicates( - subset=[aggr_col], keep="first", + subset=[aggr_col], + keep="first", ) var = meta.set_index(aggr_col) @@ -719,9 +673,7 @@ def diann( ... verbose=True, ... ) """ - handler = _resolve_version_handler( - version, _DIANN_VERSION_DISPATCH - ) + handler = _resolve_version_handler(version, _DIANN_VERSION_DISPATCH) return handler( diann_output_path=diann_output_path, aggr_level=aggr_level, diff --git a/proteopy/read/long.py b/proteopy/read/long.py index 9ee94fc..377fe50 100644 --- a/proteopy/read/long.py +++ b/proteopy/read/long.py @@ -47,14 +47,10 @@ def _validate_intensities_df( ) -> pd.DataFrame: """Copy, validate, and rename the intensities DataFrame.""" if fill_na is not None and zero_to_na: - raise ValueError( - "fill_na and zero_to_na are mutually exclusive." - ) + raise ValueError("fill_na and zero_to_na are mutually exclusive.") df = df.copy() if df.empty: - raise ValueError( - "Intensities DataFrame is empty." - ) + raise ValueError("Intensities DataFrame is empty.") required = {column_aliases[k] for k in required_keys} missing = required.difference(df.columns) if missing: @@ -63,8 +59,7 @@ def _validate_intensities_df( f"columns: {', '.join(sorted(missing))}" ) rename_map = { - actual: canonical - for canonical, actual in column_aliases.items() + actual: canonical for canonical, actual in column_aliases.items() } df = df.rename(columns=rename_map) for col in id_columns: @@ -84,13 +79,9 @@ def _validate_intensities_df( if dup_mask.any(): duplicated = df.loc[dup_mask, duplicate_subset] n_duplicates = len(duplicated) - examples = duplicated.head(5).to_dict( - orient="records" - ) + examples = duplicated.head(5).to_dict(orient="records") extra = ( - f" (showing first 5 of {n_duplicates})" - if n_duplicates > 5 - else "" + f" (showing first 5 of {n_duplicates})" if n_duplicates > 5 else "" ) entity = " and ".join(duplicate_subset) raise ValueError( @@ -110,8 +101,7 @@ def _resolve_protein_id( ) -> pd.DataFrame: """Resolve protein_id, merging from annotation if needed. - Also validates that each peptide maps to exactly one - protein. + Also validates that each peptide maps to exactly one protein. """ protein_id_col = column_aliases["protein_id"] @@ -138,15 +128,10 @@ def _resolve_protein_id( and column_aliases[key] != key } ann_df = ann_df.rename(columns=rename_map) - protein_map = ( - ann_df[["peptide_id", "protein_id"]] - .drop_duplicates( - subset=["peptide_id"], keep="first" - ) - ) - df = df.merge( - protein_map, on="peptide_id", how="left" + protein_map = ann_df[["peptide_id", "protein_id"]].drop_duplicates( + subset=["peptide_id"], keep="first" ) + df = df.merge(protein_map, on="peptide_id", how="left") n_unresolved = df["protein_id"].isna().sum() if n_unresolved: raise ValueError( @@ -166,9 +151,7 @@ def _resolve_protein_id( "intensities DataFrame." ) - protein_counts = ( - df.groupby("peptide_id")["protein_id"].nunique() - ) + protein_counts = df.groupby("peptide_id")["protein_id"].nunique() inconsistent = protein_counts[protein_counts > 1] if not inconsistent.empty: raise ValueError( @@ -201,19 +184,12 @@ def _merge_sample_annotations( if "sample_id" not in annotation_df.columns: raise ValueError( - "Annotation file is missing the required " - "`sample_id` column." + "Annotation file is missing the required " "`sample_id` column." ) - dup_mask = annotation_df.duplicated( - subset=["sample_id"], keep=False - ) + dup_mask = annotation_df.duplicated(subset=["sample_id"], keep=False) if dup_mask.any(): - dup_count = ( - annotation_df - .loc[dup_mask, "sample_id"] - .nunique() - ) + dup_count = annotation_df.loc[dup_mask, "sample_id"].nunique() warnings.warn( "Duplicate sample entries found in " "annotation file; keeping the first " @@ -245,9 +221,7 @@ def _merge_sample_annotations( ) annotation_order = [ - name - for name in annotation_unique["sample_id"] - if name in obs_samples + name for name in annotation_unique["sample_id"] if name in obs_samples ] # preserve original index through merge @@ -289,15 +263,9 @@ def _merge_var_annotations( "column." ) - dup_mask = annotation_df.duplicated( - subset=[id_column], keep=False - ) + dup_mask = annotation_df.duplicated(subset=[id_column], keep=False) if dup_mask.any(): - dup_count = ( - annotation_df - .loc[dup_mask, id_column] - .nunique() - ) + dup_count = annotation_df.loc[dup_mask, id_column].nunique() warnings.warn( f"Duplicate {entity_name} entries found in " f"{entity_name} annotation file; keeping " @@ -439,9 +407,7 @@ def _peptides_long_from_df( ) protein_id_col = column_aliases["protein_id"] - protein_id_in_intensities = ( - protein_id_col in intensities_df.columns - ) + protein_id_in_intensities = protein_id_col in intensities_df.columns required_keys = ["sample_id", "intensity", "peptide_id"] id_columns = ["sample_id", "peptide_id"] @@ -461,13 +427,14 @@ def _peptides_long_from_df( # -- Resolve protein_id df = _resolve_protein_id( - df, peptide_annotation_df, column_aliases, - protein_id_in_intensities, verbose, + df, + peptide_annotation_df, + column_aliases, + protein_id_in_intensities, + verbose, ) - default_obs_order = ( - df["sample_id"].drop_duplicates().tolist() - ) + default_obs_order = df["sample_id"].drop_duplicates().tolist() annotation_order = None # -- Build .X @@ -477,13 +444,9 @@ def _peptides_long_from_df( values="intensity", ) intensity_matrix = intensity_matrix.astype(float) - intensity_matrix = ( - intensity_matrix.sort_index().sort_index(axis=1) - ) + intensity_matrix = intensity_matrix.sort_index().sort_index(axis=1) if fill_na is not None: - intensity_matrix = intensity_matrix.fillna( - float(fill_na) - ) + intensity_matrix = intensity_matrix.fillna(float(fill_na)) intensity_matrix.index.name = None intensity_matrix.columns.name = None @@ -499,21 +462,22 @@ def _peptides_long_from_df( if sample_annotation_df is not None: obs, annotation_order = _merge_sample_annotations( - obs, sample_annotation_df, - column_aliases, verbose, + obs, + sample_annotation_df, + column_aliases, + verbose, ) # -- Build .var var = pd.DataFrame(index=intensity_matrix.columns) var.index.name = None var["peptide_id"] = var.index - var["protein_id"] = ( - peptide_to_protein.loc[var.index].values - ) + var["protein_id"] = peptide_to_protein.loc[var.index].values if peptide_annotation_df is not None: var = _merge_var_annotations( - var, peptide_annotation_df, + var, + peptide_annotation_df, id_column="peptide_id", column_aliases=column_aliases, rename_keys=["peptide_id", "protein_id"], @@ -524,14 +488,19 @@ def _peptides_long_from_df( # -- Reorder observations if sort_obs_by_annotation: intensity_matrix, obs = _reorder_observations( - intensity_matrix, obs, - annotation_order, default_obs_order, + intensity_matrix, + obs, + annotation_order, + default_obs_order, ) # -- Build AnnData return _finalize_adata( - intensity_matrix, obs, var, - zero_to_na, "peptide", + intensity_matrix, + obs, + var, + zero_to_na, + "peptide", ) @@ -580,9 +549,7 @@ def _proteins_long_from_df( zero_to_na=zero_to_na, ) - default_obs_order = ( - df["sample_id"].drop_duplicates().tolist() - ) + default_obs_order = df["sample_id"].drop_duplicates().tolist() annotation_order = None # -- Build .X @@ -592,13 +559,9 @@ def _proteins_long_from_df( values="intensity", ) intensity_matrix = intensity_matrix.astype(float) - intensity_matrix = ( - intensity_matrix.sort_index().sort_index(axis=1) - ) + intensity_matrix = intensity_matrix.sort_index().sort_index(axis=1) if fill_na is not None: - intensity_matrix = intensity_matrix.fillna( - float(fill_na) - ) + intensity_matrix = intensity_matrix.fillna(float(fill_na)) intensity_matrix.index.name = None intensity_matrix.columns.name = None @@ -608,8 +571,10 @@ def _proteins_long_from_df( if sample_annotation_df is not None: obs, annotation_order = _merge_sample_annotations( - obs, sample_annotation_df, - column_aliases, verbose, + obs, + sample_annotation_df, + column_aliases, + verbose, ) # -- Build .var @@ -619,7 +584,8 @@ def _proteins_long_from_df( if protein_annotation_df is not None: var = _merge_var_annotations( - var, protein_annotation_df, + var, + protein_annotation_df, id_column="protein_id", column_aliases=column_aliases, rename_keys=["protein_id"], @@ -630,14 +596,19 @@ def _proteins_long_from_df( # -- Reorder observations if sort_obs_by_annotation: intensity_matrix, obs = _reorder_observations( - intensity_matrix, obs, - annotation_order, default_obs_order, + intensity_matrix, + obs, + annotation_order, + default_obs_order, ) # -- Build AnnData return _finalize_adata( - intensity_matrix, obs, var, - zero_to_na, "protein", + intensity_matrix, + obs, + var, + zero_to_na, + "protein", ) @@ -654,8 +625,8 @@ def long( sort_obs_by_annotation: bool = False, verbose: bool = False, ) -> ad.AnnData: - """Read long-format peptide or protein tabular data into an - AnnData container. + """Read long-format peptide or protein tabular data into an AnnData + container. The ``intensities`` table must be in long format with one row per (sample, feature) measurement. Required columns differ by level: @@ -836,8 +807,7 @@ def long( # -- Validate arguments if level is None: raise ValueError( - "level is required; expected 'peptide' or " - "'protein'." + "level is required; expected 'peptide' or " "'protein'." ) level_normalised = level.lower() @@ -848,19 +818,21 @@ def long( ) if fill_na is not None and zero_to_na: - raise ValueError( - "fill_na and zero_to_na are mutually exclusive." - ) + raise ValueError("fill_na and zero_to_na are mutually exclusive.") if column_map: if level_normalised == "peptide": valid_keys = { - "sample_id", "intensity", - "peptide_id", "protein_id", + "sample_id", + "intensity", + "peptide_id", + "protein_id", } else: valid_keys = { - "sample_id", "intensity", "protein_id", + "sample_id", + "intensity", + "protein_id", } invalid = set(column_map).difference(valid_keys) if invalid: diff --git a/proteopy/tl/__init__.py b/proteopy/tl/__init__.py index 687f985..9819be9 100644 --- a/proteopy/tl/__init__.py +++ b/proteopy/tl/__init__.py @@ -3,10 +3,10 @@ peptide_dendograms_by_correlation, peptide_clusters_from_dendograms, proteoform_scores, - ) +) from .stat_tests import differential_abundance from .clustering import ( hclustv_tree, hclustv_cluster_ann, hclustv_profiles, - ) +) diff --git a/proteopy/tl/clustering.py b/proteopy/tl/clustering.py index bc0a461..3353f20 100644 --- a/proteopy/tl/clustering.py +++ b/proteopy/tl/clustering.py @@ -23,8 +23,8 @@ def _validate_linkage_and_values( Z: np.ndarray, values_df: pd.DataFrame, ) -> None: - """ - Validate linkage matrix and values DataFrame for clustering operations. + """Validate linkage matrix and values DataFrame for clustering + operations. Parameters ---------- @@ -82,8 +82,8 @@ def hclustv_tree( key_added: str | None = None, verbose: bool = True, ) -> ad.AnnData | None: - """ - Perform hierarchical clustering on variables (peptides or proteins). + """Perform hierarchical clustering on variables (peptides or + proteins). Computes a linkage matrix from variable profiles across samples or groups, storing the result in ``adata.uns`` for downstream visualization or analysis. @@ -214,7 +214,9 @@ def hclustv_tree( duplicates = [v for v in selected_vars if v in seen or seen.add(v)] if duplicates: raise ValueError( - f"Duplicate variables in selected_vars: {list(set(duplicates))}" + f"Duplicate variables in selected_vars: { + list( + set(duplicates))}" ) missing_vars = [v for v in selected_vars if v not in df.columns] if missing_vars: @@ -347,14 +349,14 @@ def hclustv_tree( def hclustv_cluster_ann( adata: ad.AnnData, k: int, - linkage_key: str = 'auto', - values_key: str = 'auto', + linkage_key: str = "auto", + values_key: str = "auto", inplace: bool = True, key_added: str | None = None, verbose: bool = True, ) -> ad.AnnData | None: - """ - Annotate variables with cluster assignments from hierarchical clustering. + """Annotate variables with cluster assignments from hierarchical + clustering. Uses :func:`scipy.cluster.hierarchy.fcluster` to cut the dendrogram at ``k`` clusters and stores cluster assignments in ``.var``. @@ -501,7 +503,7 @@ def hclustv_cluster_ann( def hclustv_profiles( adata: ad.AnnData, - cluster_key: str = 'auto', + cluster_key: str = "auto", layer: str | None = None, group_by: str | None = None, method: str = "median", @@ -512,8 +514,7 @@ def hclustv_profiles( key_added: str | None = None, verbose: bool = True, ) -> ad.AnnData | None: - """ - Compute cluster profiles from cluster annotations. + """Compute cluster profiles from cluster annotations. Summarizes variables within each cluster using mean or median to create cluster profile intensities across all observations. @@ -595,9 +596,7 @@ def hclustv_profiles( method = method.lower() if method not in ("mean", "median"): - raise ValueError( - f"method must be 'mean' or 'median', got '{method}'." - ) + raise ValueError(f"method must be 'mean' or 'median', got '{method}'.") resolved_key = _resolve_hclustv_cluster_key( adata, @@ -654,7 +653,9 @@ def hclustv_profiles( clusters = sorted(clusters) if len(clusters) == 0: - raise ValueError("No cluster assignments found in the specified column.") + raise ValueError( + "No cluster assignments found in the specified column." + ) # Compute profiles for each cluster cluster_profiles = {} diff --git a/proteopy/tl/copf.py b/proteopy/tl/copf.py index 923230b..9fe43f7 100644 --- a/proteopy/tl/copf.py +++ b/proteopy/tl/copf.py @@ -2,7 +2,6 @@ import copy as copym import numpy as np import pandas as pd -import anndata as ad from scipy import stats from sklearn.cluster import AgglomerativeClustering from scipy.stats import norm @@ -19,10 +18,9 @@ def pairwise_peptide_correlations_( sample_column="filename", peptide_column="peptide_id", value_column="intensity", - ): - ''' - Calculate pairwise peptide correlations. - Only outputs unique (non-symmetrical) correlations. +): + """Calculate pairwise peptide correlations. Only outputs unique + (non-symmetrical) correlations. Parameters: - df (pandas.DataFrame): The input DataFrame containing the data. @@ -34,13 +32,15 @@ def pairwise_peptide_correlations_( - result (pandas.DataFrame): A DataFrame containing the pairwise peptide correlations. Columns: 'pepA', 'pepB', 'PCC' (Pearson correlation coefficient). Only outputs unique (non-symmetrical) correlations (AB, not AB, B-A, AA, BB). - ''' + """ # TODO: modify df input to be obs x vars. Here we have redundant steps with # AnnDataTrces pairwise_peptide_correlations() df = df[[sample_column, peptide_column, value_column]] - pivot_df = df.pivot_table(index=sample_column, columns=peptide_column, values=value_column) + pivot_df = df.pivot_table( + index=sample_column, columns=peptide_column, values=value_column + ) columns = pivot_df.columns.tolist() corr_dict = {} @@ -49,13 +49,17 @@ def pairwise_peptide_correlations_( pivot_col_a = pivot_df.loc[:, col_a] pivot_col_b = pivot_df.loc[:, col_b] - corr_dict[col_a + '_' + col_b] = stats.pearsonr(pivot_col_a, pivot_col_b) + corr_dict[col_a + "_" + col_b] = stats.pearsonr( + pivot_col_a, pivot_col_b + ) - corr_df = pd.DataFrame.from_dict(corr_dict, orient='index') - corr_df.columns = ['PCC', 'p-value'] - corr_df['peptide_pair'] = corr_df.index - corr_df[['pepA', 'pepB']] = corr_df['peptide_pair'].str.split('_', expand=True) - corr_df = corr_df[["pepA","pepB","PCC"]] + corr_df = pd.DataFrame.from_dict(corr_dict, orient="index") + corr_df.columns = ["PCC", "p-value"] + corr_df["peptide_pair"] = corr_df.index + corr_df[["pepA", "pepB"]] = corr_df["peptide_pair"].str.split( + "_", expand=True + ) + corr_df = corr_df[["pepA", "pepB", "PCC"]] corr_df = corr_df.reset_index(drop=True) return corr_df @@ -63,19 +67,19 @@ def pairwise_peptide_correlations_( def pairwise_peptide_correlations( adata, - protein_id='protein_id', + protein_id="protein_id", inplace=True, copy=False, - batch_key: str | None = None, # per-batch if provided → always pooled - min_contrib_batches: int = 1, # pooling threshold - min_wsum: float = 0.0, # pooling threshold on sum(n_b-3) - ): + batch_key: str | None = None, # per-batch if provided → always pooled + min_contrib_batches: int = 1, # pooling threshold + min_wsum: float = 0.0, # pooling threshold on sum(n_b-3) +): if inplace and copy: - raise ValueError('Arguments raise and copy are mutually exclusive') + raise ValueError("Arguments raise and copy are mutually exclusive") if protein_id not in adata.var.columns: - raise ValueError(f'protein_id: {protein_id} not in .var.columns') + raise ValueError(f"protein_id: {protein_id} not in .var.columns") STORE_KEY = "pairwise_peptide_correlations" PER_BATCH_STORE_KEY = "pairwise_peptide_correlations_by_batch" @@ -93,34 +97,38 @@ def _finalize(out, per_batch=None): adata.uns[PER_BATCH_STORE_KEY] = per_batch return return out - + def compute_corrs(df): corrs = pairwise_peptide_correlations_( df, - sample_column='obs_id', - peptide_column='var_id', - value_column='intensity') + sample_column="obs_id", + peptide_column="var_id", + value_column="intensity", + ) return corrs - anns = adata.var[['protein_id']].reset_index() + anns = adata.var[["protein_id"]].reset_index() traces_df = adata.to_df().T.reset_index() - traces_df = traces_df.merge(anns, on='index') - traces_df = traces_df.rename(columns={'index': 'var_id'}) + traces_df = traces_df.merge(anns, on="index") + traces_df = traces_df.rename(columns={"index": "var_id"}) # TODO: remove unnecessary step of melting which gets unmelted # in protein-level function traces_df = pd.melt( traces_df, - id_vars=['protein_id', 'var_id'], - var_name='obs_id', - value_name='intensity') + id_vars=["protein_id", "var_id"], + var_name="obs_id", + value_name="intensity", + ) if batch_key is None: - corrs = traces_df.groupby('protein_id', observed=True).apply(compute_corrs, include_groups=False) + corrs = traces_df.groupby("protein_id", observed=True).apply( + compute_corrs, include_groups=False + ) corrs = corrs.droplevel(1, axis=0) - corrs = corrs.sort_values(['pepA', 'pepB']).sort_index() + corrs = corrs.sort_values(["pepA", "pepB"]).sort_index() return _finalize(corrs) if batch_key not in adata.obs.columns: @@ -129,64 +137,77 @@ def compute_corrs(df): batches = ( adata.obs[[batch_key]] .reset_index() - .rename(columns={'index': 'obs_id', batch_key: 'batch_id'}) + .rename(columns={"index": "obs_id", batch_key: "batch_id"}) ) - long = traces_df.merge(batches, on='obs_id', how='left') + long = traces_df.merge(batches, on="obs_id", how="left") batch_sizes = adata.obs[batch_key].value_counts().to_dict() batch_weights = {b: max(n - 3.0, 0.0) for b, n in batch_sizes.items()} - per_batch = ( - long - .groupby([protein_id, 'batch_id'], observed=True) - .apply(compute_corrs, include_groups=False) + per_batch = long.groupby([protein_id, "batch_id"], observed=True).apply( + compute_corrs, include_groups=False ) if per_batch.empty: - per_batch_df = pd.DataFrame(columns=['pepA', 'pepB', 'PCC']) - per_batch_df.index = pd.MultiIndex.from_tuples([], names=[protein_id, 'batch_id']) + per_batch_df = pd.DataFrame(columns=["pepA", "pepB", "PCC"]) + per_batch_df.index = pd.MultiIndex.from_tuples( + [], names=[protein_id, "batch_id"] + ) else: per_batch_df = ( - per_batch - .reset_index(level=2, drop=True) - .sort_values(['pepA', 'pepB']) + per_batch.reset_index(level=2, drop=True) + .sort_values(["pepA", "pepB"]) .sort_index() ) # Fisher pooling across batches rows = [] - for prot, gprot in per_batch_df.reset_index().groupby(protein_id, observed=True, sort=False): - for (pa, pb), gp in gprot.groupby(['pepA', 'pepB'], observed=True, sort=False): - r = gp['PCC'].to_numpy(dtype=float) - bids = gp['batch_id'].to_numpy() + for prot, gprot in per_batch_df.reset_index().groupby( + protein_id, observed=True, sort=False + ): + for (pa, pb), gp in gprot.groupby( + ["pepA", "pepB"], observed=True, sort=False + ): + r = gp["PCC"].to_numpy(dtype=float) + bids = gp["batch_id"].to_numpy() r = np.clip(r, -0.999999, 0.999999) z = np.arctanh(r) - w = np.array([batch_weights.get(b, 0.0) for b in bids], dtype=float) + w = np.array( + [batch_weights.get(b, 0.0) for b in bids], dtype=float + ) mask = w > 0 if not np.any(mask): continue - w = w[mask]; z = z[mask] + w = w[mask] + z = z[mask] wsum = float(w.sum()) if (mask.sum() >= min_contrib_batches) and (wsum >= min_wsum): - # Fixed-effects mean (zbar_fe) and weighted between-batch variance (var_z_between) + # Fixed-effects mean (zbar_fe) and weighted between-batch + # variance (var_z_between) zbar_fe = float((w * z).sum() / wsum) Q = float((w * (z - zbar_fe) ** 2).sum()) var_z_between = Q / wsum - # Conservative PCC from fixed-effects mean (no DL): shift by var_z_between + # Conservative PCC from fixed-effects mean (no DL): shift by + # var_z_between rhat = float(np.tanh(zbar_fe - var_z_between)) rows.append((prot, pa, pb, rhat, var_z_between)) if rows: pooled_df = ( - pd.DataFrame(rows, columns=[protein_id, 'pepA', 'pepB', 'PCC', 'var_z_between']) + pd.DataFrame( + rows, + columns=[protein_id, "pepA", "pepB", "PCC", "var_z_between"], + ) .set_index(protein_id) - .sort_values(['pepA', 'pepB']) + .sort_values(["pepA", "pepB"]) .sort_index() ) else: - pooled_df = pd.DataFrame(columns=['pepA', 'pepB', 'PCC', 'var_z_between']) + pooled_df = pd.DataFrame( + columns=["pepA", "pepB", "PCC", "var_z_between"] + ) pooled_df.index.name = protein_id return _finalize(pooled_df, per_batch=per_batch_df) @@ -194,11 +215,9 @@ def compute_corrs(df): def peptide_dendograms_by_correlation_( df, - method: str = 'agglomerative-hierarchical-clustering', - ): - ''' - Perform peptide clustering grouped by protein annotation. - + method: str = "agglomerative-hierarchical-clustering", +): + """Perform peptide clustering grouped by protein annotation. Parameters: ---------- @@ -220,24 +239,26 @@ def peptide_dendograms_by_correlation_( The two ids included for every step represent the index of the peptide in 'labels'. - heights: The height of each merging step in 'merge'. The idx of the height corresponds to the index of the step in 'merge'. - ''' + """ assert all(df.index == df.columns) - model = AgglomerativeClustering(n_clusters=None, - metric='precomputed', - linkage='average', - distance_threshold=0, - compute_distances=True) + model = AgglomerativeClustering( + n_clusters=None, + metric="precomputed", + linkage="average", + distance_threshold=0, + compute_distances=True, + ) model.fit(df) # pylint: disable=no-member dendogram = { - 'type': 'sklearn_agglomerative_clustering', - 'labels': model.feature_names_in_.tolist(), - 'heights': model.distances_.tolist(), - 'merge': model.children_.tolist() + "type": "sklearn_agglomerative_clustering", + "labels": model.feature_names_in_.tolist(), + "heights": model.distances_.tolist(), + "merge": model.children_.tolist(), } # pylint: enable=no-member @@ -246,59 +267,54 @@ def peptide_dendograms_by_correlation_( def peptide_dendograms_by_correlation( adata, - method='agglomerative-hierarchical-clustering', + method="agglomerative-hierarchical-clustering", inplace=True, copy=False, - ): +): if inplace and copy: - raise ValueError('Arguments raise and copy are mutually exclusive') - - - if 'pairwise_peptide_correlations' not in adata.uns: - raise ValueError(f'pairwise_peptide_correlations not in .uns') + raise ValueError("Arguments raise and copy are mutually exclusive") + if "pairwise_peptide_correlations" not in adata.uns: + raise ValueError("pairwise_peptide_correlations not in .uns") - corrs = adata.uns['pairwise_peptide_correlations'].copy() + corrs = adata.uns["pairwise_peptide_correlations"].copy() dends = {} - for protein_id, df in corrs.groupby('protein_id', observed=True): + for protein_id, df in corrs.groupby("protein_id", observed=True): corr_sym = reconstruct_corrs_df_symmetric_from_long_df( - df, - var_a_col='pepA', - var_b_col='pepB', - corr_col='PCC') + df, var_a_col="pepA", var_b_col="pepB", corr_col="PCC" + ) corr_dists = 1 - corr_sym dends[protein_id] = peptide_dendograms_by_correlation_( - corr_dists, - method= 'agglomerative-hierarchical-clustering') + corr_dists, method="agglomerative-hierarchical-clustering" + ) if inplace: - adata.uns['dendograms'] = dends + adata.uns["dendograms"] = dends elif copy: adata_new = adata.copy() - adata_new.uns['dendograms'] = dends + adata_new.uns["dendograms"] = dends return adata_new - + else: return dends def peptide_clusters_from_dendograms_( - dendogram, - n_clusters=2, - min_peptides_per_cluster=2, - noise=1e6, - ): - ''' - Cut clusters from cluster_peptides into N clusters with more than 1 peptide. - ''' - n_peptides = len(dendogram['labels']) + dendogram, + n_clusters=2, + min_peptides_per_cluster=2, + noise=1e6, +): + """Cut clusters from cluster_peptides into N clusters with more than + 1 peptide.""" + n_peptides = len(dendogram["labels"]) n_real_clusters = 0 k = n_clusters cluster_tree = BinaryClusterTree(constructor=dendogram) @@ -319,7 +335,7 @@ def peptide_clusters_from_dendograms_( # Rename cluster_ids to systematic format max_cluster = clusters.max() - cats = clusters.astype('category').cat.categories + cats = clusters.astype("category").cat.categories n_clusters = len(cats) if max_cluster != n_clusters: @@ -339,66 +355,59 @@ def peptide_clusters_from_dendograms( noise=NOISE, inplace=True, copy=False, - ): +): if inplace and copy: - raise ValueError('Arguments raise and copy are mutually exclusive') + raise ValueError("Arguments raise and copy are mutually exclusive") - if 'dendograms' not in adata.uns: - raise ValueError(f'dendograms not in .uns') + if "dendograms" not in adata.uns: + raise ValueError("dendograms not in .uns") var = adata.var.copy() - var['cluster_id'] = np.nan + var["cluster_id"] = np.nan clusters_ann = {} - dends = adata.uns['dendograms'] + dends = adata.uns["dendograms"] for prot, dend in dends.items(): dend_upd = copym.deepcopy(dend) - dend_upd['type'] = 'sklearn_agglomerative_clustering' + dend_upd["type"] = "sklearn_agglomerative_clustering" clusters = peptide_clusters_from_dendograms_( - dend_upd, - n_clusters=2, - min_peptides_per_cluster=2, - noise=noise) + dend_upd, n_clusters=2, min_peptides_per_cluster=2, noise=noise + ) - mask = (var['protein_id'] == prot) & (var.index.isin(clusters.index)) - var.loc[mask, 'cluster_id'] = clusters.reindex(var.index[mask]) + mask = (var["protein_id"] == prot) & (var.index.isin(clusters.index)) + var.loc[mask, "cluster_id"] = clusters.reindex(var.index[mask]) clusters_ann[prot] = clusters - assert not any((var['cluster_id'] == -1).tolist()) + assert not any((var["cluster_id"] == -1).tolist()) - var['proteoform_id'] = ( - var['protein_id'].astype(str) + - '_' + - var['cluster_id'].astype(int).astype(str) - ) + var["proteoform_id"] = ( + var["protein_id"].astype(str) + + "_" + + var["cluster_id"].astype(int).astype(str) + ) if inplace: - adata.uns['clusters'] = clusters_ann + adata.uns["clusters"] = clusters_ann adata.var = var elif copy: adata_new = adata.copy() - adata_new.uns['clusters'] = clusters_ann + adata_new.uns["clusters"] = clusters_ann return adata_new - + else: return clusters_ann def proteoform_scores_( - corrs, - clusters, - n_fractions, - summary_func=np.mean, - noise=NOISE - ): - ''' - Calculates a score for proteoforms based on the difference of within - cluster distances and between cluster distances. + corrs, clusters, n_fractions, summary_func=np.mean, noise=NOISE +): + """Calculates a score for proteoforms based on the difference of + within cluster distances and between cluster distances. IMPORTANT: currently only implemented properly for n_clusters = 2 @@ -410,7 +419,7 @@ def proteoform_scores_( n_fractions (int): Number of samples. summary_func (Callable): Summary function to apply to intra- and inter- cluster correlation coefficients. - ''' + """ def replace_upper_triangle(df, replacement, k=0): arr = df.to_numpy().astype(float) @@ -422,15 +431,15 @@ def replace_upper_triangle(df, replacement, k=0): return new_df if isinstance(clusters, pd.DataFrame): - clusters = clusters['cluster'] + clusters = clusters["cluster"] if np.issubdtype(clusters.dtype, np.floating): clusters = clusters.astype(int) assert any(corrs.index == corrs.columns) - assert all([i in clusters.index for i in corrs.index]), \ - f'clusters.index = {clusters.index}' \ - f'\ncorrs_index = {corrs.index}' + assert all([i in clusters.index for i in corrs.index]), ( + f"clusters.index = {clusters.index}" f"\ncorrs_index = {corrs.index}" + ) if (clusters == noise).all().all(): return np.array([0, np.nan, np.nan, np.nan]) @@ -440,7 +449,9 @@ def replace_upper_triangle(df, replacement, k=0): if len(cluster_ids) > 2: - raise ValueError('Functionality with n_clusters > 2 not implemented yet.') + raise ValueError( + "Functionality with n_clusters > 2 not implemented yet." + ) mat = corrs.copy(deep=True) stat_v = [] @@ -452,12 +463,16 @@ def replace_upper_triangle(df, replacement, k=0): clust_ids_ord = clust1_ids + clust2_ids mat_inv = corrs.loc[clust_ids_ord, clust_ids_ord] - cross = mat_inv.loc[clust1_ids, clust2_ids] # QUESTION: why no diagonal removal as below? + cross = mat_inv.loc[ + clust1_ids, clust2_ids + ] # QUESTION: why no diagonal removal as below? values = cross.to_numpy().flatten() values = values[~np.isnan(values)] stat_across = np.apply_along_axis(summary_func, 0, cross) - rows, cols = np.triu_indices_from(mat_inv, k=0) # k=1 excludes diagonal + rows, cols = np.triu_indices_from( + mat_inv, k=0 + ) # k=1 excludes diagonal mat_inv.to_numpy()[rows, cols] = np.nan within_c1 = mat_inv.loc[clust1_ids, clust1_ids] @@ -478,7 +493,9 @@ def replace_upper_triangle(df, replacement, k=0): z_stat_across = np.atanh(stat_across) z_diff_stat = z_stat_within - z_stat_across - dz = z_diff_stat / (np.sqrt((1 / (n_fractions-3)) + (1 / (n_fractions-3)))) + dz = z_diff_stat / ( + np.sqrt((1 / (n_fractions - 3)) + (1 / (n_fractions - 3))) + ) pval = 2 * (1 - norm.cdf(np.abs(dz))) stat_v.append([diff_stat, z_diff_stat, dz, pval]) @@ -523,7 +540,9 @@ def replace_upper_triangle(df, replacement, k=0): # T-test: intra-cluster peptide correlations are significantly different # from cross-cluster peptide correlations - dz = z_diff_stat / np.sqrt((1 / (n_fractions-3)) + (1 / (n_fractions-3))) + dz = z_diff_stat / np.sqrt( + (1 / (n_fractions - 3)) + (1 / (n_fractions - 3)) + ) pval = 2 * (1 - norm.cdf(np.abs(dz))) return np.array([diff_stat, z_diff_stat, dz, pval]) @@ -537,27 +556,26 @@ def proteoform_scores( noise=NOISE, inplace=True, copy=False, - ): +): if inplace and copy: - raise ValueError('Arguments raise and copy are mutually exclusive') - + raise ValueError("Arguments raise and copy are mutually exclusive") - if 'pairwise_peptide_correlations' not in adata.uns: - raise ValueError(f'pairwise_peptide_correlations not in .uns') + if "pairwise_peptide_correlations" not in adata.uns: + raise ValueError("pairwise_peptide_correlations not in .uns") - if 'dendograms' not in adata.uns: - raise ValueError(f'dendograms not in .uns') + if "dendograms" not in adata.uns: + raise ValueError("dendograms not in .uns") columns = [ - 'protein_id', - 'proteoform_score', - 'proteoform_score_z', - 'proteoform_score_dz', - 'proteoform_score_pval', - ] - - corrs = adata.uns['pairwise_peptide_correlations'].copy().reset_index() + "protein_id", + "proteoform_score", + "proteoform_score_z", + "proteoform_score_dz", + "proteoform_score_pval", + ] + + corrs = adata.uns["pairwise_peptide_correlations"].copy().reset_index() # pylint: disable=access-member-before-definition var = adata.var # pylint: enable=access-member-before-definition @@ -565,24 +583,22 @@ def proteoform_scores( proteoform_scores_list = [] - for prot, corrs_prot in corrs.groupby('protein_id', observed=True): + for prot, corrs_prot in corrs.groupby("protein_id", observed=True): corrs_mat = reconstruct_corrs_df_symmetric_from_long_df( - corrs_prot, - var_a_col='pepA', - var_b_col='pepB', - corr_col='PCC') + corrs_prot, var_a_col="pepA", var_b_col="pepB", corr_col="PCC" + ) - clusters = var.loc[var['protein_id'] == prot, 'cluster_id'] + clusters = var.loc[var["protein_id"] == prot, "cluster_id"] scores = proteoform_scores_( - corrs_mat, - clusters, - n_fractions, - summary_func=np.mean) + corrs_mat, clusters, n_fractions, summary_func=np.mean + ) - scores_entry = {column:value for column, value in zip(columns[1:5], scores)} - scores_entry['protein_id'] = prot + scores_entry = { + column: value for column, value in zip(columns[1:5], scores) + } + scores_entry["protein_id"] = prot scores_entry = pd.DataFrame([scores_entry]) proteoform_scores_list.append(scores_entry) @@ -591,53 +607,54 @@ def proteoform_scores( # Perform multiple-testing correction - mask_nonan = proteoform_scores['proteoform_score_pval'].notna() - pvals = proteoform_scores.loc[mask_nonan, 'proteoform_score_pval'] + mask_nonan = proteoform_scores["proteoform_score_pval"].notna() + pvals = proteoform_scores.loc[mask_nonan, "proteoform_score_pval"] bh_alpha = min_pval_adj if min_pval_adj is not None else 0.05 _, corrected_pvals, _, _ = multipletests( pvals, alpha=bh_alpha, - method='fdr_bh', + method="fdr_bh", ) - proteoform_scores['proteoform_score_pval_adj'] = np.nan - proteoform_scores['is_proteoform'] = np.nan + proteoform_scores["proteoform_score_pval_adj"] = np.nan + proteoform_scores["is_proteoform"] = np.nan - proteoform_scores.loc[ - pvals.index, 'proteoform_score_pval_adj' - ] = corrected_pvals + proteoform_scores.loc[pvals.index, "proteoform_score_pval_adj"] = ( + corrected_pvals + ) if min_pval_adj is not None or min_score is not None: is_pf = pd.Series(True, index=pvals.index) if min_pval_adj is not None: is_pf &= corrected_pvals <= min_pval_adj if min_score is not None: - scores = proteoform_scores.loc[pvals.index, 'proteoform_score'] + scores = proteoform_scores.loc[pvals.index, "proteoform_score"] is_pf &= scores >= min_score - proteoform_scores.loc[ - pvals.index, 'is_proteoform' - ] = is_pf.astype(int).values + proteoform_scores.loc[pvals.index, "is_proteoform"] = is_pf.astype( + int + ).values # --- drop existing score columns before merge (safe for re-runs) --- score_cols = [ - 'proteoform_score', - 'proteoform_score_z', - 'proteoform_score_dz', - 'proteoform_score_pval', - 'proteoform_score_pval_adj', - 'is_proteoform', + "proteoform_score", + "proteoform_score_z", + "proteoform_score_dz", + "proteoform_score_pval", + "proteoform_score_pval_adj", + "is_proteoform", ] var = var.drop(columns=[c for c in score_cols if c in var.columns]) # Add all new scores to .var var_upd = pd.merge( var, proteoform_scores, - on='protein_id', - how='left', - validate='many_to_one') + on="protein_id", + how="left", + validate="many_to_one", + ) - var_upd = var_upd.set_index('peptide_id', drop=False) + var_upd = var_upd.set_index("peptide_id", drop=False) var_upd.index.name = None assert (var.index == var_upd.index).all() @@ -649,6 +666,6 @@ def proteoform_scores( adata_new = adata.copy() adata_new.var = var_upd return adata_new - + else: return proteoform_scores diff --git a/proteopy/tl/stat_tests.py b/proteopy/tl/stat_tests.py index b0b5247..125ed2c 100644 --- a/proteopy/tl/stat_tests.py +++ b/proteopy/tl/stat_tests.py @@ -1,6 +1,4 @@ -""" -Statistical tests for differential abundance analysis. -""" +"""Statistical tests for differential abundance analysis.""" import warnings @@ -45,7 +43,10 @@ SUPPORTED_CORRECTIONS = [ "bonferroni", - "fdr_bh", "fdr", "bh", "benjamini_hochberg", + "fdr_bh", + "fdr", + "bh", + "benjamini_hochberg", ] MIN_SAMPLES_PER_GROUP = 3 @@ -57,8 +58,7 @@ def _validate_setup_two_group( obs_column, method: str, ) -> None: - """ - Validate setup dictionary for two-group comparison methods. + """Validate setup dictionary for two-group comparison methods. Parameters ---------- @@ -109,8 +109,7 @@ def _validate_setup_1vrest( group_by: str, obs_column: pd.Series, ) -> None: - """ - Validate setup dictionary for one-vs-rest comparison methods. + """Validate setup dictionary for one-vs-rest comparison methods. Parameters ---------- @@ -152,8 +151,7 @@ def _validate_setup_1vrest( # Check for duplicates if len(groups_spec) != len(set(groups_spec)): duplicates = [ - g for g in set(groups_spec) - if groups_spec.count(g) > 1 + g for g in set(groups_spec) if groups_spec.count(g) > 1 ] raise ValueError( f"setup['groups'] contains duplicate values: {duplicates}" @@ -178,8 +176,7 @@ def _perform_ttest( effective_space: str, equal_var: bool, ) -> dict: - """ - Perform t-test between two groups and compute summary statistics. + """Perform t-test between two groups and compute summary statistics. Parameters ---------- @@ -242,30 +239,31 @@ def _perform_ttest( mean2 = X2.mean(axis=0) # Flatten if needed (in case of matrix return) - if hasattr(mean1, 'A1'): + if hasattr(mean1, "A1"): mean1 = mean1.A1 - if hasattr(mean2, 'A1'): + if hasattr(mean2, "A1"): mean2 = mean2.A1 # Compute logFC - if effective_space == 'log': + if effective_space == "log": logfc = mean1 - mean2 else: # linear -> compute log2 fold change logfc = np.log2(mean1 / mean2) # Execute t-test tstats, pvals = stats.ttest_ind( - X1, X2, + X1, + X2, axis=0, equal_var=equal_var, ) return { - 'mean1': mean1, - 'mean2': mean2, - 'logfc': logfc, - 'tstat': tstats, - 'pval': pvals, + "mean1": mean1, + "mean2": mean2, + "logfc": logfc, + "tstat": tstats, + "pval": pvals, } @@ -279,8 +277,7 @@ def _execute_two_group_ttest( equal_var: bool, **kwargs, ) -> tuple[dict, str]: - """ - Execute two-sample t-test for differential abundance. + """Execute two-sample t-test for differential abundance. Parameters ---------- @@ -354,8 +351,7 @@ def _execute_one_vs_rest_ttest( equal_var: bool, **kwargs, ) -> list[tuple[dict, str]]: - """ - Execute one-vs-rest t-tests for differential abundance. + """Execute one-vs-rest t-tests for differential abundance. For each specified group, performs a t-test comparing that group against all other groups combined. @@ -448,8 +444,7 @@ def _perform_anova( groups_to_test: list, equal_var: bool = True, ) -> dict: - """ - Perform one-way ANOVA across groups for each variable. + """Perform one-way ANOVA across groups for each variable. Parameters ---------- @@ -513,16 +508,16 @@ def _perform_anova( ) results = { - 'fstat': fstats, - 'pval': pvals, + "fstat": fstats, + "pval": pvals, } # Add per-group means for group in groups_to_test: mean_vals = group_arrays[group].mean(axis=0) - if hasattr(mean_vals, 'A1'): + if hasattr(mean_vals, "A1"): mean_vals = mean_vals.A1 - results[f'mean_{sanitize_string(str(group))}'] = mean_vals + results[f"mean_{sanitize_string(str(group))}"] = mean_vals return results @@ -537,8 +532,7 @@ def _execute_anova( equal_var: bool = True, **kwargs, ) -> list[tuple[dict, str]]: - """ - Execute one-way ANOVA for differential abundance. + """Execute one-way ANOVA for differential abundance. Parameters ---------- @@ -604,9 +598,7 @@ def _execute_anova( if set(groups_to_test) == set(unique_groups): group_label = "all" else: - group_label = sanitize_string( - "_".join(sorted_groups) - ) + group_label = sanitize_string("_".join(sorted_groups)) return [(results, group_label)] @@ -632,8 +624,7 @@ def differential_abundance( fill_na: float | int | None = None, inplace: bool = True, ) -> AnnData | None: - """ - Perform differential abundance analysis between sample groups. + """Perform differential abundance analysis between sample groups. Compares expression values between groups using statistical tests. Computes log fold changes, p-values, and applies multiple testing @@ -817,7 +808,7 @@ def differential_abundance( f"Column '{group_by}' not found in adata.obs. " f"Available columns: {list(target.obs.columns)}" ) - + # Validate layer if layer is not None and layer not in target.layers: available_layers = list(target.layers.keys()) @@ -829,13 +820,18 @@ def differential_abundance( # Get data obs_column = target.obs[group_by] X_orig = target.layers[layer] if layer is not None else target.X - X = X_orig.toarray() if sparse.issparse(X_orig) else np.asarray(X_orig, dtype=float) + X = ( + X_orig.toarray() + if sparse.issparse(X_orig) + else np.asarray(X_orig, dtype=float) + ) # Validate and apply fill_na if fill_na is not None: if not isinstance(fill_na, (int, float)): raise ValueError( - f"Parameter 'fill_na' must be a number, got {type(fill_na).__name__}." + f"Parameter 'fill_na' must be a number, got { + type(fill_na).__name__}." ) X = np.nan_to_num(X, fill_na) @@ -882,11 +878,11 @@ def differential_abundance( data_is_log, _ = is_log_transformed(target, layer=layer) if data_is_log: - effective_space = 'log' + effective_space = "log" else: - effective_space = 'linear' + effective_space = "linear" - if space != 'auto' and space != effective_space: + if space != "auto" and space != effective_space: if force: effective_space = space else: @@ -897,7 +893,7 @@ def differential_abundance( ) X_proc = X.copy() - if effective_space == 'linear' and final_space == 'log': + if effective_space == "linear" and final_space == "log": pseudocount = 1 if np.any((X_proc + pseudocount) <= 0): raise ValueError( @@ -909,7 +905,7 @@ def differential_abundance( "Data treated as linear; applying log2 transform with " "pseudocount=1 for differential_abundance.", ) - effective_space = 'log' + effective_space = "log" if final_space != effective_space: raise ValueError( @@ -919,7 +915,7 @@ def differential_abundance( # Determine executor method_config = SUPPORTED_METHODS[method] - if method == 'ttest_two_sample' or method == 'welch': + if method == "ttest_two_sample" or method == "welch": # Determine comparison mode based on setup contents has_group_keys = "group1" in setup and "group2" in setup is_one_vs_rest = not has_group_keys @@ -930,10 +926,10 @@ def differential_abundance( else: executor_name = method_config["executor"] executor = METHOD_EXECUTORS[executor_name] - elif method in ('anova_oneway', 'anova_oneway_welch'): + elif method in ("anova_oneway", "anova_oneway_welch"): executor = METHOD_EXECUTORS["_execute_anova"] else: - raise ValueError('Not implemented yet') + raise ValueError("Not implemented yet") results_list = executor( X=X_proc, @@ -967,7 +963,7 @@ def differential_abundance( for results, group_label in results_list: # Multiple testing correction reject, pval_adj, _, _ = multipletests( - results['pval'], + results["pval"], alpha=alpha, method=correction_method, ) @@ -981,7 +977,9 @@ def differential_abundance( # Format: ;; or # ;;; if layer is used if layer_label is not None: - slot_name = f"{method_label};{group_by_label};{group_label};{layer_label}" + slot_name = ( + f"{method_label};{group_by_label};{group_label};{layer_label}" + ) else: slot_name = f"{method_label};{group_by_label};{group_label}" diff --git a/proteopy/utils/anndata.py b/proteopy/utils/anndata.py index 79f3ec4..c3654c4 100644 --- a/proteopy/utils/anndata.py +++ b/proteopy/utils/anndata.py @@ -10,8 +10,8 @@ def _has_infinite_values(X) -> bool: - """ - Check if the matrix X contains any infinite values (np.inf or -np.inf). + """Check if the matrix X contains any infinite values (np.inf or + -np.inf). Handles both dense numpy arrays and scipy sparse matrices. """ @@ -23,10 +23,8 @@ def _has_infinite_values(X) -> bool: def _axis_len(a, axis: int = 0) -> int: - """ - returns the length along `axis` using .shape if available, - otherwise falls back to len(a). - """ + """Returns the length along `axis` using .shape if available, + otherwise falls back to len(a).""" # Prefer shape if present (numpy, pandas, scipy.sparse, torch, etc.) shape = getattr(a, "shape", None) if shape is not None: @@ -39,18 +37,14 @@ def _axis_len(a, axis: int = 0) -> int: return int(len(a)) except Exception as e: raise TypeError( - ( - "Object of type " - f"{type(a)!r} does not expose a usable " - f"length along axis {axis}." - ) + "Object of type " + f"{type(a)!r} does not expose a usable " + f"length along axis {axis}." ) from e def _check_2d_shape(adata: AnnData) -> None: - """ - Ensure .X is 2-dimensional if present. - """ + """Ensure .X is 2-dimensional if present.""" if adata.X is not None: shp = getattr(adata.X, "shape", ()) if len(shp) != 2: @@ -60,18 +54,14 @@ def _check_2d_shape(adata: AnnData) -> None: def _check_axis_synchronization(adata: AnnData) -> None: - """ - Ensure obs/var are synchronized with obs_names/var_names and that - obsm/varm first dimensions match n_obs/n_vars respectively. - """ + """Ensure obs/var are synchronized with obs_names/var_names and that + obsm/varm first dimensions match n_obs/n_vars respectively.""" # obs axis if len(adata.obs) != len(adata.obs_names): raise ValueError( - ( - "Length of obs " - f"({len(adata.obs)}) does not match length of obs_names " - f"({len(adata.obs_names)})." - ) + "Length of obs " + f"({len(adata.obs)}) does not match length of obs_names " + f"({len(adata.obs_names)})." ) if not adata.obs.index.equals(adata.obs_names): raise ValueError("obs.index must exactly match obs_names.") @@ -79,11 +69,9 @@ def _check_axis_synchronization(adata: AnnData) -> None: # var axis if len(adata.var) != len(adata.var_names): raise ValueError( - ( - "Length of var " - f"({len(adata.var)}) does not match length of var_names " - f"({len(adata.var_names)})." - ) + "Length of var " + f"({len(adata.var)}) does not match length of var_names " + f"({len(adata.var_names)})." ) if not adata.var.index.equals(adata.var_names): raise ValueError("var.index must exactly match var_names.") @@ -93,10 +81,8 @@ def _check_axis_synchronization(adata: AnnData) -> None: n0 = _axis_len(arr, 0) if n0 != adata.n_obs: raise ValueError( - ( - f"obsm['{key}'] must have first dimension equal to " - f"n_obs ({adata.n_obs}), but has {n0}." - ) + f"obsm['{key}'] must have first dimension equal to " + f"n_obs ({adata.n_obs}), but has {n0}." ) # varm dimensions @@ -104,25 +90,20 @@ def _check_axis_synchronization(adata: AnnData) -> None: n0 = _axis_len(arr, 0) if n0 != adata.n_vars: raise ValueError( - ( - f"varm['{key}'] must have first dimension equal to " - f"n_vars ({adata.n_vars}), but has {n0}." - ) + f"varm['{key}'] must have first dimension equal to " + f"n_vars ({adata.n_vars}), but has {n0}." ) def _check_dimensions(adata: AnnData) -> None: - """ - Composite dimension/index checks for an AnnData object. - """ + """Composite dimension/index checks for an AnnData object.""" _check_2d_shape(adata) _check_axis_synchronization(adata) def _check_uniqueness(adata: AnnData, warn_only: bool = False) -> None: - """ - Check uniqueness of obs/var indices. - Raises a ValueError by default if duplicates are found. + """Check uniqueness of obs/var indices. Raises a ValueError by + default if duplicates are found. Parameters ---------- @@ -131,6 +112,7 @@ def _check_uniqueness(adata: AnnData, warn_only: bool = False) -> None: warn_only : bool, optional (default: False) If True, duplicates will only trigger warnings instead of errors. """ + def _handle_duplicates(axis_name: str, index): dup_mask = index.duplicated() if dup_mask.any(): @@ -153,16 +135,14 @@ def _handle_duplicates(axis_name: str, index): def _check_structure(adata: AnnData) -> None: - """ - High-level structure checks for an AnnData object. - """ + """High-level structure checks for an AnnData object.""" _check_uniqueness(adata) _check_dimensions(adata) def _var_column_matches_axis(adata: AnnData, column: str) -> bool: - """Return True when the chosen .var column exactly - matches both axis definitions.""" + """Return True when the chosen .var column exactly matches both axis + definitions.""" if column not in adata.var.columns: return False @@ -182,8 +162,8 @@ def _var_column_matches_axis(adata: AnnData, column: str) -> bool: def _has_multiple_values_per_cell( series: pd.Series, delimiters: str = " ,;" ) -> bool: - """Return True when any entry contains more than one - value separated by delimiters.""" + """Return True when any entry contains more than one value separated + by delimiters.""" if series.isna().any(): return True @@ -203,7 +183,8 @@ def _has_multiple_values_per_cell( def _validation_fail(msg, raise_error): - """Raise ValueError or return ``_FAIL`` depending on *raise_error*.""" + """Raise ValueError or return ``_FAIL`` depending on + *raise_error*.""" if raise_error: raise ValueError(msg) return _FAIL @@ -266,8 +247,7 @@ def _check_obs_requirements(adata, raise_error): ) misplaced_in_obs = [ - col for col in ("protein_id", "peptide_id") - if col in obs.columns + col for col in ("protein_id", "peptide_id") if col in obs.columns ] if misplaced_in_obs: return _validation_fail( @@ -364,8 +344,7 @@ def is_proteodata( raise_error: bool = False, layers: str | list[str] | None = None, ) -> tuple[bool, str | None]: - """ - Check whether the AnnData object stores peptide- or protein-level + """Check whether the AnnData object stores peptide- or protein-level proteomics data. Parameters @@ -447,8 +426,7 @@ def check_proteodata( *, layers: str | list[str] | None = None, ) -> tuple[bool, str | None]: - """ - Validate that *adata* satisfies ProteoPy assumptions, raising on + """Validate that *adata* satisfies ProteoPy assumptions, raising on failure. Thin wrapper around :func:`is_proteodata` with @@ -483,6 +461,7 @@ def check_proteodata( # sanitize_obs_cols helpers # ------------------------------------------------------------------ + def _is_missing(x): """Return True if *x* is a pandas-recognised missing value.""" try: @@ -508,7 +487,9 @@ def _to_jsonish(x, jsonize_complex): x = sorted(list(x)) try: return json.dumps( - x, default=str, ensure_ascii=False, + x, + default=str, + ensure_ascii=False, ) except Exception: return str(x) @@ -518,26 +499,24 @@ def _to_jsonish(x, jsonize_complex): def _coerce_series(s, jsonize_complex): """Coerce a single Series to an HDF5-writable dtype.""" if pd.api.types.is_bool_dtype(s): - return s.astype('boolean') + return s.astype("boolean") if pd.api.types.is_integer_dtype(s): - return s.astype('int64') + return s.astype("int64") if pd.api.types.is_float_dtype(s): - return s.astype('float64') + return s.astype("float64") if isinstance(s.dtype, pd.CategoricalDtype): return s if pd.api.types.is_object_dtype(s): only_strings = s.map( - lambda x: isinstance( - x, (str, np.str_) - ) or _is_missing(x) + lambda x: isinstance(x, (str, np.str_)) or _is_missing(x) ).all() if only_strings: - return s.astype('object') + return s.astype("object") out = s.map( lambda x: _to_jsonish(x, jsonize_complex), - ).astype('object') + ).astype("object") return out return s @@ -547,7 +526,7 @@ def sanitize_obs_cols( adata, jsonize_complex=True, ): - '''Sanitize anndata columns (in-place). + """Sanitize anndata columns (in-place). Makes all columns of adata.obs HDF5-writable by converting unsupported types. @@ -563,7 +542,7 @@ def sanitize_obs_cols( Args: jsonize_complex (bool): JSON-serialize lists/dicts/sets in object columns. - ''' + """ if adata.obs is not None and len(adata.obs.columns): obs = adata.obs.copy() for c in obs.columns: diff --git a/proteopy/utils/array.py b/proteopy/utils/array.py index 66e957f..87057d4 100644 --- a/proteopy/utils/array.py +++ b/proteopy/utils/array.py @@ -35,14 +35,9 @@ def _is_log_transformed_array( frac_negative = float(np.mean(vals < 0)) p95 = float(np.nanpercentile(vals, 95)) p5 = float(np.nanpercentile(vals, 5)) - dr_ratio = float( - (p95 - p5) / max(abs(p5), 1e-12) - ) + dr_ratio = float((p95 - p5) / max(abs(p5), 1e-12)) - is_log = ( - frac_negative >= neg_frac_thresh - or p95 <= p95_thresh - ) + is_log = frac_negative >= neg_frac_thresh or p95 <= p95_thresh stats = dict( frac_negative=frac_negative, @@ -56,13 +51,12 @@ def _is_log_transformed_array( def is_log_transformed( - adata, - layer=None, - neg_frac_thresh=5e-3, - p95_thresh=100.0, + adata, + layer=None, + neg_frac_thresh=5e-3, + p95_thresh=100.0, ): - """ - Heuristic detector for log-transformed matrices. + """Heuristic detector for log-transformed matrices. Returns ------- @@ -72,14 +66,8 @@ def is_log_transformed( {'frac_negative', 'p95', 'p5', 'dynamic_range_ratio', 'n_finite'} """ - Xsrc = ( - adata.layers[layer] if layer is not None - else adata.X - ) - X = ( - Xsrc.toarray() if sparse.issparse(Xsrc) - else np.asarray(Xsrc) - ) + Xsrc = adata.layers[layer] if layer is not None else adata.X + X = Xsrc.toarray() if sparse.issparse(Xsrc) else np.asarray(Xsrc) X = X.astype(float, copy=False) return _is_log_transformed_array( diff --git a/proteopy/utils/copf.py b/proteopy/utils/copf.py index 7f5a577..3a2b9c9 100644 --- a/proteopy/utils/copf.py +++ b/proteopy/utils/copf.py @@ -2,20 +2,22 @@ import pandas as pd -def reconstruct_corrs_df_symmetric_from_long_df(df, var_a_col=0, var_b_col=1, corr_col=2): - '''Reconstruct correlation dataframe in symmetrical matrix format. +def reconstruct_corrs_df_symmetric_from_long_df( + df, var_a_col=0, var_b_col=1, corr_col=2 +): + """Reconstruct correlation dataframe in symmetrical matrix format. Reconstruct a full correlation matrix from a long DataFrame containing asymmetric correlation data. - + Args: df (pd.DataFrame): DataFrame with columns for peptide A, peptide B, and their correlation value var_a_col (str | int): Name of column containing first peptide identifier var_b_col (str | int): Name of column containing second peptide identifier corr_col (str | int): Name of column containing correlation values - + Returns: pd.DataFrame: Fully symmetric correlation matrix as a pd.DataFrame with peptide labels as columns and rows. - ''' + """ if isinstance(var_a_col, int): var_a_col = df.columns[var_a_col] @@ -28,38 +30,43 @@ def reconstruct_corrs_df_symmetric_from_long_df(df, var_a_col=0, var_b_col=1, co all_peptides = set(df[var_a_col]).union(set(df[var_b_col])) all_peptides = sorted(list(all_peptides)) n = len(all_peptides) - + pep_to_idx = {pep: i for i, pep in enumerate(all_peptides)} - + # Init corr_matrix = np.full((n, n), np.nan) np.fill_diagonal(corr_matrix, 1.0) - + # Fill in the known correlation values for _, row in df.iterrows(): i = pep_to_idx[row[var_a_col]] j = pep_to_idx[row[var_b_col]] corr_matrix[i, j] = row[corr_col] - + # Fill in the symmetric values where possible for i in range(n): - for j in range(i+1, n): + for j in range(i + 1, n): if np.isnan(corr_matrix[i, j]) and not np.isnan(corr_matrix[j, i]): corr_matrix[i, j] = corr_matrix[j, i] - elif np.isnan(corr_matrix[j, i]) and not np.isnan(corr_matrix[i, j]): + elif np.isnan(corr_matrix[j, i]) and not np.isnan( + corr_matrix[i, j] + ): corr_matrix[j, i] = corr_matrix[i, j] elif np.isnan(corr_matrix[j, i]) and np.isnan(corr_matrix[i, j]): rev = {i: pep for pep, i in pep_to_idx.items()} - raise ValueError(( - f'Logical bug. For combination of peptides: {rev[i]} and ' - f'{rev[j]} there was no value found.' - )) - elif not np.isnan(corr_matrix[j, i]) and not np.isnan(corr_matrix[i, j]): - assert corr_matrix[i,j] == corr_matrix[j,i] - - - corr_df = pd.DataFrame(corr_matrix, index=all_peptides, columns=all_peptides) - + raise ValueError( + f"Logical bug. For combination of peptides: {rev[i]} and " + f"{rev[j]} there was no value found." + ) + elif not np.isnan(corr_matrix[j, i]) and not np.isnan( + corr_matrix[i, j] + ): + assert corr_matrix[i, j] == corr_matrix[j, i] + + corr_df = pd.DataFrame( + corr_matrix, index=all_peptides, columns=all_peptides + ) + return corr_df diff --git a/proteopy/utils/data_structures.py b/proteopy/utils/data_structures.py index e11448b..cfb0f4f 100644 --- a/proteopy/utils/data_structures.py +++ b/proteopy/utils/data_structures.py @@ -1,6 +1,7 @@ import pandas as pd import numpy as np + class ListDict(dict): def __getitem__(self, key): @@ -10,7 +11,6 @@ def __getitem__(self, key): return super().__getitem__(key) - def setdefault(self, key, default=None): if default is None: @@ -19,10 +19,9 @@ def setdefault(self, key, default=None): return super().setdefault(key, default) +class BinaryClusterTree: -class BinaryClusterTree(): - - class Node(): + class Node: def __init__(self, value=None, height=None): self.value = value @@ -33,7 +32,8 @@ def __init__(self, value=None, height=None): def __str__(self): left_val = self.left.value if self.left else None right_val = self.right.value if self.right else None - repr = f'Node {self.value}: children: left={left_val}, right={right_val}' + repr = f"Node { + self.value}: children: left={left_val}, right={right_val}" return repr def is_leaf(self): @@ -47,14 +47,14 @@ def __init__(self, constructor=None): self.constructor = constructor if constructor: - self.labels = constructor['labels'] + self.labels = constructor["labels"] self._init_from_constructor(constructor) def __len__(self): return self.size def find(self, value): - node = BinaryClusterTree._find(self.root, value) + node = BinaryClusterTree._find(self.root, value) if node is None: raise KeyError(value) @@ -71,7 +71,7 @@ def count_leaves(self): return BinaryClusterTree._count_leaves(self.root) def cut(self, k, use_labels=False): - + if self.root is None: raise ValueError() @@ -81,7 +81,9 @@ def cut(self, k, use_labels=False): for cluster_id, leaf_nodes in cluster_leaves_map.items(): cluster_pep_map[cluster_id] = [n.value for n in leaf_nodes] - pep_cluster_map = {p: c for c, peps in cluster_pep_map.items() for p in peps} + pep_cluster_map = { + p: c for c, peps in cluster_pep_map.items() for p in peps + } ds = pd.Series(pep_cluster_map) @@ -111,7 +113,7 @@ def _cut(node, k): max_height = -1 candidate_idx_in_queue = None - for (idx, n) in candidates: + for idx, n in candidates: if n.height > max_height: max_height = n.height candidate_idx_in_queue = idx @@ -137,41 +139,45 @@ def get_labels(self, cluster_ids): def _init_from_constructor(self, constructor): - match constructor['type']: + match constructor["type"]: - case 'sklearn_agglomerative_clustering': + case "sklearn_agglomerative_clustering": # Create binary tree from sklearn.cluster.AgglomerativeClustering object # From leaves up to root # leaves -> labels # nodes -> cluster numbers - children = constructor['merge'] + children = constructor["merge"] if not children: - raise ValueError(constructor['merge']) + raise ValueError(constructor["merge"]) - #labels = constructor['labels'] - heights = constructor['heights'] - n_samples = len(children) + 1 #len(labels) # == len(merge) + 1 + # labels = constructor['labels'] + heights = constructor["heights"] + n_samples = ( + len(children) + 1 + ) # len(labels) # == len(merge) + 1 # The root is the last merge operation - self.root = self._build_sklearn_tree(children, heights, n_samples, len(children) - 1) + self.root = self._build_sklearn_tree( + children, heights, n_samples, len(children) - 1 + ) self.size += n_samples + len(children) case _: - raise ValueError('Constructor type not supported') + raise ValueError("Constructor type not supported") def print_tree(self): if self.root is None: print("Empty tree") return - + BinaryClusterTree._print_node(self.root, labels=self.labels) @staticmethod def _find(node, value): - '''Breadth first approach''' + """Breadth first approach.""" if node is None: return None @@ -227,44 +233,51 @@ def count_children(node): count = (node.left is not None) + (node.right is not None) - if count not in (0,2): - raise ValueError('There are not 0 or 2 children') + if count not in (0, 2): + raise ValueError("There are not 0 or 2 children") return count def _build_sklearn_tree(self, children, heights, n_samples, merge_idx): if merge_idx < 0: raise ValueError(merge_idx) - - # Current merge creates node with value = n_samples + merge_idx (cluster ID) + + # Current merge creates node with value = n_samples + merge_idx + # (cluster ID) left_child_id, right_child_id = children[merge_idx] - node = BinaryClusterTree.Node(value=n_samples + merge_idx, - height=heights[merge_idx]) # value= n_samples + merge_idx, - #value=n_samples - merge_idx - 2, + node = BinaryClusterTree.Node( + value=n_samples + merge_idx, height=heights[merge_idx] + ) # value= n_samples + merge_idx, + # value=n_samples - merge_idx - 2, - # Handle left child if left_child_id < n_samples: # Left child is a leaf (original sample) - node.left = BinaryClusterTree.Node(value=left_child_id, - height=heights[merge_idx]) - + node.left = BinaryClusterTree.Node( + value=left_child_id, height=heights[merge_idx] + ) + else: # Left child is an internal node, recurse child_merge_idx = left_child_id - n_samples - node.left = self._build_sklearn_tree(children, heights, n_samples, child_merge_idx) - - # Handle right child + node.left = self._build_sklearn_tree( + children, heights, n_samples, child_merge_idx + ) + + # Handle right child if right_child_id < n_samples: # Right child is a leaf (original sample) - node.right = BinaryClusterTree.Node(value=right_child_id, - height=heights[merge_idx]) + node.right = BinaryClusterTree.Node( + value=right_child_id, height=heights[merge_idx] + ) else: # Right child is an internal node, recurse child_merge_idx = right_child_id - n_samples - node.right = self._build_sklearn_tree(children, heights, n_samples, child_merge_idx) - + node.right = self._build_sklearn_tree( + children, heights, n_samples, child_merge_idx + ) + return node @staticmethod @@ -272,23 +285,33 @@ def _print_node(node, indent=0, labels=None): if node is None: return - + # Print current node with indentation - height = node.height if node.height else '' - label = ' "' + str(labels[node.value]) + '"' if labels and node.is_leaf() else '' - print('(' + str(height) + ')' + ' ' * indent + str(node.value) + label) - + height = node.height if node.height else "" + label = ( + ' "' + str(labels[node.value]) + '"' + if labels and node.is_leaf() + else "" + ) + print( + "(" + str(height) + ")" + " " * indent + str(node.value) + label + ) + # Print children with increased indentation if node.left is not None or node.right is not None: if node.left is not None: - BinaryClusterTree._print_node(node.left, indent + 1, labels=labels) + BinaryClusterTree._print_node( + node.left, indent + 1, labels=labels + ) else: - print(' ' * (indent + 1) + 'None') - + print(" " * (indent + 1) + "None") + if node.right is not None: - BinaryClusterTree._print_node(node.right, indent + 1, labels=labels) + BinaryClusterTree._print_node( + node.right, indent + 1, labels=labels + ) else: - print(' ' * (indent + 1) + 'None') + print(" " * (indent + 1) + "None") diff --git a/proteopy/utils/functools.py b/proteopy/utils/functools.py index ffd16b7..ab4e7f6 100644 --- a/proteopy/utils/functools.py +++ b/proteopy/utils/functools.py @@ -2,7 +2,6 @@ import inspect from functools import partial, update_wrapper from textwrap import dedent -from typing import Dict, Set, Optional # ----------------------- # Docstring utilities @@ -11,25 +10,36 @@ _NUMPY_START = re.compile(r"^\s*Parameters\s*\Z", re.IGNORECASE) _NUMPY_EXAMPLES = re.compile(r"^\s*Examples?\s*\Z", re.IGNORECASE) _GOOGLE_START = re.compile(r"^\s*Args:\s*\Z") -_REST_PARAM = lambda name: re.compile(rf"^\s*:param\s+{re.escape(name)}\s*:") -_REST_TYPE = lambda name: re.compile(rf"^\s*:type\s+{re.escape(name)}\s*:") -def _numpy_param_re(name): # e.g. "greeting : str" or just "greeting" + +def _REST_PARAM(name): + return re.compile(rf"^\s*:param\s+{re.escape(name)}\s*:") + + +def _REST_TYPE(name): + return re.compile(rf"^\s*:type\s+{re.escape(name)}\s*:") + + +def _numpy_param_re(name): # e.g. "greeting : str" or just "greeting" return re.compile( rf"^(\s*){re.escape(name)}\s*(?::|\Z)", re.UNICODE, ) + def _google_param_re(name): # e.g. "greeting (str):" or "greeting:" return re.compile(rf"^(\s*){re.escape(name)}\s*(\([^)]+\))?\s*:\s*\Z") + def _strip_block_at(lines, start_idx, base_indent_str): + """Remove a parameter block starting at start_idx. + + A block is the start line plus following lines that are more + indented (or blank). """ - Remove a parameter block starting at start_idx. - A block is the start line plus following lines that are more indented (or blank). - """ + def indent_of(s: str) -> int: - return len(s) - len(s.lstrip(' ')) + return len(s) - len(s.lstrip(" ")) base = indent_of(lines[start_idx]) i = start_idx + 1 @@ -45,6 +55,7 @@ def indent_of(s: str) -> int: del lines[start_idx:i] return start_idx + def _remove_numpy_google_param_blocks(lines, name): i = 0 n = len(lines) @@ -57,13 +68,20 @@ def _remove_numpy_google_param_blocks(lines, name): # Section starts if _NUMPY_START.match(line): in_numpy, in_google = True, False - i += 1; continue + i += 1 + continue if _GOOGLE_START.match(line): in_numpy, in_google = False, True - i += 1; continue + i += 1 + continue # Section termination heuristics - if in_numpy and re.match(r"^\s*\S.*\Z", line) and line.strip().endswith(":") and line.strip().lower() not in {"parameters:"}: + if ( + in_numpy + and re.match(r"^\s*\S.*\Z", line) + and line.strip().endswith(":") + and line.strip().lower() not in {"parameters:"} + ): in_numpy = False if in_google and (line.strip() and not line.startswith(" ")): in_google = False @@ -84,21 +102,22 @@ def _remove_numpy_google_param_blocks(lines, name): i += 1 + def _remove_rest_param_lines(lines, name): i = 0 n = len(lines) param_re = _REST_PARAM(name) - type_re = _REST_TYPE(name) + type_re = _REST_TYPE(name) def remove_line_and_continuation(start_idx): - base_indent = len(lines[start_idx]) - len(lines[start_idx].lstrip(' ')) + base_indent = len(lines[start_idx]) - len(lines[start_idx].lstrip(" ")) j = start_idx + 1 while j < len(lines): ln = lines[j] if not ln.strip(): j += 1 continue - indent = len(ln) - len(ln.lstrip(' ')) + indent = len(ln) - len(ln.lstrip(" ")) if indent <= base_indent: break j += 1 @@ -113,14 +132,17 @@ def remove_line_and_continuation(start_idx): continue i += 1 -def _format_fixed_note(fixed_map: Dict[str, object], func_name: str) -> str: - # short, universal note appended to the docstring, now mentioning the original function + +def _format_fixed_note(fixed_map: dict[str, object], func_name: str) -> str: + # short, universal note appended to the docstring, now mentioning the + # original function kv = ", ".join(f"{k}={v!r}" for k, v in fixed_map.items()) return ( "Note:\n" f" This function is a partial of `{func_name}`, with the following arguments fixed: {kv}." ) + def _replace_doc_header(lines, new_header: str): header_text = dedent(new_header).strip() header_lines = header_text.splitlines() if header_text else [] @@ -145,6 +167,7 @@ def _replace_doc_header(lines, new_header: str): return header_lines + [""] + tail return tail + def _replace_doc_examples(lines, new_examples: str): examples_text = dedent(new_examples).strip() examples_lines = examples_text.splitlines() if examples_text else [] @@ -163,13 +186,7 @@ def _replace_doc_examples(lines, new_examples: str): # Strip trailing blank lines while result and result[-1].strip() == "": result.pop() - return ( - result - + [""] - + ["Examples"] - + ["--------"] - + examples_lines - ) + return result + [""] + ["Examples"] + ["--------"] + examples_lines return list(lines) # Find the end of the Examples section: skip underline then @@ -216,14 +233,15 @@ def _replace_doc_examples(lines, new_examples: str): return before + after -def _prune_docstring(doc: str, - fixed_names: Set[str], - add_note: bool, - fixed_map: Dict[str, object], - func_name: str, - docstr_header: Optional[str], - docstr_examples: Optional[str] = None, - ) -> str: +def _prune_docstring( + doc: str, + fixed_names: set[str], + add_note: bool, + fixed_map: dict[str, object], + func_name: str, + docstr_header: str | None, + docstr_examples: str | None = None, +) -> str: if doc: lines = dedent(doc).splitlines() else: @@ -264,39 +282,49 @@ def _prune_docstring(doc: str, return doc_out + # ----------------------- # Signature + metadata # ----------------------- -def _bound_fixed_map(func, args, kwargs) -> Dict[str, object]: + +def _bound_fixed_map(func, args, kwargs) -> dict[str, object]: sig = inspect.signature(func) bound = sig.bind_partial(*args, **kwargs) return dict(bound.arguments) -def _prune_signature(sig: inspect.Signature, fixed_names: Set[str]) -> inspect.Signature: - new_params = [p for name, p in sig.parameters.items() if name not in fixed_names] + +def _prune_signature( + sig: inspect.Signature, fixed_names: set[str] +) -> inspect.Signature: + new_params = [ + p for name, p in sig.parameters.items() if name not in fixed_names + ] return sig.replace(parameters=new_params) -def _prune_annotations(ann: Optional[dict], fixed_names: Set[str]) -> Optional[dict]: + +def _prune_annotations(ann: dict | None, fixed_names: set[str]) -> dict | None: if not ann: return None return {k: v for k, v in ann.items() if k not in fixed_names} + # ----------------------- # Public helper # ----------------------- + def partial_with_docsig( func, /, *args, add_fixed_note: bool = True, - docstr_header: Optional[str] = None, - docstr_examples: Optional[str] = None, + docstr_header: str | None = None, + docstr_examples: str | None = None, **kwargs, ): - """ - Create a functools.partial that: + """Create a functools.partial that: + - inherits metadata (__name__, __module__, __qualname__, __wrapped__, etc.) - removes fixed parameters from the displayed signature - removes fixed parameters' entries from the docstring (NumPy/Google/reST styles) @@ -328,18 +356,21 @@ def partial_with_docsig( p = partial(func, *args, **kwargs) update_wrapper(p, func) - fixed_map = _bound_fixed_map(func, args, kwargs) + fixed_map = _bound_fixed_map(func, args, kwargs) fixed_names = set(fixed_map) # Signature: remove fixed params p.__signature__ = _prune_signature(inspect.signature(func), fixed_names) # Annotations: drop fixed params (optional but tidy) - pruned_ann = _prune_annotations(getattr(func, "__annotations__", None), fixed_names) + pruned_ann = _prune_annotations( + getattr(func, "__annotations__", None), fixed_names + ) if pruned_ann is not None: p.__annotations__ = pruned_ann - # Docstring: remove fixed params docs + append note with original function name + # Docstring: remove fixed params docs + append note with original function + # name original_doc = inspect.getdoc(func) or func.__doc__ or "" p.__doc__ = _prune_docstring( original_doc, diff --git a/proteopy/utils/matplotlib.py b/proteopy/utils/matplotlib.py index a5105a4..96b2b2b 100644 --- a/proteopy/utils/matplotlib.py +++ b/proteopy/utils/matplotlib.py @@ -3,25 +3,30 @@ import matplotlib.pyplot as plt from matplotlib.colors import Colormap + def _resolve_color_scheme(color_scheme, labels): - """Convert a user-supplied color scheme into a list matching `labels` order.""" + """Convert a user-supplied color scheme into a list matching + `labels` order.""" labels_list = list(labels) if color_scheme is None: if not labels_list: return None - color_cycle = plt.rcParams.get('axes.prop_cycle') + color_cycle = plt.rcParams.get("axes.prop_cycle") if color_cycle is None: return None - default_colors = color_cycle.by_key().get('color', []) + default_colors = color_cycle.by_key().get("color", []) if not default_colors: return None - return [default_colors[i % len(default_colors)] for i in range(len(labels_list))] + return [ + default_colors[i % len(default_colors)] + for i in range(len(labels_list)) + ] if isinstance(color_scheme, np.ndarray): color_scheme = color_scheme.tolist() @@ -63,21 +68,21 @@ def _resolve_color_scheme(color_scheme, labels): ): color_list = list(color_scheme) if not color_list: - raise ValueError('color_scheme sequence cannot be empty.') + raise ValueError("color_scheme sequence cannot be empty.") if len(color_list) == 1: return color_list * len(labels_list) if len(color_list) < len(labels_list): raise ValueError( - 'color_scheme sequence must include at least as many colors as labels.' + "color_scheme sequence must include at least as many colors as labels." ) - return color_list[:len(labels_list)] + return color_list[: len(labels_list)] if callable(color_scheme): positions = np.linspace(0, 1, len(labels_list)) return [color_scheme(pos) for pos in positions] raise TypeError( - 'color_scheme must be a sequence of colors, a Matplotlib colormap, a ' - 'callable returning colors, a dictionary mapping categories to colors, ' - 'or a named Matplotlib palette.' + "color_scheme must be a sequence of colors, a Matplotlib colormap, a " + "callable returning colors, a dictionary mapping categories to colors, " + "or a named Matplotlib palette." ) diff --git a/proteopy/utils/pandas.py b/proteopy/utils/pandas.py index a5b1f58..784f779 100644 --- a/proteopy/utils/pandas.py +++ b/proteopy/utils/pandas.py @@ -10,7 +10,7 @@ def load_dataframe( data: str | Path | pd.DataFrame, sep: str | None = None, - ) -> pd.DataFrame: +) -> pd.DataFrame: """Load data from file path or return DataFrame directly. Parameters diff --git a/proteopy/utils/parsers.py b/proteopy/utils/parsers.py index 023d713..9396b63 100644 --- a/proteopy/utils/parsers.py +++ b/proteopy/utils/parsers.py @@ -1,6 +1,5 @@ import re import warnings -from typing import Dict, Optional, List import anndata as ad import numpy as np @@ -16,7 +15,9 @@ } -def parse_tumor_subclass(df: pd.DataFrame, col: str = "tumor_class") -> pd.DataFrame: +def parse_tumor_subclass( + df: pd.DataFrame, col: str = "tumor_class" +) -> pd.DataFrame: """ Parse a less-structured tumor_class column into: - main_tumor_type @@ -53,7 +54,6 @@ def parse_tumor_subclass(df: pd.DataFrame, col: str = "tumor_class") -> pd.DataF df = df.copy() df.index.name = None - # Compile patterns once # Genetic markers to capture (exact phrases) genetic_marker_patterns = [ @@ -64,26 +64,35 @@ def parse_tumor_subclass(df: pd.DataFrame, col: str = "tumor_class") -> pd.DataF ] # subclass and subtype helpers - subclass_bracket_pat = re.compile(r"\[([^\]]*subclass[^\]]*)\]", re.IGNORECASE) + subclass_bracket_pat = re.compile( + r"\[([^\]]*subclass[^\]]*)\]", re.IGNORECASE + ) subclass_pat = re.compile(r"\bsubclass\b[^\),;\]]*", re.IGNORECASE) - subtype_bracket_pat = re.compile(r"\[([^\]]*subtype[^\]]*)\]", re.IGNORECASE) + subtype_bracket_pat = re.compile( + r"\[([^\]]*subtype[^\]]*)\]", re.IGNORECASE + ) # 'subtype ...' subtype_after_pat = re.compile(r"\bsubtype\b[^\),;\]]*", re.IGNORECASE) # '... subtype' (capture up to 3 words before subtype) - subtype_before_pat = re.compile(r"(?:\b[\w/-]+\s+){1,3}\bsubtype\b", re.IGNORECASE) + subtype_before_pat = re.compile( + r"(?:\b[\w/-]+\s+){1,3}\bsubtype\b", re.IGNORECASE + ) # Splitter on comma or the word 'and' splitter = re.compile(r"\s*,\s*|\s+\band\b\s+", re.IGNORECASE) def strip_wrappers(s: str) -> str: s = s.strip() - # remove enclosing brackets or parentheses only if they enclose the whole chunk - if len(s) >= 2 and ((s[0] == "[" and s[-1] == "]") or (s[0] == "(" and s[-1] == ")")): + # remove enclosing brackets or parentheses only if they enclose the + # whole chunk + if len(s) >= 2 and ( + (s[0] == "[" and s[-1] == "]") or (s[0] == "(" and s[-1] == ")") + ): s = s[1:-1].strip() return s.strip(" ,;") - def dedupe_keep_order(items: List[str]) -> List[str]: + def dedupe_keep_order(items: list[str]) -> list[str]: seen = set() out = [] for x in items: @@ -98,7 +107,7 @@ def normalize_case(val: str) -> str: # Keep original chunk case for readability return val.strip() - def parse_one(value: Optional[str]) -> Dict[str, Optional[str]]: + def parse_one(value: str | None) -> dict[str, str | None]: if value is None or (isinstance(value, float) and np.isnan(value)): return { "main_tumor_type": None, @@ -109,11 +118,11 @@ def parse_one(value: Optional[str]) -> Dict[str, Optional[str]]: } remaining = str(value).strip() - markers: List[str] = [] - subclass_val: Optional[str] = None - subtype_val: Optional[str] = None - rest_parts: List[str] = [] - main_tumor_type: Optional[str] = None + markers: list[str] = [] + subclass_val: str | None = None + subtype_val: str | None = None + rest_parts: list[str] = [] + main_tumor_type: str | None = None while True: # Split into tokens @@ -130,7 +139,7 @@ def parse_one(value: Optional[str]) -> Dict[str, Optional[str]]: remaining_next = ", ".join(tokens[:-1]) chunk_work = chunk - consumed_spans: List[tuple] = [] + consumed_spans: list[tuple] = [] def record_span(m): if m: @@ -148,7 +157,8 @@ def record_span(m): subclass_val = normalize_case(m.group(0)) record_span(m) - # 2) subtype (first bracketed, then 'subtype ...', then '... subtype') + # 2) subtype (first bracketed, then 'subtype ...', then '... + # subtype') if subtype_val is None: m = subtype_bracket_pat.search(chunk_work) if m: @@ -173,15 +183,20 @@ def record_span(m): record_span(m) # Compute residual of this chunk after removing matches - residual = strip_wrappers(_remove_spans(chunk_work, consumed_spans)) + residual = strip_wrappers( + _remove_spans(chunk_work, consumed_spans) + ) if residual: rest_parts.append(residual) if remaining_next is None: # Final chunk: this defines main_tumor_type (after removing matched parts) - # If residual is empty (i.e., the entire chunk was a match), fall back to cleaned chunk - main_tumor_type = residual if residual else strip_wrappers(chunk_work) + # If residual is empty (i.e., the entire chunk was a match), + # fall back to cleaned chunk + main_tumor_type = ( + residual if residual else strip_wrappers(chunk_work) + ) break else: remaining = remaining_next @@ -205,7 +220,7 @@ def record_span(m): "rest": rest, } - def _remove_spans(text: str, spans: List[tuple]) -> str: + def _remove_spans(text: str, spans: list[tuple]) -> str: if not spans: return text spans_sorted = sorted(spans) @@ -223,12 +238,12 @@ def _remove_spans(text: str, spans: List[tuple]) -> str: parsed = df[col].apply(parse_one) parsed_df = pd.DataFrame(list(parsed)) df_list = [ - df.reset_index()[['index', col]], - parsed_df.reset_index(drop=True) + df.reset_index()[["index", col]], + parsed_df.reset_index(drop=True), ] - new_df = pd.concat(df_list, axis=1) - new_df = new_df.set_index('index') + new_df = pd.concat(df_list, axis=1) + new_df = new_df.set_index("index") # Add original index new_df = new_df.loc[df.index,] @@ -237,19 +252,22 @@ def _remove_spans(text: str, spans: List[tuple]) -> str: def diann_run(s, warn=False): - match = re.search(r'_(\d+)_T', s) + match = re.search(r"_(\d+)_T", s) if match: - return 'Run_' + match.group(1) + return "Run_" + match.group(1) - match = re.search(r'(?<=_)(?:N?\d{2,5}(?:_[A-Za-z0-9]+)*_[A-Za-z]+|N?\d{5}|N?\d{2}_\d{4}[A-Za-z]?_[A-Za-z]+)(?=_T1_DIA)', s) + match = re.search( + r"(?<=_)(?:N?\d{2,5}(?:_[A-Za-z0-9]+)*_[A-Za-z]+|N?\d{5}|N?\d{2}_\d{4}[A-Za-z]?_[A-Za-z]+)(?=_T1_DIA)", + s, + ) if match: - return 'Run_' + match.group(0) + return "Run_" + match.group(0) if warn: - warnings.warn(f'No match for string:\n{s}') - return 'no_parse_match' + warnings.warn(f"No match for string:\n{s}") + return "no_parse_match" - raise ValueError(f'No match for string:\n{s}') + raise ValueError(f"No match for string:\n{s}") def _pretty_design_label(label: str) -> str: @@ -260,8 +278,7 @@ def parse_stat_test_varm_slot( varm_slot: str, adata: ad.AnnData | None = None, ) -> dict[str, str | None]: - """ - Parse a stat-test varm slot name into its components. + """Parse a stat-test varm slot name into its components. The expected format is ``;;`` when no layer is used, or ``;;;`` when @@ -328,8 +345,7 @@ def parse_stat_test_varm_slot( if layer_part: if adata is not None and adata.layers: layer_map = { - sanitize_string(name): name - for name in adata.layers.keys() + sanitize_string(name): name for name in adata.layers.keys() } if layer_part in layer_map: layer = layer_map[layer_part] @@ -339,7 +355,7 @@ def parse_stat_test_varm_slot( f"must contain the sanitized layer part for back-" f"mapping. '{layer_part}' not found in adata varm layers" f"(unsanitized): {adata.layers}." - ) + ) else: layer = layer_part @@ -366,8 +382,7 @@ def parse_stat_test_varm_slot( ) else: raise ValueError( - "Design must use '_vs_' or " - "'_vs_rest'." + "Design must use '_vs_' or " "'_vs_rest'." ) test_info = { @@ -383,8 +398,7 @@ def parse_stat_test_varm_slot( def _is_standard_hclustv_key(key: str, key_type: str = "linkage") -> bool: - """ - Check if a key follows the standard hclust key format. + """Check if a key follows the standard hclust key format. Parameters ---------- @@ -404,8 +418,8 @@ def _is_standard_hclustv_key(key: str, key_type: str = "linkage") -> bool: def _parse_hclustv_key_components(key: str) -> tuple[str, str, str] | None: - """ - Extract (group_by, hash, layer) components from a standard hclust key. + """Extract (group_by, hash, layer) components from a standard hclust + key. Returns None if the key does not follow the standard format. """ @@ -419,27 +433,24 @@ def _parse_hclustv_key_components(key: str) -> tuple[str, str, str] | None: def _resolve_hclustv_keys( adata: ad.AnnData, - linkage_key: str = 'auto', - values_key: str = 'auto', + linkage_key: str = "auto", + values_key: str = "auto", verbose: bool = True, ) -> tuple[str, str]: - """ - Resolve linkage and values keys from adata.uns. + """Resolve linkage and values keys from adata.uns. Auto-detects keys if not provided, validates existence, and returns the resolved key names. """ linkage_candidates = [ - key for key in adata.uns.keys() - if key.startswith("hclustv_linkage;") + key for key in adata.uns.keys() if key.startswith("hclustv_linkage;") ] values_candidates = [ - key for key in adata.uns.keys() - if key.startswith("hclustv_values;") + key for key in adata.uns.keys() if key.startswith("hclustv_values;") ] - linkage_auto = linkage_key == 'auto' - values_auto = values_key == 'auto' + linkage_auto = linkage_key == "auto" + values_auto = values_key == "auto" if linkage_auto: if len(linkage_candidates) == 0: @@ -502,11 +513,10 @@ def _resolve_hclustv_keys( def _resolve_hclustv_cluster_key( adata: ad.AnnData, - cluster_key: str = 'auto', + cluster_key: str = "auto", verbose: bool = True, ) -> str: - """ - Resolve cluster annotation key from adata.var columns. + """Resolve cluster annotation key from adata.var columns. Auto-detects key if not provided, validates existence, and returns the resolved key name. @@ -536,11 +546,10 @@ def _resolve_hclustv_cluster_key( If the specified ``cluster_key`` is not found in ``adata.var``. """ cluster_candidates = [ - col for col in adata.var.columns - if col.startswith("hclustv_cluster;") + col for col in adata.var.columns if col.startswith("hclustv_cluster;") ] - if cluster_key == 'auto': + if cluster_key == "auto": if len(cluster_candidates) == 0: raise ValueError( "No cluster annotations found in adata.var. " @@ -566,11 +575,10 @@ def _resolve_hclustv_cluster_key( def _resolve_hclustv_profile_key( adata: ad.AnnData, - profile_key: str = 'auto', + profile_key: str = "auto", verbose: bool = True, ) -> str: - """ - Resolve cluster profile key from adata.uns. + """Resolve cluster profile key from adata.uns. Auto-detects key if not provided, validates existence, and returns the resolved key name. @@ -600,11 +608,10 @@ def _resolve_hclustv_profile_key( If the specified ``profile_key`` is not found in ``adata.uns``. """ profile_candidates = [ - key for key in adata.uns.keys() - if key.startswith("hclustv_profiles;") + key for key in adata.uns.keys() if key.startswith("hclustv_profiles;") ] - if profile_key == 'auto': + if profile_key == "auto": if len(profile_candidates) == 0: raise ValueError( "No cluster profiles found in adata.uns. " diff --git a/proteopy/utils/stat_tests.py b/proteopy/utils/stat_tests.py index 4609a1b..ad22eeb 100644 --- a/proteopy/utils/stat_tests.py +++ b/proteopy/utils/stat_tests.py @@ -12,14 +12,10 @@ def _validate_thresholds(fc_thresh, pval_thresh): if fc_thresh is not None and fc_thresh <= 0: - raise ValueError( - "fc_thresh must be a positive number." - ) + raise ValueError("fc_thresh must be a positive number.") if pval_thresh is not None: if pval_thresh <= 0 or pval_thresh > 1: - raise ValueError( - "pval_thresh must be in (0, 1]." - ) + raise ValueError("pval_thresh must be in (0, 1].") def _filter_volcano_data(fc_vals, pvals, labels, alt_color): @@ -27,20 +23,16 @@ def _filter_volcano_data(fc_vals, pvals, labels, alt_color): nan_mask = np.isnan(fc_vals) | np.isnan(pvals) if nan_mask.any(): warnings.warn( - "Dropping entries with NaN fold changes or " - "p-values.", + "Dropping entries with NaN fold changes or " "p-values.", RuntimeWarning, ) # Drop non-finite (inf, -inf) - nonfinite_mask = ( - ~np.isfinite(fc_vals) | ~np.isfinite(pvals) - ) + nonfinite_mask = ~np.isfinite(fc_vals) | ~np.isfinite(pvals) inf_only = nonfinite_mask & ~nan_mask if inf_only.any(): warnings.warn( - "Dropping entries with non-finite fold changes " - "or p-values.", + "Dropping entries with non-finite fold changes " "or p-values.", RuntimeWarning, ) @@ -49,8 +41,7 @@ def _filter_volcano_data(fc_vals, pvals, labels, alt_color): nonpos_new = nonpos_mask & ~nonfinite_mask if nonpos_new.any(): warnings.warn( - "Dropping non-positive p-values before log " - "transform.", + "Dropping non-positive p-values before log " "transform.", RuntimeWarning, ) @@ -63,9 +54,7 @@ def _filter_volcano_data(fc_vals, pvals, labels, alt_color): alt_color = alt_color[keep] if len(fc_vals) == 0: - raise ValueError( - "No valid results available for plotting." - ) + raise ValueError("No valid results available for plotting.") return fc_vals, pvals, labels, alt_color @@ -164,9 +153,7 @@ def _annotate_top_labels( ): abs_fc = np.abs(fc_vals) label_mask = ( - sig_mask - if fc_thresh is None - else sig_mask & (abs_fc >= fc_thresh) + sig_mask if fc_thresh is None else sig_mask & (abs_fc >= fc_thresh) ) idx = np.where(label_mask)[0] @@ -228,9 +215,7 @@ def _annotate_highlight_labels( highlight_labels, ): hl_set = set(highlight_labels) - hl_idx = np.where( - np.isin(labels, list(hl_set)) - )[0] + hl_idx = np.where(np.isin(labels, list(hl_set)))[0] # Warn about missing labels found = set(labels[hl_idx]) @@ -266,121 +251,86 @@ def _annotate_highlight_labels( def _validate_volcano_plot_inputs( # noqa: C901 - fc_vals, pvals, fc_thresh, pval_thresh, - labels, top_labels, highlight_labels, - alt_color, ax, + fc_vals, + pvals, + fc_thresh, + pval_thresh, + labels, + top_labels, + highlight_labels, + alt_color, + ax, ): """Validate all volcano-plot inputs.""" # -- Numeric arrays try: fc_vals = np.asarray(fc_vals, dtype=float) except (ValueError, TypeError): - raise ValueError( - "fc_vals must contain numeric values." - ) + raise ValueError("fc_vals must contain numeric values.") try: pvals = np.asarray(pvals, dtype=float) except (ValueError, TypeError): - raise ValueError( - "pvals must contain numeric values." - ) + raise ValueError("pvals must contain numeric values.") if fc_vals.ndim != 1: raise ValueError("fc_vals must be 1D.") if pvals.ndim != 1: raise ValueError("pvals must be 1D.") if fc_vals.shape != pvals.shape: - raise ValueError( - "fc_vals and pvals must have the same " - "length." - ) + raise ValueError("fc_vals and pvals must have the same " "length.") # -- Thresholds _validate_thresholds(fc_thresh, pval_thresh) # -- Axes if ax is not None and not isinstance(ax, Axes): - raise ValueError( - "ax must be a matplotlib Axes object." - ) + raise ValueError("ax must be a matplotlib Axes object.") # -- Label arguments n_points = fc_vals.shape[0] - if ( - top_labels is not None - and highlight_labels is not None - ): + if top_labels is not None and highlight_labels is not None: raise ValueError( - "top_labels and highlight_labels are " - "mutually exclusive." + "top_labels and highlight_labels are " "mutually exclusive." ) - if ( - labels is None - and (top_labels is not None - or highlight_labels is not None) + if labels is None and ( + top_labels is not None or highlight_labels is not None ): raise ValueError( "labels must be provided when " "top_labels or highlight_labels is set." ) if top_labels is not None: - if ( - not isinstance(top_labels, int) - or top_labels <= 0 - ): - raise ValueError( - "top_labels must be a positive integer." - ) + if not isinstance(top_labels, int) or top_labels <= 0: + raise ValueError("top_labels must be a positive integer.") if highlight_labels is not None: - if len(highlight_labels) != len( - set(highlight_labels) - ): + if len(highlight_labels) != len(set(highlight_labels)): raise ValueError( - "highlight_labels must not contain " - "duplicates." + "highlight_labels must not contain " "duplicates." ) if len(highlight_labels) == 0: warnings.warn( "highlight_labels is empty.", UserWarning, ) - if not np.issubdtype( - np.asarray(highlight_labels).dtype, np.str_ - ): - raise ValueError( - "highlight_labels must contain " - "string values." - ) + if not np.issubdtype(np.asarray(highlight_labels).dtype, np.str_): + raise ValueError("highlight_labels must contain " "string values.") if labels is not None: labels = np.asarray(labels, dtype=str) if not np.issubdtype(labels.dtype, np.str_): - raise ValueError( - "labels must contain string values." - ) + raise ValueError("labels must contain string values.") if labels.shape[0] != n_points: - raise ValueError( - "labels must have the same length as " - "fc_vals." - ) + raise ValueError("labels must have the same length as " "fc_vals.") # -- alt_color if alt_color is not None: alt_color = np.asarray(alt_color) if alt_color.ndim != 1: - raise ValueError( - "alt_color must be a 1D boolean " - "sequence." - ) + raise ValueError("alt_color must be a 1D boolean " "sequence.") if alt_color.shape[0] != n_points: raise ValueError( - "alt_color must have the same length " - "as fc_vals." - ) - if not np.issubdtype( - alt_color.dtype, np.bool_ - ): - raise ValueError( - "alt_color must be boolean." + "alt_color must have the same length " "as fc_vals." ) + if not np.issubdtype(alt_color.dtype, np.bool_): + raise ValueError("alt_color must be boolean.") return fc_vals, pvals, labels, alt_color @@ -404,8 +354,7 @@ def volcano_plot( save: str | Path | None = None, ax: Axes | None = None, ) -> Axes: - """ - Volcano plot renderer (framework-agnostic). + """Volcano plot renderer (framework-agnostic). Draws a scatter plot of fold change (x-axis) versus p-value (y-axis). Points are colored by significance or by an optional @@ -532,17 +481,22 @@ def volcano_plot( ... ) """ - fc_vals, pvals, labels, alt_color = ( - _validate_volcano_plot_inputs( - fc_vals, pvals, fc_thresh, pval_thresh, - labels, top_labels, highlight_labels, - alt_color, ax, - ) + fc_vals, pvals, labels, alt_color = _validate_volcano_plot_inputs( + fc_vals, + pvals, + fc_thresh, + pval_thresh, + labels, + top_labels, + highlight_labels, + alt_color, + ax, ) - fc_vals, pvals, labels, alt_color = ( - _filter_volcano_data( - fc_vals, pvals, labels, alt_color, - ) + fc_vals, pvals, labels, alt_color = _filter_volcano_data( + fc_vals, + pvals, + labels, + alt_color, ) # -- Prepare plotting arrays @@ -574,8 +528,13 @@ def volcano_plot( _ax = ax fig = _ax.get_figure() _draw_scatter( - _ax, fc_vals, y_vals, up_mask, down_mask, - other_mask, alt_color, + _ax, + fc_vals, + y_vals, + up_mask, + down_mask, + other_mask, + alt_color, ) _draw_threshold_lines(_ax, fc_thresh, pval_thresh, yscale_log) @@ -590,21 +549,25 @@ def volcano_plot( # -- Labels if top_labels is not None and labels is not None: - label_mask = ( - alt_color if alt_color is not None - else sig_mask - ) - label_fc_thresh = ( - None if alt_color is not None - else fc_thresh - ) + label_mask = alt_color if alt_color is not None else sig_mask + label_fc_thresh = None if alt_color is not None else fc_thresh _annotate_top_labels( - _ax, fc_vals, pvals, y_vals, labels, - top_labels, label_mask, label_fc_thresh, + _ax, + fc_vals, + pvals, + y_vals, + labels, + top_labels, + label_mask, + label_fc_thresh, ) if highlight_labels is not None and labels is not None: _annotate_highlight_labels( - _ax, fc_vals, y_vals, labels, highlight_labels, + _ax, + fc_vals, + y_vals, + labels, + highlight_labels, ) if save: diff --git a/proteopy/utils/string.py b/proteopy/utils/string.py index c173703..b7f1459 100644 --- a/proteopy/utils/string.py +++ b/proteopy/utils/string.py @@ -5,8 +5,7 @@ def sanitize_string(s: str) -> str: - """ - Sanitize a string for use as a column name or identifier. + """Sanitize a string for use as a column name or identifier. Replaces any character that is not alphanumeric or underscore with an underscore. @@ -63,4 +62,4 @@ def detect_separator_from_extension(file_path: str | Path) -> str: f"Cannot auto-detect separator for extension '{suffix}'. " "Supported extensions: .csv, .tsv. " "Please provide the `sep` parameter explicitly." - ) + ) diff --git a/pyproject.toml b/pyproject.toml index 269bbb0..65d941b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,8 +19,8 @@ classifiers = [ license = "Apache-2.0" license-files = ["LICEN[CS]E*"] authors = [ - {name = "Ian Dirk Fichtner", email = "iandirk.fichtner@med.uni-heidelberg.de"}, - {name = "Isabell Bludau", email = "Isabell.Bludau@med.uni-heidelberg.de"} + {name = "Ian Dirk Fichtner", email = "iandirk.fichtner@med.uni-heidelberg.de"}, + {name = "Isabell Bludau", email = "Isabell.Bludau@med.uni-heidelberg.de"}, ] keywords = [ "proteomics", @@ -46,18 +46,19 @@ dependencies = [ "scipy", "seaborn", "statsmodels", - ] +] [project.optional-dependencies] dev = [ + "black[jupyter]==24.10.0", "bump2version", - "flake8", + "flake8==7.1.1", "ipykernel", "jupyterlab", - "pre-commit", "myst_parser", "nbsphinx", - "pylint", + "pre-commit==4.5.1", + "pylint==3.3.4", "pytest", "sphinx-autobuild", "sphinx-autodoc-typehints", @@ -67,14 +68,14 @@ dev = [ "sphinx-rtd-theme", "sphinx>=7.0", "twine", - ] +] usage = [ "ipykernel", "jupyterlab", "python_igraph", "scanpy", - ] +] [tool.setuptools.packages.find] include = ["proteopy*"] @@ -84,3 +85,7 @@ Homepage = "https://github.com/UKHD-NP/proteopy" Issues = "https://github.com/UKHD-NP/proteopy/issues" Documentation = "https://proteopy.readthedocs.io/en/stable" Repository = "https://github.com/UKHD-NP/proteopy.git" + +[tool.black] +line-length = 79 +target-version = ["py310", "py311"] diff --git a/requirements/requirements_ci-cd.txt b/requirements/requirements_ci-cd.txt new file mode 100644 index 0000000..9431f94 --- /dev/null +++ b/requirements/requirements_ci-cd.txt @@ -0,0 +1,5 @@ +black[jupyter]==24.10.0 +flake8==7.1.1 +pre-commit==4.5.1 +pylint==3.3.4 +pytest==8.3.4 diff --git a/tests/datasets/test_karayel_2020.py b/tests/datasets/test_karayel_2020.py index b7ac407..90adaaa 100644 --- a/tests/datasets/test_karayel_2020.py +++ b/tests/datasets/test_karayel_2020.py @@ -1,4 +1,5 @@ """Tests for proteopy.datasets.karayel_2020.""" + import hashlib import anndata as ad @@ -13,28 +14,30 @@ _EXPECTED_SHAPE = (20, 7758) _EXPECTED_X_HASH = ( - "3f40838356b56b8f230bdb02bde8d16d" - "c574fcc41d106bdce02ebe666f4e02db" + "3f40838356b56b8f230bdb02bde8d16d" "c574fcc41d106bdce02ebe666f4e02db" ) _EXPECTED_OBS_NAMES_HASH = ( - "fef7fd91a6e93d20b719f61c63098865" - "bbd3f886dabfa46786cde09e520c0abe" + "fef7fd91a6e93d20b719f61c63098865" "bbd3f886dabfa46786cde09e520c0abe" ) _EXPECTED_VAR_NAMES_HASH = ( - "17d3bd09174bad3544738f30ed2867c4" - "bd431feccdf36515e5fae415110fc456" + "17d3bd09174bad3544738f30ed2867c4" "bd431feccdf36515e5fae415110fc456" ) _EXPECTED_OBS_COLUMNS = ["sample_id", "cell_type", "replicate"] _EXPECTED_VAR_COLUMNS = ["protein_id", "gene_id"] _EXPECTED_CELL_TYPES = [ - "LBaso", "Ortho", "Poly", "ProE&EBaso", "Progenitor", + "LBaso", + "Ortho", + "Poly", + "ProE&EBaso", + "Progenitor", ] _EXPECTED_REPLICATES = ["rep1", "rep2", "rep3", "rep4"] # -- Fixtures -------------------------------------------------------- + @pytest.fixture(scope="module") def adata(): """Load karayel_2020 dataset once for all tests.""" @@ -43,6 +46,7 @@ def adata(): # -- Helpers --------------------------------------------------------- + def _sha256(data: bytes) -> str: return hashlib.sha256(data).hexdigest() @@ -59,6 +63,7 @@ def _encode_index(index) -> bytes: # -- Content tests --------------------------------------------------- + class TestKarayel2020: """Verify structure and content of the karayel_2020 dataset.""" @@ -75,32 +80,20 @@ def test_var_columns(self, adata): assert adata.var.columns.tolist() == _EXPECTED_VAR_COLUMNS def test_cell_types(self, adata): - assert ( - sorted(adata.obs["cell_type"].unique()) - == _EXPECTED_CELL_TYPES - ) + assert sorted(adata.obs["cell_type"].unique()) == _EXPECTED_CELL_TYPES def test_replicates(self, adata): - assert ( - sorted(adata.obs["replicate"].unique()) - == _EXPECTED_REPLICATES - ) + assert sorted(adata.obs["replicate"].unique()) == _EXPECTED_REPLICATES def test_four_replicates_per_cell_type(self, adata): counts = adata.obs.groupby("cell_type").size() assert (counts == 4).all() def test_obs_names_match_sample_id(self, adata): - assert ( - list(adata.obs_names) - == list(adata.obs["sample_id"]) - ) + assert list(adata.obs_names) == list(adata.obs["sample_id"]) def test_var_names_match_protein_id(self, adata): - assert ( - list(adata.var_names) - == list(adata.var["protein_id"]) - ) + assert list(adata.var_names) == list(adata.var["protein_id"]) def test_x_dtype(self, adata): assert adata.X.dtype == np.float64 diff --git a/tests/datasets/test_williams_2018.py b/tests/datasets/test_williams_2018.py index b4d0ada..2a8796e 100644 --- a/tests/datasets/test_williams_2018.py +++ b/tests/datasets/test_williams_2018.py @@ -1,4 +1,5 @@ """Tests for proteopy.datasets.williams_2018.""" + import hashlib import anndata as ad @@ -13,28 +14,33 @@ _EXPECTED_SHAPE = (40, 32690) _EXPECTED_X_HASH = ( - "a2406828c5c11c28c566ac2bf9f694ac" - "eb90550ab37d91f085746b8b7fddf2c5" + "a2406828c5c11c28c566ac2bf9f694ac" "eb90550ab37d91f085746b8b7fddf2c5" ) _EXPECTED_OBS_NAMES_HASH = ( - "4a510a6124dd8b917c42f4270353aee2" - "0a11fd97d0bbd38200319af5f6b602ee" + "4a510a6124dd8b917c42f4270353aee2" "0a11fd97d0bbd38200319af5f6b602ee" ) _EXPECTED_VAR_NAMES_HASH = ( - "35bac1a175466852feb110553409be8c" - "f56c6564aaa73a75e4dc910b1cbb2d0e" + "35bac1a175466852feb110553409be8c" "f56c6564aaa73a75e4dc910b1cbb2d0e" ) _EXPECTED_OBS_COLUMNS = ["tissue", "mouse_id", "sample_id"] _EXPECTED_VAR_COLUMNS = ["protein_id", "gene_id", "peptide_id"] _EXPECTED_TISSUES = ["BAT", "Brain", "Heart", "Liver", "Quad"] _EXPECTED_MOUSE_IDS = [ - "101", "45", "66", "68", "73", "80", "C57", "DBA", + "101", + "45", + "66", + "68", + "73", + "80", + "C57", + "DBA", ] # -- Fixtures -------------------------------------------------------- + @pytest.fixture(scope="module") def adata(): """Load williams_2018 dataset once for all tests.""" @@ -43,6 +49,7 @@ def adata(): # -- Helpers --------------------------------------------------------- + def _sha256(data: bytes) -> str: return hashlib.sha256(data).hexdigest() @@ -59,6 +66,7 @@ def _encode_index(index) -> bytes: # -- Content tests --------------------------------------------------- + class TestWilliams2018: """Verify structure and content of the williams_2018 dataset.""" @@ -75,32 +83,20 @@ def test_var_columns(self, adata): assert adata.var.columns.tolist() == _EXPECTED_VAR_COLUMNS def test_tissues(self, adata): - assert ( - sorted(adata.obs["tissue"].unique()) - == _EXPECTED_TISSUES - ) + assert sorted(adata.obs["tissue"].unique()) == _EXPECTED_TISSUES def test_mouse_ids(self, adata): - assert ( - sorted(adata.obs["mouse_id"].unique()) - == _EXPECTED_MOUSE_IDS - ) + assert sorted(adata.obs["mouse_id"].unique()) == _EXPECTED_MOUSE_IDS def test_eight_mice_per_tissue(self, adata): counts = adata.obs.groupby("tissue").size() assert (counts == 8).all() def test_obs_names_match_sample_id(self, adata): - assert ( - list(adata.obs_names) - == list(adata.obs["sample_id"]) - ) + assert list(adata.obs_names) == list(adata.obs["sample_id"]) def test_var_names_match_peptide_id(self, adata): - assert ( - list(adata.var_names) - == list(adata.var["peptide_id"]) - ) + assert list(adata.var_names) == list(adata.var["peptide_id"]) def test_x_dtype(self, adata): assert adata.X.dtype == np.float64 diff --git a/tests/download/test_karayel_2020.py b/tests/download/test_karayel_2020.py index 12944cc..90eed02 100644 --- a/tests/download/test_karayel_2020.py +++ b/tests/download/test_karayel_2020.py @@ -1,4 +1,5 @@ """Tests for proteopy.download.karayel_2020.""" + import hashlib import pandas as pd @@ -10,32 +11,38 @@ # -- Expected values ------------------------------------------------- _EXPECTED_INTENSITIES_HASH = ( - "0a87e35cba89def63e8745776728d1d9" - "2510fb2ecee1a3cf7dc092881cf7c660" + "0a87e35cba89def63e8745776728d1d9" "2510fb2ecee1a3cf7dc092881cf7c660" ) _EXPECTED_VAR_HASH = ( - "1932d3b6568ef923fca9079a1fa1915c" - "ea00fca33c9094c1b4b9443584967e73" + "1932d3b6568ef923fca9079a1fa1915c" "ea00fca33c9094c1b4b9443584967e73" ) _EXPECTED_SAMPLE_HASH = ( - "996521c86b23958ec642d531a79c9c7f" - "28dc8676ad9cc261a7ec86bf1feaa012" + "996521c86b23958ec642d531a79c9c7f" "28dc8676ad9cc261a7ec86bf1feaa012" ) _EXPECTED_INTENSITIES_COLUMNS = [ - "sample_id", "protein_id", "intensity", + "sample_id", + "protein_id", + "intensity", ] _EXPECTED_VAR_COLUMNS = ["protein_id", "gene_id"] _EXPECTED_SAMPLE_COLUMNS = [ - "sample_id", "cell_type", "replicate", + "sample_id", + "cell_type", + "replicate", ] _EXPECTED_CELL_TYPES = [ - "LBaso", "Ortho", "Poly", "ProE&EBaso", "Progenitor", + "LBaso", + "Ortho", + "Poly", + "ProE&EBaso", + "Progenitor", ] # -- Helpers --------------------------------------------------------- + def _files(tmp_path, ext=".tsv"): return ( tmp_path / f"intensities{ext}", @@ -50,6 +57,7 @@ def _sha256(data: bytes) -> str: # -- Content tests --------------------------------------------------- + class TestKarayel2020Download: """Verify downloaded file content, structure, and error handling.""" @@ -66,10 +74,7 @@ def test_files_created(self, files): def test_intensities_columns(self, files): df = pd.read_csv(files[0], sep="\t", nrows=0) - assert ( - df.columns.tolist() - == _EXPECTED_INTENSITIES_COLUMNS - ) + assert df.columns.tolist() == _EXPECTED_INTENSITIES_COLUMNS def test_var_annotation_columns(self, files): df = pd.read_csv(files[1], sep="\t", nrows=0) @@ -77,28 +82,16 @@ def test_var_annotation_columns(self, files): def test_sample_annotation_columns(self, files): df = pd.read_csv(files[2], sep="\t", nrows=0) - assert ( - df.columns.tolist() - == _EXPECTED_SAMPLE_COLUMNS - ) + assert df.columns.tolist() == _EXPECTED_SAMPLE_COLUMNS def test_intensities_hash(self, files): - assert ( - _sha256(files[0].read_bytes()) - == _EXPECTED_INTENSITIES_HASH - ) + assert _sha256(files[0].read_bytes()) == _EXPECTED_INTENSITIES_HASH def test_var_annotation_hash(self, files): - assert ( - _sha256(files[1].read_bytes()) - == _EXPECTED_VAR_HASH - ) + assert _sha256(files[1].read_bytes()) == _EXPECTED_VAR_HASH def test_sample_annotation_hash(self, files): - assert ( - _sha256(files[2].read_bytes()) - == _EXPECTED_SAMPLE_HASH - ) + assert _sha256(files[2].read_bytes()) == _EXPECTED_SAMPLE_HASH def test_sample_count(self, files): df = pd.read_csv(files[2], sep="\t") @@ -106,28 +99,19 @@ def test_sample_count(self, files): def test_cell_types_in_file(self, files): df = pd.read_csv(files[2], sep="\t") - assert ( - sorted(df["cell_type"].unique()) - == _EXPECTED_CELL_TYPES - ) + assert sorted(df["cell_type"].unique()) == _EXPECTED_CELL_TYPES def test_csv_extension_uses_comma(self, tmp_path): p = _files(tmp_path, ext=".csv") karayel_2020(*p) df = pd.read_csv(p[0], sep=",", nrows=0) - assert ( - df.columns.tolist() - == _EXPECTED_INTENSITIES_COLUMNS - ) + assert df.columns.tolist() == _EXPECTED_INTENSITIES_COLUMNS def test_tsv_extension_uses_tab(self, tmp_path): p = _files(tmp_path, ext=".tsv") karayel_2020(*p) df = pd.read_csv(p[0], sep="\t", nrows=0) - assert ( - df.columns.tolist() - == _EXPECTED_INTENSITIES_COLUMNS - ) + assert df.columns.tolist() == _EXPECTED_INTENSITIES_COLUMNS def test_file_exists_error(self, tmp_path): p = _files(tmp_path) @@ -143,9 +127,7 @@ def test_force_overwrites(self, tmp_path): karayel_2020(*p, force=True) for path in p: assert path.read_bytes() != dummy - assert ( - _sha256(p[0].read_bytes()) == _EXPECTED_INTENSITIES_HASH - ) + assert _sha256(p[0].read_bytes()) == _EXPECTED_INTENSITIES_HASH assert _sha256(p[1].read_bytes()) == _EXPECTED_VAR_HASH assert _sha256(p[2].read_bytes()) == _EXPECTED_SAMPLE_HASH @@ -156,7 +138,8 @@ def test_overlapping_paths_raises(self, tmp_path): def test_invalid_path_type_raises(self, tmp_path): with pytest.raises( - TypeError, match="must be str or Path", + TypeError, + match="must be str or Path", ): karayel_2020( 123, @@ -167,21 +150,24 @@ def test_invalid_path_type_raises(self, tmp_path): def test_invalid_sep_type_raises(self, tmp_path): p = _files(tmp_path) with pytest.raises( - TypeError, match="sep must be str or None", + TypeError, + match="sep must be str or None", ): karayel_2020(*p, sep=123) def test_fill_na_bool_raises(self, tmp_path): p = _files(tmp_path) with pytest.raises( - TypeError, match="fill_na must be", + TypeError, + match="fill_na must be", ): karayel_2020(*p, fill_na=True) def test_force_non_bool_raises(self, tmp_path): p = _files(tmp_path) with pytest.raises( - TypeError, match="force must be bool", + TypeError, + match="force must be bool", ): karayel_2020(*p, force=1) diff --git a/tests/download/test_williams_2018.py b/tests/download/test_williams_2018.py index fb5f0ab..840937d 100644 --- a/tests/download/test_williams_2018.py +++ b/tests/download/test_williams_2018.py @@ -1,4 +1,5 @@ """Tests for proteopy.download.williams_2018.""" + import hashlib import pandas as pd @@ -10,34 +11,42 @@ # -- Expected values ------------------------------------------------- _EXPECTED_INTENSITIES_HASH = ( - "021410ece8505f9ef1181a4f1bbb5cde" - "c884011eba53a77e72cc6d6f51f1a531" + "021410ece8505f9ef1181a4f1bbb5cde" "c884011eba53a77e72cc6d6f51f1a531" ) _EXPECTED_VAR_HASH = ( - "827b32fd2962cd18a7a990d56eab0e64" - "daa2a244b6226fe2d242106f185b2161" + "827b32fd2962cd18a7a990d56eab0e64" "daa2a244b6226fe2d242106f185b2161" ) _EXPECTED_SAMPLE_HASH = ( - "8cca98fa3a38df78b78912f3ef7daed5" - "7f82902485d61d90db5a823c1ed4f031" + "8cca98fa3a38df78b78912f3ef7daed5" "7f82902485d61d90db5a823c1ed4f031" ) _EXPECTED_INTENSITIES_COLUMNS = [ - "sample_id", "peptide_id", "intensity", + "sample_id", + "peptide_id", + "intensity", ] _EXPECTED_VAR_COLUMNS = [ - "peptide_id", "protein_id", "gene_id", + "peptide_id", + "protein_id", + "gene_id", ] _EXPECTED_SAMPLE_COLUMNS = [ - "sample_id", "tissue", "mouse_id", + "sample_id", + "tissue", + "mouse_id", ] _EXPECTED_TISSUES = [ - "BAT", "Brain", "Heart", "Liver", "Quad", + "BAT", + "Brain", + "Heart", + "Liver", + "Quad", ] # -- Helpers --------------------------------------------------------- + def _files(tmp_path, ext=".tsv"): return ( tmp_path / f"intensities{ext}", @@ -52,6 +61,7 @@ def _sha256(data: bytes) -> str: # -- Content tests --------------------------------------------------- + class TestWilliams2018Download: """Verify downloaded file content, structure, and error handling.""" @@ -68,10 +78,7 @@ def test_files_created(self, files): def test_intensities_columns(self, files): df = pd.read_csv(files[0], sep="\t", nrows=0) - assert ( - df.columns.tolist() - == _EXPECTED_INTENSITIES_COLUMNS - ) + assert df.columns.tolist() == _EXPECTED_INTENSITIES_COLUMNS def test_var_annotation_columns(self, files): df = pd.read_csv(files[1], sep="\t", nrows=0) @@ -79,28 +86,16 @@ def test_var_annotation_columns(self, files): def test_sample_annotation_columns(self, files): df = pd.read_csv(files[2], sep="\t", nrows=0) - assert ( - df.columns.tolist() - == _EXPECTED_SAMPLE_COLUMNS - ) + assert df.columns.tolist() == _EXPECTED_SAMPLE_COLUMNS def test_intensities_hash(self, files): - assert ( - _sha256(files[0].read_bytes()) - == _EXPECTED_INTENSITIES_HASH - ) + assert _sha256(files[0].read_bytes()) == _EXPECTED_INTENSITIES_HASH def test_var_annotation_hash(self, files): - assert ( - _sha256(files[1].read_bytes()) - == _EXPECTED_VAR_HASH - ) + assert _sha256(files[1].read_bytes()) == _EXPECTED_VAR_HASH def test_sample_annotation_hash(self, files): - assert ( - _sha256(files[2].read_bytes()) - == _EXPECTED_SAMPLE_HASH - ) + assert _sha256(files[2].read_bytes()) == _EXPECTED_SAMPLE_HASH def test_sample_count(self, files): df = pd.read_csv(files[2], sep="\t") @@ -108,28 +103,19 @@ def test_sample_count(self, files): def test_tissues_in_file(self, files): df = pd.read_csv(files[2], sep="\t") - assert ( - sorted(df["tissue"].unique()) - == _EXPECTED_TISSUES - ) + assert sorted(df["tissue"].unique()) == _EXPECTED_TISSUES def test_csv_extension_uses_comma(self, tmp_path): p = _files(tmp_path, ext=".csv") williams_2018(*p) df = pd.read_csv(p[0], sep=",", nrows=0) - assert ( - df.columns.tolist() - == _EXPECTED_INTENSITIES_COLUMNS - ) + assert df.columns.tolist() == _EXPECTED_INTENSITIES_COLUMNS def test_tsv_extension_uses_tab(self, tmp_path): p = _files(tmp_path, ext=".tsv") williams_2018(*p) df = pd.read_csv(p[0], sep="\t", nrows=0) - assert ( - df.columns.tolist() - == _EXPECTED_INTENSITIES_COLUMNS - ) + assert df.columns.tolist() == _EXPECTED_INTENSITIES_COLUMNS def test_file_exists_error(self, tmp_path): p = _files(tmp_path) @@ -145,9 +131,7 @@ def test_force_overwrites(self, tmp_path): williams_2018(*p, force=True) for path in p: assert path.read_bytes() != dummy - assert ( - _sha256(p[0].read_bytes()) == _EXPECTED_INTENSITIES_HASH - ) + assert _sha256(p[0].read_bytes()) == _EXPECTED_INTENSITIES_HASH assert _sha256(p[1].read_bytes()) == _EXPECTED_VAR_HASH assert _sha256(p[2].read_bytes()) == _EXPECTED_SAMPLE_HASH @@ -158,7 +142,8 @@ def test_overlapping_paths_raises(self, tmp_path): def test_invalid_path_type_raises(self, tmp_path): with pytest.raises( - TypeError, match="must be str or Path", + TypeError, + match="must be str or Path", ): williams_2018( 123, @@ -169,21 +154,24 @@ def test_invalid_path_type_raises(self, tmp_path): def test_invalid_sep_type_raises(self, tmp_path): p = _files(tmp_path) with pytest.raises( - TypeError, match="sep must be str or None", + TypeError, + match="sep must be str or None", ): williams_2018(*p, sep=123) def test_fill_na_bool_raises(self, tmp_path): p = _files(tmp_path) with pytest.raises( - TypeError, match="fill_na must be", + TypeError, + match="fill_na must be", ): williams_2018(*p, fill_na=True) def test_force_non_bool_raises(self, tmp_path): p = _files(tmp_path) with pytest.raises( - TypeError, match="force must be bool", + TypeError, + match="force must be bool", ): williams_2018(*p, force=1) diff --git a/tests/pp/test_filtering.py b/tests/pp/test_filtering.py index c86b633..e7064b3 100644 --- a/tests/pp/test_filtering.py +++ b/tests/pp/test_filtering.py @@ -13,16 +13,17 @@ def _make_adata_filter_obs_base() -> AnnData: - """Six obs, five vars with increasing missingness; some zeros present.""" + """Six obs, five vars with increasing missingness; some zeros + present.""" n = np.nan X = np.array( [ - [1, 1, 2, 2, 3], # obs0: complete - [n, 1, 2, 2, 3], # obs1: 4/5 complete - [n, n, 2, 2, 3], # obs2: 3/5 complete - [n, n, n, 2, 3], # obs3: 2/5 complete - [0, 1, 2, 2, 3], # obs4: complete and a zero - [0, n, 2, 2, 3], # obs5: 4/5 complete and a zero + [1, 1, 2, 2, 3], # obs0: complete + [n, 1, 2, 2, 3], # obs1: 4/5 complete + [n, n, 2, 2, 3], # obs2: 3/5 complete + [n, n, n, 2, 3], # obs3: 2/5 complete + [0, 1, 2, 2, 3], # obs4: complete and a zero + [0, n, 2, 2, 3], # obs5: 4/5 complete and a zero ], dtype=float, ) @@ -34,13 +35,13 @@ def _make_adata_filter_obs_base() -> AnnData: def _make_adata_filter_obs_groupby_singletons() -> AnnData: - """Two vars, two groups""" + """Two vars, two groups.""" n = np.nan X = np.array( [ - [n, n], # obs0 - [1, n], # obs1 - [1, 1], # obs2 + [n, n], # obs0 + [1, n], # obs1 + [1, 1], # obs2 ], dtype=float, ) @@ -58,15 +59,15 @@ def _make_adata_filter_obs_groupby_singletons() -> AnnData: def _make_adata_filter_obs_groupby() -> AnnData: - """Five vars, two groups""" + """Five vars, two groups.""" n = np.nan X = np.array( [ - [1, 1, 2, 2, 3], # obs0: both groups complete - [1, n, 2, 2, 3], # obs1: group 0 -> 1/2 complete - [1, 1, 2, 2, n], # obs2: group 1 -> 2/3 incomplete - [1, n, 2, 2, n], # obs3: g0 1/2, g1 2/3 - [1, n, 2, n, n], # obs4: g0 1/2, g1 1/3 + [1, 1, 2, 2, 3], # obs0: both groups complete + [1, n, 2, 2, 3], # obs1: group 0 -> 1/2 complete + [1, 1, 2, 2, n], # obs2: group 1 -> 2/3 incomplete + [1, n, 2, 2, n], # obs3: g0 1/2, g1 2/3 + [1, n, 2, n, n], # obs4: g0 1/2, g1 1/3 ], dtype=float, ) @@ -84,10 +85,8 @@ def _make_adata_filter_obs_groupby() -> AnnData: def _make_adata_filter_obs_groupby_na() -> AnnData: - """ - Same as `_make_adata_filter_obs_groupby` but with an added NA group of - four vars - """ + """Same as `_make_adata_filter_obs_groupby` but with an added NA + group of four vars.""" n = np.nan X = np.array( [ @@ -106,8 +105,15 @@ def _make_adata_filter_obs_groupby_na() -> AnnData: { "protein_id": var_names, "group": [ - "g1", "g1", "g2", "g2", "g2", - np.nan, np.nan, np.nan, np.nan, + "g1", + "g1", + "g2", + "g2", + "g2", + np.nan, + np.nan, + np.nan, + np.nan, ], }, index=var_names, @@ -116,8 +122,8 @@ def _make_adata_filter_obs_groupby_na() -> AnnData: def _make_adata_filter_var_base() -> AnnData: - """Five obs, six vars with increasing missingness - across vars; some zeros.""" + """Five obs, six vars with increasing missingness across vars; some + zeros.""" n = np.nan X = np.array( [ @@ -137,7 +143,7 @@ def _make_adata_filter_var_base() -> AnnData: def _make_adata_filter_var_groupby_singletons() -> AnnData: - """Three vars, two groups""" + """Three vars, two groups.""" n = np.nan X = np.array( [ @@ -163,7 +169,8 @@ def _make_adata_filter_var_groupby_singletons() -> AnnData: def _make_adata_filter_var_groupby() -> AnnData: - """Five obs with obs groupings; vars differ in completeness per group.""" + """Five obs with obs groupings; vars differ in completeness per + group.""" n = np.nan X = np.array( [ @@ -189,10 +196,8 @@ def _make_adata_filter_var_groupby() -> AnnData: def _make_adata_filter_var_groupby_na() -> AnnData: - """ - Same as `_make_adata_filter_var_groupby` but with an added NA group of - four obs - """ + """Same as `_make_adata_filter_var_groupby` but with an added NA + group of four obs.""" n = np.nan X = np.array( [ @@ -213,8 +218,15 @@ def _make_adata_filter_var_groupby_na() -> AnnData: { "sample_id": obs_names, "group": [ - "g1", "g1", "g2", "g2", "g2", - np.nan, np.nan, np.nan, np.nan, + "g1", + "g1", + "g2", + "g2", + "g2", + np.nan, + np.nan, + np.nan, + np.nan, ], }, index=obs_names, @@ -224,18 +236,24 @@ def _make_adata_filter_var_groupby_na() -> AnnData: { "protein_id": var_names, "group": [ - "g1", "g1", "g2", "g2", np.nan, + "g1", + "g1", + "g2", + "g2", + np.nan, ], }, index=var_names, ) return AnnData(X=X, obs=obs, var=var) + # ── helpers: remove_zero_variance_vars ────────────────────────────── def _make_adata_rzv_base() -> AnnData: - """6 obs × 5 vars. p0-p2 vary, p3 constant, p4 near-constant (<1e-8). + """6 obs × 5 vars. p0-p2 vary, p3 constant, p4 near-constant + (<1e-8). Expected kept (atol=1e-8): [p0, p1, p2]. """ @@ -313,9 +331,7 @@ def _make_adata_rzv_all_vary() -> AnnData: Expected kept (atol=1e-8): [p0, p1, p2] (nothing removed). """ X = np.array( - [[1.0, 10.0, 100.0], - [2.0, 20.0, 200.0], - [3.0, 30.0, 300.0]], + [[1.0, 10.0, 100.0], [2.0, 20.0, 200.0], [3.0, 30.0, 300.0]], ) obs_names = ["s0", "s1", "s2"] var_names = ["p0", "p1", "p2"] @@ -414,8 +430,7 @@ def _make_adata_rzv_groupby() -> AnnData: ) obs_names = [f"s{i}" for i in range(5)] obs = pd.DataFrame( - {"sample_id": obs_names, - "group": ["g1", "g1", "g2", "g2", "g2"]}, + {"sample_id": obs_names, "group": ["g1", "g1", "g2", "g2", "g2"]}, index=obs_names, ) var_names = [f"p{i}" for i in range(5)] @@ -442,8 +457,7 @@ def _make_adata_rzv_groupby_singletons() -> AnnData: ) obs_names = [f"s{i}" for i in range(5)] obs = pd.DataFrame( - {"sample_id": obs_names, - "group": ["g1", "g2", "g3", "g4", "g5"]}, + {"sample_id": obs_names, "group": ["g1", "g2", "g3", "g4", "g5"]}, index=obs_names, ) var_names = ["p0", "p1", "p2"] @@ -470,8 +484,7 @@ def _make_adata_rzv_groupby_allnan_one_group() -> AnnData: ) obs_names = ["s0", "s1", "s2", "s3"] obs = pd.DataFrame( - {"sample_id": obs_names, - "group": ["A", "A", "B", "B"]}, + {"sample_id": obs_names, "group": ["A", "A", "B", "B"]}, index=obs_names, ) var_names = ["p0", "p1"] @@ -495,8 +508,7 @@ def _make_adata_rzv_groupby_single_group() -> AnnData: ) obs_names = ["s0", "s1", "s2", "s3"] obs = pd.DataFrame( - {"sample_id": obs_names, - "group": ["A", "A", "A", "A"]}, + {"sample_id": obs_names, "group": ["A", "A", "A", "A"]}, index=obs_names, ) var_names = ["p0", "p1", "p2"] @@ -534,7 +546,7 @@ def _make_adata_rzv_peptide_level() -> AnnData: def _make_adata_remove_contaminants_base() -> AnnData: - """5 obs, 5 vars""" + """5 obs, 5 vars.""" X = np.array( [ [1, 1, 1, 1, 1], @@ -548,14 +560,13 @@ def _make_adata_remove_contaminants_base() -> AnnData: obs_names = [f"obs{i}" for i in range(5)] obs = pd.DataFrame({"sample_id": obs_names}, index=obs_names) var_names = [f"protein_{i}" for i in range(5)] - var = pd.DataFrame({ - "protein_id": var_names - }, index=var_names) + var = pd.DataFrame({"protein_id": var_names}, index=var_names) return AnnData(X=X, obs=obs, var=var) def _make_adata_remove_contaminants_peptide_level() -> AnnData: - """4 obs × 5 vars, peptide-level data for contaminant filtering tests.""" + """4 obs × 5 vars, peptide-level data for contaminant filtering + tests.""" X = np.array( [ [10, 20, 30, 40, 50], @@ -918,8 +929,11 @@ def test_filter_axis_var_min_fraction_and_min_count(): cases = { (0.4, 3): [ - "protein_0", "protein_1", "protein_2", - "protein_4", "protein_5", + "protein_0", + "protein_1", + "protein_2", + "protein_4", + "protein_5", ], (1.0, 5): ["protein_0", "protein_4"], (0.0, 0): list(adata.var_names), @@ -968,7 +982,9 @@ def test_filter_axis_var_zero_to_na(): ) assert returned is None assert list(adata_inplace.var_names) == [ - "protein_0", "protein_1", "protein_4", + "protein_0", + "protein_1", + "protein_4", ] @@ -1410,7 +1426,9 @@ def test_atol_boundary_equal_variance_removed(self): ], ) obs = pd.DataFrame({"sample_id": ["s0", "s1"]}, index=["s0", "s1"]) - var = pd.DataFrame({"protein_id": ["p0", "p1", "p2"]}, index=["p0", "p1", "p2"]) + var = pd.DataFrame( + {"protein_id": ["p0", "p1", "p2"]}, index=["p0", "p1", "p2"] + ) adata = AnnData(X=X, obs=obs, var=var) filtered = remove_zero_variance_vars(adata, atol=1.0, inplace=False) @@ -1423,7 +1441,9 @@ def test_atol_zero_keeps_tiny_variance(self): # p1: var≈3.3e-17 (> atol) → kept # p2: var≈0.667 (> atol) → kept filtered = remove_zero_variance_vars( - adata, atol=0.0, inplace=False, + adata, + atol=0.0, + inplace=False, ) assert filtered is not None assert list(filtered.var_names) == ["p1", "p2"] @@ -1432,7 +1452,9 @@ def test_large_atol_removes_everything(self): adata = _make_adata_rzv_all_vary() # all vars have variance < 1e10 → all removed filtered = remove_zero_variance_vars( - adata, atol=1e10, inplace=False, + adata, + atol=1e10, + inplace=False, ) assert filtered is not None assert list(filtered.var_names) == [] @@ -1441,7 +1463,8 @@ def test_large_atol_removes_everything(self): def test_negative_atol_raises(self): adata = _make_adata_rzv_base() with pytest.raises( - ValueError, match=r"`atol` must be non-negative.", + ValueError, + match=r"`atol` must be non-negative.", ): remove_zero_variance_vars(adata, atol=-2) @@ -1451,7 +1474,9 @@ def test_negative_atol_raises(self): def test_groupby_removes_zero_in_any_group(self, inplace): adata = _make_adata_rzv_groupby() result = remove_zero_variance_vars( - adata, group_by="group", inplace=inplace, + adata, + group_by="group", + inplace=inplace, ) target = adata if inplace else result if inplace: @@ -1469,7 +1494,9 @@ def test_groupby_singleton_groups_removes_all(self, inplace): match=r"at least one group", ): result = remove_zero_variance_vars( - adata, group_by="group", inplace=inplace, + adata, + group_by="group", + inplace=inplace, ) target = adata if inplace else result if inplace: @@ -1482,21 +1509,28 @@ def test_groupby_singleton_groups_removes_all(self, inplace): def test_groupby_all_nan_in_one_group_warns(self): adata = _make_adata_rzv_groupby_allnan_one_group() with pytest.warns( - UserWarning, match=r"at least one group", + UserWarning, + match=r"at least one group", ): filtered = remove_zero_variance_vars( - adata, group_by="group", inplace=False, + adata, + group_by="group", + inplace=False, ) assert list(filtered.var_names) == ["p1"] def test_groupby_single_group_matches_global(self): adata = _make_adata_rzv_groupby_single_group() filtered_grouped = remove_zero_variance_vars( - adata, group_by="group", inplace=False, + adata, + group_by="group", + inplace=False, ) adata2 = _make_adata_rzv_groupby_single_group() filtered_global = remove_zero_variance_vars( - adata2, group_by=None, inplace=False, + adata2, + group_by=None, + inplace=False, ) assert ( list(filtered_grouped.var_names) @@ -1508,7 +1542,9 @@ def test_groupby_categorical_column(self): adata = _make_adata_rzv_groupby() adata.obs["group"] = pd.Categorical(adata.obs["group"]) filtered = remove_zero_variance_vars( - adata, group_by="group", inplace=False, + adata, + group_by="group", + inplace=False, ) assert list(filtered.var_names) == ["p0"] @@ -1534,7 +1570,8 @@ def test_groupby_missing_column_raises(self): @pytest.mark.parametrize("bad_adata", ["not-anndata", 42, None]) def test_invalid_adata_type(self, bad_adata): with pytest.raises( - TypeError, match=r"`adata` must be an AnnData object", + TypeError, + match=r"`adata` must be an AnnData object", ): remove_zero_variance_vars(adata=bad_adata) @@ -1542,7 +1579,8 @@ def test_invalid_adata_type(self, bad_adata): def test_invalid_group_by_type(self, bad_group_by): adata = _make_adata_rzv_base() with pytest.raises( - TypeError, match=r"`group_by` must be a string or None", + TypeError, + match=r"`group_by` must be a string or None", ): remove_zero_variance_vars(adata, group_by=bad_group_by) @@ -1550,7 +1588,8 @@ def test_invalid_group_by_type(self, bad_group_by): def test_invalid_atol_type(self, bad_atol): adata = _make_adata_rzv_base() with pytest.raises( - TypeError, match=r"`atol` must be a numeric value", + TypeError, + match=r"`atol` must be a numeric value", ): remove_zero_variance_vars(adata, atol=bad_atol) @@ -1558,7 +1597,8 @@ def test_invalid_atol_type(self, bad_atol): def test_invalid_inplace_type(self, bad_inplace): adata = _make_adata_rzv_base() with pytest.raises( - TypeError, match=r"`inplace` must be a bool", + TypeError, + match=r"`inplace` must be a bool", ): remove_zero_variance_vars(adata, inplace=bad_inplace) @@ -1566,7 +1606,8 @@ def test_invalid_inplace_type(self, bad_inplace): def test_invalid_verbose_type(self, bad_verbose): adata = _make_adata_rzv_base() with pytest.raises( - TypeError, match=r"`verbose` must be a bool", + TypeError, + match=r"`verbose` must be a bool", ): remove_zero_variance_vars(adata, verbose=bad_verbose) @@ -1575,7 +1616,9 @@ def test_invalid_verbose_type(self, bad_verbose): def test_verbose_reports_correct_counts(self, capsys): adata = _make_adata_rzv_base() remove_zero_variance_vars( - adata, inplace=True, verbose=True, + adata, + inplace=True, + verbose=True, ) captured = capsys.readouterr() assert "5 variables present" in captured.out @@ -1585,7 +1628,9 @@ def test_verbose_reports_correct_counts(self, capsys): def test_verbose_false_prints_nothing(self, capsys): adata = _make_adata_rzv_base() remove_zero_variance_vars( - adata, inplace=True, verbose=False, + adata, + inplace=True, + verbose=False, ) captured = capsys.readouterr() assert captured.out == "" @@ -1615,17 +1660,15 @@ def test_idempotency(self): second = remove_zero_variance_vars(first, inplace=False) assert list(first.var_names) == list(second.var_names) np.testing.assert_array_equal( - np.asarray(first.X), np.asarray(second.X), + np.asarray(first.X), + np.asarray(second.X), ) def test_kept_var_values_unchanged(self): adata = _make_adata_rzv_base() original_X = adata.X.copy() filtered = remove_zero_variance_vars(adata, inplace=False) - kept_idx = [ - list(adata.var_names).index(v) - for v in filtered.var_names - ] + kept_idx = [list(adata.var_names).index(v) for v in filtered.var_names] np.testing.assert_array_equal( np.asarray(filtered.X), original_X[:, kept_idx], @@ -1638,7 +1681,8 @@ def test_peptide_level_data_basic(self, inplace): adata = _make_adata_rzv_peptide_level() with pytest.warns(UserWarning, match=r"1 variable\(s\)"): result = remove_zero_variance_vars( - adata, inplace=inplace, + adata, + inplace=inplace, ) target = adata if inplace else result if inplace: @@ -1658,7 +1702,9 @@ def test_peptide_level_data_with_groupby(self): # pep3: all-NaN in both groups → removed (warning) with pytest.warns(UserWarning, match=r"at least one group"): filtered = remove_zero_variance_vars( - adata, group_by="group", inplace=False, + adata, + group_by="group", + inplace=False, ) assert filtered is not None assert list(filtered.var_names) == ["pep0", "pep2"] @@ -1669,12 +1715,7 @@ def test_peptide_level_data_with_groupby(self): class TestRemoveContaminants: @pytest.fixture def fasta(self, tmp_path): - fasta_content = ( - ">sp|protein_1\n" - "AAAA\n" - ">sp|protein_2\n" - "CCCC\n" - ) + fasta_content = ">sp|protein_1\n" "AAAA\n" ">sp|protein_2\n" "CCCC\n" fasta_path = tmp_path / "test.fasta" fasta_path.write_text(fasta_content) return fasta_path @@ -1683,9 +1724,7 @@ def fasta(self, tmp_path): def csv_file(self, tmp_path): csv_path = tmp_path / "contaminants.csv" csv_path.write_text( - "contaminant,source\n" - "protein_2,db\n" - "protein_4,db\n", + "contaminant,source\n" "protein_2,db\n" "protein_4,db\n", ) return csv_path @@ -1693,9 +1732,7 @@ def csv_file(self, tmp_path): def tsv_file(self, tmp_path): tsv_path = tmp_path / "contaminants.tsv" tsv_path.write_text( - "contaminant\tcomment\n" - "protein_0\ta\n" - "protein_3\tb\n", + "contaminant\tcomment\n" "protein_0\ta\n" "protein_3\tb\n", ) return tsv_path @@ -1714,7 +1751,9 @@ def test_fasta_filters_expected_proteins(self, fasta, inplace): target = adata if inplace else result assert list(target.var_names) == [ - "protein_0", "protein_3", "protein_4", + "protein_0", + "protein_3", + "protein_4", ] assert target.n_obs == 5 @@ -1727,8 +1766,7 @@ def test_fasta_filters_expected_proteins(self, fasta, inplace): def test_no_matching_contaminants_keeps_all_variables(self, tmp_path): fasta_path = tmp_path / "none_match.fasta" fasta_path.write_text( - ">sp|not_present_a\nAAAA\n" - ">sp|not_present_b\nCCCC\n", + ">sp|not_present_a\nAAAA\n" ">sp|not_present_b\nCCCC\n", ) adata = _make_adata_remove_contaminants_base() @@ -1749,7 +1787,9 @@ def test_csv_filters_using_first_column(self, csv_file): inplace=False, ) assert list(filtered.var_names) == [ - "protein_0", "protein_1", "protein_3", + "protein_0", + "protein_1", + "protein_3", ] def test_tsv_filters_using_first_column(self, tsv_file): @@ -1760,7 +1800,9 @@ def test_tsv_filters_using_first_column(self, tsv_file): inplace=False, ) assert list(filtered.var_names) == [ - "protein_1", "protein_2", "protein_4", + "protein_1", + "protein_2", + "protein_4", ] def test_custom_protein_key_column(self, tmp_path): @@ -1769,12 +1811,15 @@ def test_custom_protein_key_column(self, tmp_path): # in a different order to confirm filtering uses protein_key, # not var_names or var.index adata.var["uniprot_id"] = [ - "Q99714", "P12345", "P67890", "O75822", "Q9Y6K9", + "Q99714", + "P12345", + "P67890", + "O75822", + "Q9Y6K9", ] fasta_path = tmp_path / "custom_key.fasta" fasta_path.write_text( - ">sp|P12345\nAAAA\n" - ">sp|P67890\nCCCC\n", + ">sp|P12345\nAAAA\n" ">sp|P67890\nCCCC\n", ) filtered = remove_contaminants( @@ -1784,14 +1829,15 @@ def test_custom_protein_key_column(self, tmp_path): inplace=False, ) assert list(filtered.var_names) == [ - "protein_0", "protein_3", "protein_4", + "protein_0", + "protein_3", + "protein_4", ] def test_custom_header_parser_is_used(self, tmp_path): fasta_path = tmp_path / "custom_header.fasta" fasta_path.write_text( - ">contam__protein_0\nAAAA\n" - ">contam__protein_4\nCCCC\n", + ">contam__protein_0\nAAAA\n" ">contam__protein_4\nCCCC\n", ) adata = _make_adata_remove_contaminants_base() @@ -1802,14 +1848,15 @@ def test_custom_header_parser_is_used(self, tmp_path): inplace=False, ) assert list(filtered.var_names) == [ - "protein_1", "protein_2", "protein_3", + "protein_1", + "protein_2", + "protein_3", ] def test_header_parser_empty_id_warns_and_skips(self, tmp_path): fasta_path = tmp_path / "empty_id.fasta" fasta_path.write_text( - ">sp|protein_1\nAAAA\n" - ">sp|protein_2\nCCCC\n", + ">sp|protein_1\nAAAA\n" ">sp|protein_2\nCCCC\n", ) adata = _make_adata_remove_contaminants_base() diff --git a/tests/pp/test_quantification.py b/tests/pp/test_quantification.py index 942b00f..8c518c8 100644 --- a/tests/pp/test_quantification.py +++ b/tests/pp/test_quantification.py @@ -12,6 +12,7 @@ # Helper constructors # ------------------------------------------------------------------ + def _make_peptide_adata( X=None, peptide_ids=None, @@ -19,8 +20,7 @@ def _make_peptide_adata( obs_names=None, extra_var_cols=None, ): - """ - Build a minimal valid peptide-level AnnData. + """Build a minimal valid peptide-level AnnData. Defaults produce two observations and four peptides belonging to two stripped groups: @@ -47,8 +47,7 @@ def _make_peptide_adata( obs_names = ["s1", "s2"] if X is None: X = np.array( - [[1, 2, 3, 4], - [5, 6, 7, 8]], + [[1, 2, 3, 4], [5, 6, 7, 8]], dtype=float, ) @@ -68,9 +67,9 @@ def _make_peptide_adata( def _make_single_group_adata(): - """ - Three peptidoforms that all strip to 'SEQA': - SEQA + """Three peptidoforms that all strip to 'SEQA': + + SEQA SEQA (Ox) SEQA (Ph) Intensities: [[10, 20, 30]] @@ -111,19 +110,20 @@ class TestSummarizeModifications: # -------------------------------------------------------------- def test_basic_grouping_strips_modifications(self): - """Peptides with modification annotations are grouped - by their stripped (bare) sequence.""" + """Peptides with modification annotations are grouped by their + stripped (bare) sequence.""" adata = _make_peptide_adata() result = summarize_modifications(adata, inplace=False) assert list(result.var_names) == [ - "PEPTIDEA", "PEPTIDEB", + "PEPTIDEA", + "PEPTIDEB", ] assert result.shape == (2, 2) def test_unmodified_peptide_passes_through(self): - """A peptide without modifications remains unchanged - after grouping.""" + """A peptide without modifications remains unchanged after + grouping.""" pids = ["SOLO"] X = np.array([[42.0]]) adata = _make_peptide_adata( @@ -133,7 +133,8 @@ def test_unmodified_peptide_passes_through(self): obs_names=["s1"], ) result = summarize_modifications( - adata, inplace=False, + adata, + inplace=False, ) assert list(result.var_names) == ["SOLO"] @@ -144,8 +145,7 @@ def test_unmodified_peptide_passes_through(self): # -------------------------------------------------------------- def test_aggregation_methods(self): - """All four methods produce correct aggregated - values.""" + """All four methods produce correct aggregated values.""" adata = _make_peptide_adata() expected = { "sum": np.array( @@ -163,19 +163,18 @@ def test_aggregation_methods(self): } for method, exp in expected.items(): result = summarize_modifications( - adata, method=method, inplace=False, + adata, + method=method, + inplace=False, ) np.testing.assert_allclose( - result.X, exp, - err_msg=( - f"method='{method}' produced " - f"wrong values" - ), + result.X, + exp, + err_msg=(f"method='{method}' produced " f"wrong values"), ) def test_aggregation_methods_single_group(self): - """Methods applied to a single group of three - peptidoforms.""" + """Methods applied to a single group of three peptidoforms.""" adata = _make_single_group_adata() expected = { "sum": np.array([[60.0]]), @@ -185,13 +184,14 @@ def test_aggregation_methods_single_group(self): } for method, exp in expected.items(): result = summarize_modifications( - adata, method=method, inplace=False, + adata, + method=method, + inplace=False, ) np.testing.assert_allclose( - result.X, exp, - err_msg=( - f"method='{method}' on single group" - ), + result.X, + exp, + err_msg=(f"method='{method}' on single group"), ) # -------------------------------------------------------------- @@ -199,28 +199,31 @@ def test_aggregation_methods_single_group(self): # -------------------------------------------------------------- def test_inplace_true_modifies_original(self): - """inplace=True modifies the original AnnData and - returns None.""" + """Inplace=True modifies the original AnnData and returns + None.""" adata = _make_peptide_adata() returned = summarize_modifications( - adata, inplace=True, + adata, + inplace=True, ) assert returned is None assert adata.shape == (2, 2) assert list(adata.var_names) == [ - "PEPTIDEA", "PEPTIDEB", + "PEPTIDEA", + "PEPTIDEB", ] def test_inplace_false_returns_copy(self): - """inplace=False returns a new AnnData; original is + """Inplace=False returns a new AnnData; original is unchanged.""" adata = _make_peptide_adata() original_var_names = list(adata.var_names) original_shape = adata.shape result = summarize_modifications( - adata, inplace=False, + adata, + inplace=False, ) assert result is not adata @@ -233,44 +236,48 @@ def test_inplace_false_returns_copy(self): # -------------------------------------------------------------- def test_skip_na_true_ignores_nan(self): - """skip_na=True aggregates over non-NaN values - only.""" + """skip_na=True aggregates over non-NaN values only.""" n = np.nan X = np.array([[1.0, n, 3.0, 4.0]]) adata = _make_peptide_adata(X=X, obs_names=["s1"]) result = summarize_modifications( - adata, method="sum", - skip_na=True, inplace=False, + adata, + method="sum", + skip_na=True, + inplace=False, ) np.testing.assert_allclose( - result.X, [[1.0, 7.0]], + result.X, + [[1.0, 7.0]], ) def test_skip_na_false_propagates_nan(self): - """skip_na=False produces NaN when any group member - is NaN.""" + """skip_na=False produces NaN when any group member is NaN.""" n = np.nan X = np.array([[1.0, n, 3.0, 4.0]]) adata = _make_peptide_adata(X=X, obs_names=["s1"]) result = summarize_modifications( - adata, method="sum", - skip_na=False, inplace=False, + adata, + method="sum", + skip_na=False, + inplace=False, ) assert np.isnan(result.X[0, 0]) np.testing.assert_allclose(result.X[0, 1], 7.0) def test_skip_na_false_with_mean(self): - """skip_na=False propagates NaN for mean aggregation - too.""" + """skip_na=False propagates NaN for mean aggregation too.""" n = np.nan X = np.array([[10.0, n, 3.0, 4.0]]) adata = _make_peptide_adata(X=X, obs_names=["s1"]) result = summarize_modifications( - adata, method="mean", - skip_na=False, inplace=False, + adata, + method="mean", + skip_na=False, + inplace=False, ) assert np.isnan(result.X[0, 0]) np.testing.assert_allclose(result.X[0, 1], 3.5) @@ -280,11 +287,12 @@ def test_skip_na_false_with_mean(self): # -------------------------------------------------------------- def test_sort_true_alphabetical_order(self): - """sort=True orders output variables - alphabetically.""" + """Sort=True orders output variables alphabetically.""" pids = [ - "ZZZ", "ZZZ (Ox)", - "AAA", "AAA (Ph)", + "ZZZ", + "ZZZ (Ox)", + "AAA", + "AAA (Ph)", ] X = np.array([[1.0, 2.0, 3.0, 4.0]]) adata = _make_peptide_adata( @@ -294,18 +302,21 @@ def test_sort_true_alphabetical_order(self): obs_names=["s1"], ) result = summarize_modifications( - adata, sort=True, inplace=False, + adata, + sort=True, + inplace=False, ) assert list(result.var_names) == ["AAA", "ZZZ"] def test_sort_false_preserves_first_appearance_order( self, ): - """sort=False preserves the order of first - appearance.""" + """Sort=False preserves the order of first appearance.""" pids = [ - "ZZZ", "ZZZ (Ox)", - "AAA", "AAA (Ph)", + "ZZZ", + "ZZZ (Ox)", + "AAA", + "AAA (Ph)", ] X = np.array([[1.0, 2.0, 3.0, 4.0]]) adata = _make_peptide_adata( @@ -315,7 +326,9 @@ def test_sort_false_preserves_first_appearance_order( obs_names=["s1"], ) result = summarize_modifications( - adata, sort=False, inplace=False, + adata, + sort=False, + inplace=False, ) assert list(result.var_names) == ["ZZZ", "AAA"] @@ -326,54 +339,58 @@ def test_sort_false_preserves_first_appearance_order( def test_keep_var_cols_none_has_default_columns_only( self, ): - """With keep_var_cols=None, output .var has only the - mandatory columns.""" + """With keep_var_cols=None, output .var has only the mandatory + columns.""" adata = _make_peptide_adata( extra_var_cols={ "gene": ["G1", "G1", "G2", "G2"], }, ) result = summarize_modifications( - adata, keep_var_cols=None, inplace=False, + adata, + keep_var_cols=None, + inplace=False, ) expected_cols = { - "peptide_id", "protein_id", - "n_peptidoforms", "n_modifications", + "peptide_id", + "protein_id", + "n_peptidoforms", + "n_modifications", } assert set(result.var.columns) == expected_cols def test_keep_var_cols_carries_over_extra_columns(self): - """Specifying keep_var_cols includes those columns in - the output .var.""" + """Specifying keep_var_cols includes those columns in the output + .var.""" adata = _make_peptide_adata( extra_var_cols={ "gene": ["G1", "G1", "G2", "G2"], }, ) result = summarize_modifications( - adata, keep_var_cols=["gene"], inplace=False, + adata, + keep_var_cols=["gene"], + inplace=False, ) assert "gene" in result.var.columns assert result.var.loc["PEPTIDEA", "gene"] == "G1" assert result.var.loc["PEPTIDEB", "gene"] == "G2" def test_keep_var_cols_joins_differing_values(self): - """When group members have different values for a - kept column, they are joined with ';'.""" + """When group members have different values for a kept column, + they are joined with ';'.""" adata = _make_peptide_adata( extra_var_cols={ "source": ["db1", "db2", "db1", "db1"], }, ) result = summarize_modifications( - adata, keep_var_cols=["source"], inplace=False, - ) - assert ( - result.var.loc["PEPTIDEA", "source"] == "db1;db2" - ) - assert ( - result.var.loc["PEPTIDEB", "source"] == "db1" + adata, + keep_var_cols=["source"], + inplace=False, ) + assert result.var.loc["PEPTIDEA", "source"] == "db1;db2" + assert result.var.loc["PEPTIDEB", "source"] == "db1" def test_keep_var_cols_with_nan_values(self): """NaN entries in a kept column are dropped before @@ -384,21 +401,25 @@ def test_keep_var_cols_with_nan_values(self): }, ) result = summarize_modifications( - adata, keep_var_cols=["note"], inplace=False, + adata, + keep_var_cols=["note"], + inplace=False, ) assert result.var.loc["PEPTIDEA", "note"] == "x" assert result.var.loc["PEPTIDEB", "note"] == "y" def test_keep_var_cols_all_nan_produces_nan(self): - """When all values in a kept column are NaN for a - group, the result is NaN.""" + """When all values in a kept column are NaN for a group, the + result is NaN.""" adata = _make_peptide_adata( extra_var_cols={ "note": [np.nan, np.nan, "y", np.nan], }, ) result = summarize_modifications( - adata, keep_var_cols=["note"], inplace=False, + adata, + keep_var_cols=["note"], + inplace=False, ) assert pd.isna( result.var.loc["PEPTIDEA", "note"], @@ -412,29 +433,34 @@ def test_keep_var_cols_all_nan_produces_nan(self): def test_zero_to_na_converts_zeros_before_aggregation( self, ): - """Zeros are replaced with NaN before aggregation - when zero_to_na=True.""" + """Zeros are replaced with NaN before aggregation when + zero_to_na=True.""" X = np.array([[0.0, 2.0, 3.0, 4.0]]) adata = _make_peptide_adata(X=X, obs_names=["s1"]) result = summarize_modifications( - adata, method="sum", - zero_to_na=True, skip_na=True, + adata, + method="sum", + zero_to_na=True, + skip_na=True, inplace=False, ) np.testing.assert_allclose( - result.X, [[2.0, 7.0]], + result.X, + [[2.0, 7.0]], ) def test_zero_to_na_with_skip_na_false(self): - """zero_to_na=True combined with skip_na=False - propagates NaN from zeros.""" + """zero_to_na=True combined with skip_na=False propagates NaN + from zeros.""" X = np.array([[0.0, 2.0, 3.0, 4.0]]) adata = _make_peptide_adata(X=X, obs_names=["s1"]) result = summarize_modifications( - adata, method="sum", - zero_to_na=True, skip_na=False, + adata, + method="sum", + zero_to_na=True, + skip_na=False, inplace=False, ) assert np.isnan(result.X[0, 0]) @@ -451,11 +477,14 @@ def test_fill_na_replaces_nan_before_aggregation(self): adata = _make_peptide_adata(X=X, obs_names=["s1"]) result = summarize_modifications( - adata, method="sum", - fill_na=0.0, inplace=False, + adata, + method="sum", + fill_na=0.0, + inplace=False, ) np.testing.assert_allclose( - result.X, [[2.0, 7.0]], + result.X, + [[2.0, 7.0]], ) def test_fill_na_with_nonzero_value(self): @@ -465,11 +494,14 @@ def test_fill_na_with_nonzero_value(self): adata = _make_peptide_adata(X=X, obs_names=["s1"]) result = summarize_modifications( - adata, method="sum", - fill_na=100.0, inplace=False, + adata, + method="sum", + fill_na=100.0, + inplace=False, ) np.testing.assert_allclose( - result.X[0, 0], 200.0, + result.X[0, 0], + 200.0, ) # -------------------------------------------------------------- @@ -477,18 +509,19 @@ def test_fill_na_with_nonzero_value(self): # -------------------------------------------------------------- def test_layer_uses_specified_layer(self): - """When layer is specified, data comes from that - layer instead of .X.""" + """When layer is specified, data comes from that layer instead + of .X.""" adata = _make_peptide_adata() layer_data = np.array( - [[10.0, 20.0, 30.0, 40.0], - [50.0, 60.0, 70.0, 80.0]], + [[10.0, 20.0, 30.0, 40.0], [50.0, 60.0, 70.0, 80.0]], ) adata.layers["raw"] = layer_data result = summarize_modifications( - adata, layer="raw", - method="sum", inplace=False, + adata, + layer="raw", + method="sum", + inplace=False, ) np.testing.assert_allclose( result.X, @@ -500,11 +533,12 @@ def test_layer_uses_specified_layer(self): # -------------------------------------------------------------- def test_verbose_prints_message(self, capsys): - """verbose=True prints a status message to - stdout.""" + """Verbose=True prints a status message to stdout.""" adata = _make_peptide_adata() summarize_modifications( - adata, verbose=True, inplace=False, + adata, + verbose=True, + inplace=False, ) captured = capsys.readouterr().out assert "Stripping modifications" in captured @@ -512,10 +546,12 @@ def test_verbose_prints_message(self, capsys): assert "2 unique stripped sequences" in captured def test_verbose_false_prints_nothing(self, capsys): - """verbose=False produces no stdout output.""" + """Verbose=False produces no stdout output.""" adata = _make_peptide_adata() summarize_modifications( - adata, verbose=False, inplace=False, + adata, + verbose=False, + inplace=False, ) captured = capsys.readouterr().out assert captured == "" @@ -531,28 +567,32 @@ def test_sparse_input_produces_sparse_output(self): assert sparse.issparse(adata.X) result = summarize_modifications( - adata, method="sum", inplace=False, + adata, + method="sum", + inplace=False, ) assert sparse.issparse(result.X) def test_sparse_output_has_correct_values(self): - """Sparse output matches the expected dense - values.""" + """Sparse output matches the expected dense values.""" adata = _make_peptide_adata() adata.X = sparse.csr_matrix(adata.X) result = summarize_modifications( - adata, method="sum", inplace=False, + adata, + method="sum", + inplace=False, ) expected = np.array( [[3.0, 7.0], [11.0, 15.0]], ) np.testing.assert_allclose( - result.X.toarray(), expected, + result.X.toarray(), + expected, ) def test_sparse_inplace_preserves_sparsity(self): - """inplace=True on sparse input keeps .X sparse.""" + """Inplace=True on sparse input keeps .X sparse.""" adata = _make_peptide_adata() adata.X = sparse.csr_matrix(adata.X) @@ -565,7 +605,9 @@ def test_dense_input_stays_dense(self): assert not sparse.issparse(adata.X) result = summarize_modifications( - adata, method="sum", inplace=False, + adata, + method="sum", + inplace=False, ) assert not sparse.issparse(result.X) @@ -574,25 +616,21 @@ def test_dense_input_stays_dense(self): # -------------------------------------------------------------- def test_n_peptidoforms_counts_variants(self): - """n_peptidoforms reflects total variants per - stripped sequence group.""" + """n_peptidoforms reflects total variants per stripped sequence + group.""" adata = _make_peptide_adata() result = summarize_modifications( - adata, inplace=False, + adata, + inplace=False, ) - val_a = result.var.loc[ - "PEPTIDEA", "n_peptidoforms" - ] + val_a = result.var.loc["PEPTIDEA", "n_peptidoforms"] assert val_a == 2 - val_b = result.var.loc[ - "PEPTIDEB", "n_peptidoforms" - ] + val_b = result.var.loc["PEPTIDEB", "n_peptidoforms"] assert val_b == 2 def test_n_peptidoforms_single_variant(self): - """An unmodified peptide alone counts as 1 - peptidoform.""" + """An unmodified peptide alone counts as 1 peptidoform.""" pids = ["SOLO"] X = np.array([[1.0]]) adata = _make_peptide_adata( @@ -602,22 +640,19 @@ def test_n_peptidoforms_single_variant(self): obs_names=["s1"], ) result = summarize_modifications( - adata, inplace=False, - ) - assert ( - result.var.loc["SOLO", "n_peptidoforms"] == 1 + adata, + inplace=False, ) + assert result.var.loc["SOLO", "n_peptidoforms"] == 1 def test_n_peptidoforms_three_variants(self): - """Three peptidoforms sharing the same stripped - sequence.""" + """Three peptidoforms sharing the same stripped sequence.""" adata = _make_single_group_adata() result = summarize_modifications( - adata, inplace=False, - ) - assert ( - result.var.loc["SEQA", "n_peptidoforms"] == 3 + adata, + inplace=False, ) + assert result.var.loc["SEQA", "n_peptidoforms"] == 3 # -------------------------------------------------------------- # n_modifications (position-aware counting) @@ -628,23 +663,19 @@ def test_n_modifications_basic(self): modification.""" adata = _make_peptide_adata() result = summarize_modifications( - adata, inplace=False, + adata, + inplace=False, ) - val_a = result.var.loc[ - "PEPTIDEA", "n_modifications" - ] + val_a = result.var.loc["PEPTIDEA", "n_modifications"] assert val_a == 1 - val_b = result.var.loc[ - "PEPTIDEB", "n_modifications" - ] + val_b = result.var.loc["PEPTIDEB", "n_modifications"] assert val_b == 1 def test_n_modifications_unmodified_contributes_zero( self, ): - """A group with only unmodified peptides has 0 - modifications.""" + """A group with only unmodified peptides has 0 modifications.""" pids = ["BARE"] X = np.array([[5.0]]) adata = _make_peptide_adata( @@ -654,15 +685,14 @@ def test_n_modifications_unmodified_contributes_zero( obs_names=["s1"], ) result = summarize_modifications( - adata, inplace=False, - ) - assert ( - result.var.loc["BARE", "n_modifications"] == 0 + adata, + inplace=False, ) + assert result.var.loc["BARE", "n_modifications"] == 0 def test_n_modifications_same_mod_same_pos_dedup(self): - """Same modification at same position across - peptidoforms counts only once.""" + """Same modification at same position across peptidoforms counts + only once.""" pids = [ "ABC (Ox)DEF", "ABC (Ox)DEF (Ph)", @@ -675,16 +705,15 @@ def test_n_modifications_same_mod_same_pos_dedup(self): obs_names=["s1"], ) result = summarize_modifications( - adata, inplace=False, + adata, + inplace=False, ) - val = result.var.loc[ - "ABCDEF", "n_modifications" - ] + val = result.var.loc["ABCDEF", "n_modifications"] assert val == 2 def test_n_modifications_same_text_different_pos(self): - """Same modification text at different positions - counts as distinct modifications.""" + """Same modification text at different positions counts as + distinct modifications.""" pids = [ "A (Ox)BCD", "ABCD (Ox)", @@ -697,17 +726,16 @@ def test_n_modifications_same_text_different_pos(self): obs_names=["s1"], ) result = summarize_modifications( - adata, inplace=False, - ) - assert ( - result.var.loc["ABCD", "n_modifications"] == 2 + adata, + inplace=False, ) + assert result.var.loc["ABCD", "n_modifications"] == 2 def test_n_modifications_multiple_mods_in_one_peptide( self, ): - """A single peptide with multiple modifications - contributes all of them.""" + """A single peptide with multiple modifications contributes all + of them.""" pids = ["A (Ox)B (Ph)C"] X = np.array([[1.0]]) adata = _make_peptide_adata( @@ -717,19 +745,17 @@ def test_n_modifications_multiple_mods_in_one_peptide( obs_names=["s1"], ) result = summarize_modifications( - adata, inplace=False, - ) - assert ( - result.var.loc["ABC", "n_modifications"] == 2 + adata, + inplace=False, ) + assert result.var.loc["ABC", "n_modifications"] == 2 # -------------------------------------------------------------- # Custom mod_regex # -------------------------------------------------------------- def test_custom_mod_regex(self): - """A custom regex strips different annotation - formats.""" + """A custom regex strips different annotation formats.""" pids = [ "PEP[Ox]TIDE", "PEP[Ph]TIDE", @@ -743,27 +769,27 @@ def test_custom_mod_regex(self): obs_names=["s1"], ) result = summarize_modifications( - adata, mod_regex=r"\[.*?\]", - method="sum", inplace=False, + adata, + mod_regex=r"\[.*?\]", + method="sum", + inplace=False, ) assert list(result.var_names) == ["PEPTIDE"] np.testing.assert_allclose(result.X, [[6.0]]) - val_pf = result.var.loc[ - "PEPTIDE", "n_peptidoforms" - ] + val_pf = result.var.loc["PEPTIDE", "n_peptidoforms"] assert val_pf == 3 - val_nm = result.var.loc[ - "PEPTIDE", "n_modifications" - ] + val_nm = result.var.loc["PEPTIDE", "n_modifications"] assert val_nm == 2 def test_custom_mod_regex_no_matches(self): - """When the regex matches nothing, peptides pass - through unchanged.""" + """When the regex matches nothing, peptides pass through + unchanged.""" adata = _make_peptide_adata() result = summarize_modifications( - adata, mod_regex=r"\[NOMATCH\]", - method="sum", inplace=False, + adata, + mod_regex=r"\[NOMATCH\]", + method="sum", + inplace=False, ) assert result.shape[1] == 4 @@ -775,34 +801,36 @@ def test_error_protein_level_data(self): """Protein-level AnnData raises ValueError.""" adata = _make_protein_level_adata() with pytest.raises( - ValueError, match="peptide-level", + ValueError, + match="peptide-level", ): summarize_modifications(adata) def test_error_invalid_method(self): - """An unsupported method string raises - ValueError.""" + """An unsupported method string raises ValueError.""" adata = _make_peptide_adata() with pytest.raises( - ValueError, match="method must be one of", + ValueError, + match="method must be one of", ): summarize_modifications(adata, method="min") def test_error_both_zero_to_na_and_fill_na(self): - """Setting both zero_to_na and fill_na raises - ValueError.""" + """Setting both zero_to_na and fill_na raises ValueError.""" adata = _make_peptide_adata() with pytest.raises( ValueError, match="Cannot set both zero_to_na and fill_na", ): summarize_modifications( - adata, zero_to_na=True, fill_na=0.0, + adata, + zero_to_na=True, + fill_na=0.0, ) def test_error_conflicting_protein_ids(self): - """Peptides that strip to the same sequence but map - to different protein_ids raise ValueError.""" + """Peptides that strip to the same sequence but map to different + protein_ids raise ValueError.""" pids = ["SHARED", "SHARED (Ox)"] X = np.array([[1.0, 2.0]]) adata = _make_peptide_adata( @@ -812,57 +840,64 @@ def test_error_conflicting_protein_ids(self): obs_names=["s1"], ) with pytest.raises( - ValueError, match="multiple protein_ids", + ValueError, + match="multiple protein_ids", ): summarize_modifications( - adata, inplace=False, + adata, + inplace=False, ) def test_error_keep_var_cols_missing_column(self): - """keep_var_cols with a column not in adata.var - raises KeyError.""" + """keep_var_cols with a column not in adata.var raises + KeyError.""" adata = _make_peptide_adata() with pytest.raises( - KeyError, match="not found in adata.var", + KeyError, + match="not found in adata.var", ): summarize_modifications( - adata, keep_var_cols=["nonexistent"], + adata, + keep_var_cols=["nonexistent"], ) def test_error_keep_var_cols_multiple_missing(self): - """All missing keep_var_cols entries are - reported.""" + """All missing keep_var_cols entries are reported.""" adata = _make_peptide_adata() with pytest.raises( - KeyError, match="nonexistent", + KeyError, + match="nonexistent", ): summarize_modifications( adata, keep_var_cols=[ - "nonexistent", "also_bad", + "nonexistent", + "also_bad", ], ) def test_error_keep_var_cols_reserved_peptide_id(self): - """keep_var_cols containing 'peptide_id' raises - ValueError.""" + """keep_var_cols containing 'peptide_id' raises ValueError.""" adata = _make_peptide_adata() with pytest.raises( - ValueError, match="reserved columns", + ValueError, + match="reserved columns", ): summarize_modifications( - adata, keep_var_cols=["peptide_id"], + adata, + keep_var_cols=["peptide_id"], ) def test_error_keep_var_cols_reserved_protein_id(self): - """keep_var_cols containing 'protein_id' raises - ValueError.""" + """keep_var_cols containing 'protein_id' raises ValueError.""" adata = _make_peptide_adata() with pytest.raises( - ValueError, match="reserved columns", + ValueError, + match="reserved columns", ): summarize_modifications( - adata, keep_var_cols=["protein_id"], + adata, + keep_var_cols=["protein_id"], ) def test_error_keep_var_cols_reserved_n_peptidoforms( @@ -876,7 +911,8 @@ def test_error_keep_var_cols_reserved_n_peptidoforms( }, ) with pytest.raises( - ValueError, match="reserved columns", + ValueError, + match="reserved columns", ): summarize_modifications( adata, @@ -894,7 +930,8 @@ def test_error_keep_var_cols_reserved_n_modifications( }, ) with pytest.raises( - ValueError, match="reserved columns", + ValueError, + match="reserved columns", ): summarize_modifications( adata, @@ -902,42 +939,45 @@ def test_error_keep_var_cols_reserved_n_modifications( ) def test_error_invalid_mod_regex(self): - """A malformed regex raises ValueError with a - descriptive message.""" + """A malformed regex raises ValueError with a descriptive + message.""" adata = _make_peptide_adata() with pytest.raises( - ValueError, match="Invalid mod_regex", + ValueError, + match="Invalid mod_regex", ): summarize_modifications( - adata, mod_regex=r"(unclosed", + adata, + mod_regex=r"(unclosed", ) def test_error_layer_with_infinite_values(self): - """A layer containing infinite values is rejected - upfront.""" + """A layer containing infinite values is rejected upfront.""" adata = _make_peptide_adata() layer_data = np.array( - [[1.0, np.inf, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0]], + [[1.0, np.inf, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]], ) adata.layers["bad"] = layer_data with pytest.raises( - ValueError, match="infinite", + ValueError, + match="infinite", ): summarize_modifications( - adata, layer="bad", + adata, + layer="bad", ) def test_var_column_named_stripped_not_clobbered(self): - """A user .var column named '_stripped' is preserved - when included via keep_var_cols.""" + """A user .var column named '_stripped' is preserved when + included via keep_var_cols.""" adata = _make_peptide_adata( extra_var_cols={ "_stripped": ["a", "b", "c", "d"], }, ) result = summarize_modifications( - adata, keep_var_cols=["_stripped"], + adata, + keep_var_cols=["_stripped"], inplace=False, ) assert "_stripped" in result.var.columns @@ -962,16 +1002,17 @@ def test_output_passes_check_proteodata_copy(self): check_proteodata.""" adata = _make_peptide_adata() result = summarize_modifications( - adata, inplace=False, + adata, + inplace=False, ) check_proteodata(result) def test_output_peptide_id_matches_var_names(self): - """Output .var['peptide_id'] matches .var_names - exactly.""" + """Output .var['peptide_id'] matches .var_names exactly.""" adata = _make_peptide_adata() result = summarize_modifications( - adata, inplace=False, + adata, + inplace=False, ) np.testing.assert_array_equal( @@ -980,11 +1021,12 @@ def test_output_peptide_id_matches_var_names(self): ) def test_output_protein_id_is_single_mapped(self): - """Each output peptide maps to exactly one - protein_id (no multi-mapping).""" + """Each output peptide maps to exactly one protein_id (no multi- + mapping).""" adata = _make_peptide_adata() result = summarize_modifications( - adata, inplace=False, + adata, + inplace=False, ) for pid in result.var["protein_id"]: @@ -996,82 +1038,90 @@ def test_output_protein_id_is_single_mapped(self): # -------------------------------------------------------------- def test_all_nan_matrix_sum(self): - """An all-NaN matrix produces all-NaN output with - sum.""" + """An all-NaN matrix produces all-NaN output with sum.""" n = np.nan X = np.array([[n, n, n, n]]) adata = _make_peptide_adata( - X=X, obs_names=["s1"], + X=X, + obs_names=["s1"], ) result = summarize_modifications( - adata, method="sum", inplace=False, + adata, + method="sum", + inplace=False, ) assert np.all(np.isnan(result.X)) def test_all_nan_matrix_mean(self): - """An all-NaN matrix produces all-NaN output with - mean.""" + """An all-NaN matrix produces all-NaN output with mean.""" n = np.nan X = np.array([[n, n, n, n]]) adata = _make_peptide_adata( - X=X, obs_names=["s1"], + X=X, + obs_names=["s1"], ) result = summarize_modifications( - adata, method="mean", inplace=False, + adata, + method="mean", + inplace=False, ) assert np.all(np.isnan(result.X)) def test_all_zero_matrix_with_zero_to_na(self): - """An all-zero matrix with zero_to_na=True produces - all-NaN output for sum.""" + """An all-zero matrix with zero_to_na=True produces all-NaN + output for sum.""" X = np.zeros((1, 4)) adata = _make_peptide_adata( - X=X, obs_names=["s1"], + X=X, + obs_names=["s1"], ) result = summarize_modifications( - adata, method="sum", - zero_to_na=True, inplace=False, + adata, + method="sum", + zero_to_na=True, + inplace=False, ) assert np.all(np.isnan(result.X)) def test_multiple_observations_independent(self): - """Each observation is aggregated - independently.""" + """Each observation is aggregated independently.""" X = np.array( - [[1.0, 2.0, 3.0, 4.0], - [10.0, 20.0, 30.0, 40.0], - [100.0, 200.0, 300.0, 400.0]], + [ + [1.0, 2.0, 3.0, 4.0], + [10.0, 20.0, 30.0, 40.0], + [100.0, 200.0, 300.0, 400.0], + ], ) adata = _make_peptide_adata( X=X, obs_names=["s1", "s2", "s3"], ) result = summarize_modifications( - adata, method="sum", inplace=False, + adata, + method="sum", + inplace=False, ) expected = np.array( - [[3.0, 7.0], - [30.0, 70.0], - [300.0, 700.0]], + [[3.0, 7.0], [30.0, 70.0], [300.0, 700.0]], ) np.testing.assert_allclose(result.X, expected) def test_mixed_nan_across_observations(self): - """NaN patterns can differ across - observations.""" + """NaN patterns can differ across observations.""" n = np.nan X = np.array( - [[1.0, n, 3.0, 4.0], - [n, 6.0, 7.0, 8.0]], + [[1.0, n, 3.0, 4.0], [n, 6.0, 7.0, 8.0]], ) adata = _make_peptide_adata(X=X) result = summarize_modifications( - adata, method="sum", - skip_na=True, inplace=False, + adata, + method="sum", + skip_na=True, + inplace=False, ) np.testing.assert_allclose( result.X, @@ -1083,14 +1133,15 @@ def test_mixed_nan_skip_na_false_per_observation(self): independently.""" n = np.nan X = np.array( - [[1.0, n, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0]], + [[1.0, n, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]], ) adata = _make_peptide_adata(X=X) result = summarize_modifications( - adata, method="sum", - skip_na=False, inplace=False, + adata, + method="sum", + skip_na=False, + inplace=False, ) assert np.isnan(result.X[0, 0]) np.testing.assert_allclose(result.X[0, 1], 7.0) @@ -1098,8 +1149,8 @@ def test_mixed_nan_skip_na_false_per_observation(self): np.testing.assert_allclose(result.X[1, 1], 15.0) def test_sparse_with_nan_values(self): - """Sparse input with stored NaN values aggregates - correctly and the output remains sparse.""" + """Sparse input with stored NaN values aggregates correctly and + the output remains sparse.""" n = np.nan X_dense = np.array([[1.0, n, 3.0, 4.0]]) adata = _make_peptide_adata( @@ -1108,10 +1159,13 @@ def test_sparse_with_nan_values(self): ) result = summarize_modifications( - adata, method="sum", - skip_na=True, inplace=False, + adata, + method="sum", + skip_na=True, + inplace=False, ) assert sparse.issparse(result.X) np.testing.assert_allclose( - result.X.toarray(), [[1.0, 7.0]], + result.X.toarray(), + [[1.0, 7.0]], ) diff --git a/tests/read/test_long.py b/tests/read/test_long.py index 4427ed8..6b0c9aa 100644 --- a/tests/read/test_long.py +++ b/tests/read/test_long.py @@ -15,6 +15,7 @@ # Helper constructors # ------------------------------------------------------------------ + def _make_peptide_intensities( sample_ids=None, peptide_ids=None, @@ -72,15 +73,18 @@ def _make_protein_intensities( protein_ids = ["PROT1", "PROT2", "PROT1", "PROT2"] if intensities is None: intensities = [1.0, 2.0, 3.0, 4.0] - return pd.DataFrame({ - "sample_id": sample_ids, - "protein_id": protein_ids, - "intensity": intensities, - }) + return pd.DataFrame( + { + "sample_id": sample_ids, + "protein_id": protein_ids, + "intensity": intensities, + } + ) def _make_sample_annotation(sample_ids, extra=None): - """Build a sample annotation DataFrame with an optional extra col.""" + """Build a sample annotation DataFrame with an optional extra + col.""" data = {"sample_id": list(sample_ids)} if extra: data.update(extra) @@ -131,6 +135,7 @@ def _as_intensities(df, fmt, tmp_path): # Tests # ------------------------------------------------------------------ + class TestLong: """Comprehensive tests for :func:`proteopy.read.long`.""" @@ -207,7 +212,9 @@ def test_duplicate_sample_peptide_rows_raise(self): @pytest.mark.parametrize("fmt", ["dataframe", "csv", "tsv"]) def test_peptide_minimal_with_protein_in_intensities( - self, fmt, tmp_path, + self, + fmt, + tmp_path, ): """Basic case: intensities carry ``protein_id`` directly. @@ -270,8 +277,12 @@ def test_peptide_with_multiple_protein_mappings(self): sample_ids=["s1", "s1", "s1", "s2", "s2", "s2"], peptide_ids=["PEP1", "PEP2", "PEP3"] * 2, protein_ids=[ - "PROT1", "PROT2", "PROT2", - "PROT1", "PROT2", "PROT2", + "PROT1", + "PROT2", + "PROT2", + "PROT1", + "PROT2", + "PROT2", ], intensities=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], ) @@ -280,7 +291,9 @@ def test_peptide_with_multiple_protein_mappings(self): assert adata.shape == (2, 3) assert list(adata.var_names) == ["PEP1", "PEP2", "PEP3"] assert list(adata.var["protein_id"]) == [ - "PROT1", "PROT2", "PROT2", + "PROT1", + "PROT2", + "PROT2", ] np.testing.assert_allclose( adata.X, @@ -329,7 +342,9 @@ def test_sample_annotation_columns_merged(self): extra={"group": ["A", "B"]}, ) adata = long( - df, level="peptide", sample_annotation=sample_ann, + df, + level="peptide", + sample_annotation=sample_ann, ) assert "group" in adata.obs.columns @@ -350,13 +365,17 @@ def test_peptide_annotation_columns_merged(self): def test_duplicate_sample_annotation_warns_and_dedupes(self): """Duplicate annotation rows warn and keep the first.""" df = _make_peptide_intensities() - sample_ann = pd.DataFrame({ - "sample_id": ["s1", "s1", "s2"], - "group": ["FIRST", "SECOND", "OTHER"], - }) + sample_ann = pd.DataFrame( + { + "sample_id": ["s1", "s1", "s2"], + "group": ["FIRST", "SECOND", "OTHER"], + } + ) with pytest.warns(UserWarning, match="Duplicate sample"): adata = long( - df, level="peptide", sample_annotation=sample_ann, + df, + level="peptide", + sample_annotation=sample_ann, ) # First occurrence kept. @@ -366,10 +385,12 @@ def test_duplicate_sample_annotation_warns_and_dedupes(self): def test_duplicate_peptide_annotation_warns_and_dedupes(self): """Duplicate peptide annotation rows warn and keep first.""" df = _make_peptide_intensities() - var_ann = pd.DataFrame({ - "peptide_id": ["PEP1", "PEP1", "PEP2"], - "charge": [2, 3, 4], - }) + var_ann = pd.DataFrame( + { + "peptide_id": ["PEP1", "PEP1", "PEP2"], + "charge": [2, 3, 4], + } + ) with pytest.warns(UserWarning, match="Duplicate peptide"): adata = long(df, level="peptide", var_annotation=var_ann) @@ -380,12 +401,14 @@ def test_duplicate_peptide_annotation_warns_and_dedupes(self): def test_column_map_remaps_peptide_level(self): """Non-standard peptide columns are canonicalized.""" - df = pd.DataFrame({ - "run": ["s1", "s1", "s2", "s2"], - "seq": ["PEP1", "PEP2", "PEP1", "PEP2"], - "prot": ["PROT1", "PROT1", "PROT1", "PROT1"], - "quant": [1.0, 2.0, 3.0, 4.0], - }) + df = pd.DataFrame( + { + "run": ["s1", "s1", "s2", "s2"], + "seq": ["PEP1", "PEP2", "PEP1", "PEP2"], + "prot": ["PROT1", "PROT1", "PROT1", "PROT1"], + "quant": [1.0, 2.0, 3.0, 4.0], + } + ) adata = long( df, level="peptide", @@ -408,11 +431,13 @@ def test_column_map_remaps_peptide_level(self): def test_column_map_remaps_protein_level(self): """Non-standard protein columns are canonicalized.""" - df = pd.DataFrame({ - "run": ["s1", "s1", "s2", "s2"], - "prot": ["PROT1", "PROT2", "PROT1", "PROT2"], - "quant": [1.0, 2.0, 3.0, 4.0], - }) + df = pd.DataFrame( + { + "run": ["s1", "s1", "s2", "s2"], + "prot": ["PROT1", "PROT2", "PROT1", "PROT2"], + "quant": [1.0, 2.0, 3.0, 4.0], + } + ) adata = long( df, level="protein", @@ -434,7 +459,8 @@ def test_column_map_remaps_protein_level(self): # -- Missing-value handling --------------------------------------- def test_missing_pair_is_nan_peptide_level(self): - """Missing (sample, peptide) pairs become ``np.nan`` in ``.X``.""" + """Missing (sample, peptide) pairs become ``np.nan`` in + ``.X``.""" df = _make_peptide_intensities( sample_ids=["s1", "s1", "s2"], peptide_ids=["PEP1", "PEP2", "PEP1"], @@ -550,7 +576,8 @@ def test_zero_to_na_converts_zeros(self): expected = np.array([[np.nan, 2.0], [3.0, 4.0]]) # Compare NaN-aware. np.testing.assert_array_equal( - np.isnan(adata.X), np.isnan(expected), + np.isnan(adata.X), + np.isnan(expected), ) np.testing.assert_allclose( adata.X[~np.isnan(adata.X)], diff --git a/tests/tl/test_copro.py b/tests/tl/test_copro.py index 8d80969..e695e7f 100644 --- a/tests/tl/test_copro.py +++ b/tests/tl/test_copro.py @@ -13,7 +13,7 @@ peptide_dendograms_by_correlation_, peptide_clusters_from_dendograms_, proteoform_scores_, - ) +) from proteopy.utils.data_structures import ListDict @@ -22,23 +22,23 @@ remap_dendogram_leaf_order, reconstruct_corrs_df_symmetric_from_long_df, check_dendogram_equality, - ) +) TEST_DIR = Path(__file__).parent.parent DATA_DIR = TEST_DIR / "data" NOISE = 1e6 + def compare_clusters_dsVlist(ds, ref): - ''' - Check if peptide cluster annotations defined in a pd.DataSeries are the - same as a refrence list of clusters annotations. + """Check if peptide cluster annotations defined in a pd.DataSeries + are the same as a refrence list of clusters annotations. Args: ds (pd.DataSeries): values are the categorical annotations (clusters) and the indices represent the (peptide) labels. ref (list): reference cluster annotations. - ''' + """ groups = ds.groupby(ds).groups clusters_ds = [v.tolist() for _, v in groups.items()] clusters_ds = [tuple(sorted(c)) for c in clusters_ds] @@ -46,136 +46,144 @@ def compare_clusters_dsVlist(ds, ref): ref = [tuple(sorted(c)) for c in ref] ref_log = set(ref) - counter=0 + counter = 0 for c in clusters_ds: - counter+=1 + counter += 1 assert c in ref ref_log.remove(c) assert len(ref_log) == 0 + @pytest.fixture def traces_preproc(): - ''' - Get COPF mouse tissue pre-processed traces df. - ''' - traces_path = DATA_DIR / 'mouse_tissue/traces_pre-processed_rcopf.tsv' - traces = pd.read_csv(traces_path, sep='\t', header=0) - traces = traces.rename(columns={'id': 'peptide_id'}) + """Get COPF mouse tissue pre-processed traces df.""" + traces_path = DATA_DIR / "mouse_tissue/traces_pre-processed_rcopf.tsv" + traces = pd.read_csv(traces_path, sep="\t", header=0) + traces = traces.rename(columns={"id": "peptide_id"}) return traces + @pytest.fixture def traces_preproc_anns(): - ''' - Get COPF mouse tissue pre-processed traces annotations df. - ''' + """Get COPF mouse tissue pre-processed traces annotations df.""" anns_path = ( - DATA_DIR / 'mouse_tissue/traces_pre-processed_trace-annotations_rcopf.tsv' - ) - anns = pd.read_csv(anns_path, sep='\t', header=0) - anns = anns.rename(columns={'id': 'peptide_id'}) + DATA_DIR + / "mouse_tissue/traces_pre-processed_trace-annotations_rcopf.tsv" + ) + anns = pd.read_csv(anns_path, sep="\t", header=0) + anns = anns.rename(columns={"id": "peptide_id"}) return anns @pytest.fixture def traces_preproc_ext(traces_preproc, traces_preproc_anns): - ''' - Extend pre-processed traces with annotations. - ''' - anns_select = traces_preproc_anns[['peptide_id', 'protein_id']] - traces_ext = traces_preproc.merge(anns_select, on='peptide_id') - traces_ext = pd.melt(traces_ext, id_vars=('protein_id', 'peptide_id')) - traces_ext = traces_ext.rename(columns={'value': 'intensity', 'variable': 'sample'}) + """Extend pre-processed traces with annotations.""" + anns_select = traces_preproc_anns[["peptide_id", "protein_id"]] + traces_ext = traces_preproc.merge(anns_select, on="peptide_id") + traces_ext = pd.melt(traces_ext, id_vars=("protein_id", "peptide_id")) + traces_ext = traces_ext.rename( + columns={"value": "intensity", "variable": "sample"} + ) return traces_ext @pytest.fixture def fraction_annotation(): - frac_ann_path = DATA_DIR / \ - 'mouse_tissue/fraction_annotation.tsv' - frac_ann = pd.read_csv(frac_ann_path, sep='\t', header=0) - frac_ann = frac_ann.copy(deep=True).set_index('filename') + frac_ann_path = DATA_DIR / "mouse_tissue/fraction_annotation.tsv" + frac_ann = pd.read_csv(frac_ann_path, sep="\t", header=0) + frac_ann = frac_ann.copy(deep=True).set_index("filename") return frac_ann @pytest.fixture def traces_corrs(): - ''' - Get COPF mouse tissue correlations df. - ''' - traces_corrs_path = DATA_DIR / 'mouse_tissue/traces_correlations_rcopf.tsv' - col_names = ['pepA', 'pepB', 'PCC', 'protein_id'] - df = pd.read_csv(traces_corrs_path, sep='\t', names=col_names) + """Get COPF mouse tissue correlations df.""" + traces_corrs_path = DATA_DIR / "mouse_tissue/traces_correlations_rcopf.tsv" + col_names = ["pepA", "pepB", "PCC", "protein_id"] + df = pd.read_csv(traces_corrs_path, sep="\t", names=col_names) return df @pytest.fixture def traces_corrs_ref(traces_corrs): - ''' - Filter COPF mouse tissue correlations df for - unique (non-symmetrical) correlation values. - ''' - corrs_ref = traces_corrs.set_index('protein_id') - corrs_ref = corrs_ref[corrs_ref['PCC'] != 1] + """Filter COPF mouse tissue correlations df for unique (non- + symmetrical) correlation values.""" + corrs_ref = traces_corrs.set_index("protein_id") + corrs_ref = corrs_ref[corrs_ref["PCC"] != 1] - sort_peps_ab = lambda row: tuple(sorted([row['pepA'], row['pepB']])) + def sort_peps_ab(row): + return tuple(sorted([row["pepA"], row["pepB"]])) - corrs_ref['sorted_pair'] = corrs_ref.apply(sort_peps_ab, axis=1) - corrs_ref = corrs_ref.drop_duplicates(subset=['sorted_pair']) - corrs_ref = corrs_ref.drop(columns=['sorted_pair']) - corrs_ref = corrs_ref.sort_values(['pepA', 'pepB']).sort_index() + corrs_ref["sorted_pair"] = corrs_ref.apply(sort_peps_ab, axis=1) + corrs_ref = corrs_ref.drop_duplicates(subset=["sorted_pair"]) + corrs_ref = corrs_ref.drop(columns=["sorted_pair"]) + corrs_ref = corrs_ref.sort_values(["pepA", "pepB"]).sort_index() return corrs_ref -def test_pairwise_peptide_correlations_vs_rcopf(traces_preproc_ext, traces_corrs_ref): - ''' - Test pairwise_peptide_correlations() application for equality to rCOPF correlations df. +def test_pairwise_peptide_correlations_vs_rcopf( + traces_preproc_ext, traces_corrs_ref +): + """Test pairwise_peptide_correlations() application for equality to + rCOPF correlations df. + Uses COPF mouse tissue dataset as reference results. - ''' + """ # Apply pairwise_peptide_correlations on the entire mouse tissue df - pep_corrs = lambda x: pairwise_peptide_correlations_(x, - sample_column='sample', - peptide_column='peptide_id', - value_column='intensity') - + def pep_corrs(x): + return pairwise_peptide_correlations_( + x, + sample_column="sample", + peptide_column="peptide_id", + value_column="intensity", + ) - corrs = traces_preproc_ext.groupby('protein_id').apply(pep_corrs, include_groups=False) + corrs = traces_preproc_ext.groupby("protein_id").apply( + pep_corrs, include_groups=False + ) corrs = corrs.droplevel(1, axis=0) - corrs = corrs.sort_values(['pepA', 'pepB']).sort_index() + corrs = corrs.sort_values(["pepA", "pepB"]).sort_index() # Compare to rCOPF reference output - pep_cols = ['pepA', 'pepB'] - assert corrs[pep_cols].equals(traces_corrs_ref[pep_cols]) # Both prev. sorted - - abs_tolerance = 1e-14 # loaded reference corrs precision 1e-15 - assert corrs['PCC'].values == approx(traces_corrs_ref['PCC'].values, abs=abs_tolerance) + pep_cols = ["pepA", "pepB"] + assert corrs[pep_cols].equals( + traces_corrs_ref[pep_cols] + ) # Both prev. sorted + + abs_tolerance = 1e-14 # loaded reference corrs precision 1e-15 + assert corrs["PCC"].values == approx( + traces_corrs_ref["PCC"].values, abs=abs_tolerance + ) @pytest.fixture def prot_dends(): - clusts_ref_path = DATA_DIR / 'mouse_tissue/traces_cluster-dendograms_rcopf.json' + clusts_ref_path = ( + DATA_DIR / "mouse_tissue/traces_cluster-dendograms_rcopf.json" + ) - with open(clusts_ref_path, 'r') as f: + with open(clusts_ref_path) as f: dends_R = json.load(f) # Reformat to match python sklearn dendograms dends = {} for prot_id in dends_R: - + dend = dends_R[prot_id] dend = transform_dendogram_r2py(dend) - if isinstance(dend['heights'], float): - dend['heights'] = [dend['heights']] + if isinstance(dend["heights"], float): + dend["heights"] = [dend["heights"]] dends[prot_id] = dend @@ -187,18 +195,22 @@ def test_peptide_dendograms_by_correlation_vs_rcopf(traces_corrs, prot_dends): # Construct map: {protein: dendogram} dends = {} - for protein_id, df in traces_corrs.groupby('protein_id'): + for protein_id, df in traces_corrs.groupby("protein_id"): - corr_df_sym = reconstruct_corrs_df_symmetric_from_long_df(df, var_a_col='pepA', var_b_col='pepB', corr_col='PCC') + corr_df_sym = reconstruct_corrs_df_symmetric_from_long_df( + df, var_a_col="pepA", var_b_col="pepB", corr_col="PCC" + ) corr_dists = 1 - corr_df_sym dends[protein_id] = peptide_dendograms_by_correlation_(corr_dists) dends_ref = copy.deepcopy(prot_dends) - # Remap + # Remap for prot_id, dend in dends_ref.items(): - dend_corrected = remap_dendogram_leaf_order(dend, ref_labels=dends[prot_id]['labels']) + dend_corrected = remap_dendogram_leaf_order( + dend, ref_labels=dends[prot_id]["labels"] + ) dends_ref[prot_id] = dend_corrected # Equal dendogram dict structure @@ -206,14 +218,15 @@ def test_peptide_dendograms_by_correlation_vs_rcopf(traces_corrs, prot_dends): assert len(dends_ref.keys()) == len(dends.keys()) for prot_id in dends_ref.keys(): - abs_tolerance = 1e-4 # loaded reference heights precision = 1e-4 - check_dendogram_equality(dends[prot_id], - dends_ref[prot_id], - abs_tolerance=abs_tolerance) + abs_tolerance = 1e-4 # loaded reference heights precision = 1e-4 + check_dendogram_equality( + dends[prot_id], dends_ref[prot_id], abs_tolerance=abs_tolerance + ) def test_peptide_clusters_from_dendograms_(): - '''Test protein-level peptide_clusters_from_dendograms_() on a single peptide group.''' + """Test protein-level peptide_clusters_from_dendograms_() on a + single peptide group.""" # Using dendogram-based toy data # # (11) @@ -231,64 +244,58 @@ def test_peptide_clusters_from_dendograms_(): # 2 2 1 1 0 0 n_clust=3, min_pep=1 # 2 2 1 1 0 0 n_clust=3, min_pep=2 # Expected cluster after cutting with different configurations, above: - # cluster numbers may be different order, which is accounted for in test comparisons. + # cluster numbers may be different order, which is accounted for in test + # comparisons. dendogram = { - 'type': 'sklearn_agglomerative_clustering', - 'labels': ['pepA', 'pepB', 'pepC', 'pepD', 'pepE', 'pepF'], - 'merge': [[0,1], [2,3], [4,5], [6,7], [8,9]], - 'heights': [0.1, 0.2, 0.4, 0.8, 0.9] - } + "type": "sklearn_agglomerative_clustering", + "labels": ["pepA", "pepB", "pepC", "pepD", "pepE", "pepF"], + "merge": [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]], + "heights": [0.1, 0.2, 0.4, 0.8, 0.9], + } # Config 1 clusters = peptide_clusters_from_dendograms_( - dendogram, - n_clusters = 1, - min_peptides_per_cluster=1) - expected_clusters = [['pepA', 'pepB', 'pepC', 'pepD', 'pepE', 'pepF']] + dendogram, n_clusters=1, min_peptides_per_cluster=1 + ) + expected_clusters = [["pepA", "pepB", "pepC", "pepD", "pepE", "pepF"]] compare_clusters_dsVlist(clusters, expected_clusters) # Config 2 clusters = peptide_clusters_from_dendograms_( - dendogram, - n_clusters = 1, - min_peptides_per_cluster=2) - expected_clusters = [['pepA', 'pepB', 'pepC', 'pepD', 'pepE', 'pepF']] + dendogram, n_clusters=1, min_peptides_per_cluster=2 + ) + expected_clusters = [["pepA", "pepB", "pepC", "pepD", "pepE", "pepF"]] compare_clusters_dsVlist(clusters, expected_clusters) # Config 3 clusters = peptide_clusters_from_dendograms_( - dendogram, - n_clusters = 2, - min_peptides_per_cluster=1) - expected_clusters = [['pepA', 'pepB', 'pepC', 'pepD'], ['pepE', 'pepF']] + dendogram, n_clusters=2, min_peptides_per_cluster=1 + ) + expected_clusters = [["pepA", "pepB", "pepC", "pepD"], ["pepE", "pepF"]] compare_clusters_dsVlist(clusters, expected_clusters) # Config 4 clusters = peptide_clusters_from_dendograms_( - dendogram, - n_clusters = 2, - min_peptides_per_cluster=2) - expected_clusters = [['pepA', 'pepB', 'pepC', 'pepD'], ['pepE', 'pepF']] + dendogram, n_clusters=2, min_peptides_per_cluster=2 + ) + expected_clusters = [["pepA", "pepB", "pepC", "pepD"], ["pepE", "pepF"]] compare_clusters_dsVlist(clusters, expected_clusters) # Config 5 clusters = peptide_clusters_from_dendograms_( - dendogram, - n_clusters = 3, - min_peptides_per_cluster=1) - expected_clusters = [['pepA', 'pepB'], ['pepC', 'pepD'], ['pepE', 'pepF']] + dendogram, n_clusters=3, min_peptides_per_cluster=1 + ) + expected_clusters = [["pepA", "pepB"], ["pepC", "pepD"], ["pepE", "pepF"]] compare_clusters_dsVlist(clusters, expected_clusters) # Config 6 clusters = peptide_clusters_from_dendograms_( - dendogram, - n_clusters = 3, - min_peptides_per_cluster=2) - expected_clusters = [['pepA', 'pepB'], ['pepC', 'pepD'], ['pepE', 'pepF']] + dendogram, n_clusters=3, min_peptides_per_cluster=2 + ) + expected_clusters = [["pepA", "pepB"], ["pepC", "pepD"], ["pepE", "pepF"]] compare_clusters_dsVlist(clusters, expected_clusters) - # Using dendogram-based toy data # # (8) @@ -304,68 +311,60 @@ def test_peptide_clusters_from_dendograms_(): # 1 1 1 1 0 n_clust=2, min_pep=1 # 1 1 0 0 x n_clust=2, min_pep=2 # 2 2 1 1 0 n_clust=3, min_pep=1 - # 2 1 0 0 x n_clust=3, min_pep=2 + # 2 1 0 0 x n_clust=3, min_pep=2 dendogram = { - 'type': 'sklearn_agglomerative_clustering', - 'labels': ['pepA', 'pepB', 'pepC', 'pepD', 'pepE'], - 'merge': [[0,1], [2,3], [5,6], [4,7]], - 'heights': [0.1, 0.2, 0.4, 0.8] - } + "type": "sklearn_agglomerative_clustering", + "labels": ["pepA", "pepB", "pepC", "pepD", "pepE"], + "merge": [[0, 1], [2, 3], [5, 6], [4, 7]], + "heights": [0.1, 0.2, 0.4, 0.8], + } # Config 1 clusters = peptide_clusters_from_dendograms_( - dendogram, - n_clusters = 1, - min_peptides_per_cluster=1) - expected_clusters = [['pepA', 'pepB', 'pepC', 'pepD', 'pepE']] + dendogram, n_clusters=1, min_peptides_per_cluster=1 + ) + expected_clusters = [["pepA", "pepB", "pepC", "pepD", "pepE"]] compare_clusters_dsVlist(clusters, expected_clusters) # Config 2 clusters = peptide_clusters_from_dendograms_( - dendogram, - n_clusters = 1, - min_peptides_per_cluster=2) - expected_clusters = [['pepA', 'pepB', 'pepC', 'pepD', 'pepE']] + dendogram, n_clusters=1, min_peptides_per_cluster=2 + ) + expected_clusters = [["pepA", "pepB", "pepC", "pepD", "pepE"]] compare_clusters_dsVlist(clusters, expected_clusters) # Config 3 clusters = peptide_clusters_from_dendograms_( - dendogram, - n_clusters = 2, - min_peptides_per_cluster=1) - expected_clusters = [['pepA', 'pepB', 'pepC', 'pepD'], ['pepE']] + dendogram, n_clusters=2, min_peptides_per_cluster=1 + ) + expected_clusters = [["pepA", "pepB", "pepC", "pepD"], ["pepE"]] compare_clusters_dsVlist(clusters, expected_clusters) # Config 4 clusters = peptide_clusters_from_dendograms_( - dendogram, - n_clusters = 2, - min_peptides_per_cluster=2) - expected_clusters = [['pepA', 'pepB'], ['pepC', 'pepD'], ['pepE']] + dendogram, n_clusters=2, min_peptides_per_cluster=2 + ) + expected_clusters = [["pepA", "pepB"], ["pepC", "pepD"], ["pepE"]] compare_clusters_dsVlist(clusters, expected_clusters) - assert clusters['pepE'] == NOISE + assert clusters["pepE"] == NOISE # Config 5 clusters = peptide_clusters_from_dendograms_( - dendogram, - n_clusters = 3, - min_peptides_per_cluster=1) - expected_clusters = [['pepA', 'pepB'], ['pepC', 'pepD'], ['pepE']] + dendogram, n_clusters=3, min_peptides_per_cluster=1 + ) + expected_clusters = [["pepA", "pepB"], ["pepC", "pepD"], ["pepE"]] compare_clusters_dsVlist(clusters, expected_clusters) # Config 6 clusters = peptide_clusters_from_dendograms_( - dendogram, - n_clusters = 3, - min_peptides_per_cluster=2) - expected_clusters = [['pepA', 'pepB', 'pepC', 'pepD', 'pepE']] + dendogram, n_clusters=3, min_peptides_per_cluster=2 + ) + expected_clusters = [["pepA", "pepB", "pepC", "pepD", "pepE"]] compare_clusters_dsVlist(clusters, expected_clusters) assert clusters.nunique() == 1 assert clusters.iloc[0] == NOISE - - # Using correlation based toy data # # (8) @@ -381,30 +380,35 @@ def test_peptide_clusters_from_dendograms_(): # 1 1 1 1 0 n_clust=2, min_pep=1 # 1 1 0 0 x n_clust=2, min_pep=2 # 2 2 1 1 0 n_clust=3, min_pep=1 - # 2 1 0 0 x n_clust=3, min_pep=2 - - corrs = pd.DataFrame({ - 'pepA': [0, 1, 3, 3, 4], - 'pepB': [1, 0, 3, 3, 4], - 'pepC': [3, 3, 0, 2, 4], - 'pepD': [3, 3, 2, 0, 4], - 'pepE': [4, 4, 4, 4, 0], - }, index=['pepA', 'pepB', 'pepC', 'pepD', 'pepE']) - - model = AgglomerativeClustering(n_clusters=None, - metric='precomputed', - linkage='average', - distance_threshold=0, - compute_distances=True) + # 2 1 0 0 x n_clust=3, min_pep=2 + + corrs = pd.DataFrame( + { + "pepA": [0, 1, 3, 3, 4], + "pepB": [1, 0, 3, 3, 4], + "pepC": [3, 3, 0, 2, 4], + "pepD": [3, 3, 2, 0, 4], + "pepE": [4, 4, 4, 4, 0], + }, + index=["pepA", "pepB", "pepC", "pepD", "pepE"], + ) + + model = AgglomerativeClustering( + n_clusters=None, + metric="precomputed", + linkage="average", + distance_threshold=0, + compute_distances=True, + ) model.fit(corrs) # pylint: disable=no-member dendogram = { - 'type': 'sklearn_agglomerative_clustering', - 'labels': model.feature_names_in_.tolist(), - 'heights': model.distances_.tolist(), - 'merge': model.children_.tolist() + "type": "sklearn_agglomerative_clustering", + "labels": model.feature_names_in_.tolist(), + "heights": model.distances_.tolist(), + "merge": model.children_.tolist(), } # pylint: enable=no-member @@ -412,94 +416,88 @@ def test_peptide_clusters_from_dendograms_(): # Config 1 clusters = peptide_clusters_from_dendograms_( - dendogram, - n_clusters = 1, - min_peptides_per_cluster=1) - expected_clusters = [['pepA', 'pepB', 'pepC', 'pepD', 'pepE']] + dendogram, n_clusters=1, min_peptides_per_cluster=1 + ) + expected_clusters = [["pepA", "pepB", "pepC", "pepD", "pepE"]] compare_clusters_dsVlist(clusters, expected_clusters) # Config 2 clusters = peptide_clusters_from_dendograms_( - dendogram, - n_clusters = 1, - min_peptides_per_cluster=2) - expected_clusters = [['pepA', 'pepB', 'pepC', 'pepD', 'pepE']] + dendogram, n_clusters=1, min_peptides_per_cluster=2 + ) + expected_clusters = [["pepA", "pepB", "pepC", "pepD", "pepE"]] compare_clusters_dsVlist(clusters, expected_clusters) # Config 3 clusters = peptide_clusters_from_dendograms_( - dendogram, - n_clusters = 2, - min_peptides_per_cluster=1) - expected_clusters = [['pepA', 'pepB', 'pepC', 'pepD'], ['pepE']] + dendogram, n_clusters=2, min_peptides_per_cluster=1 + ) + expected_clusters = [["pepA", "pepB", "pepC", "pepD"], ["pepE"]] compare_clusters_dsVlist(clusters, expected_clusters) # Config 4 clusters = peptide_clusters_from_dendograms_( - dendogram, - n_clusters = 2, - min_peptides_per_cluster=2) - expected_clusters = [['pepA', 'pepB'], ['pepC', 'pepD'], ['pepE']] + dendogram, n_clusters=2, min_peptides_per_cluster=2 + ) + expected_clusters = [["pepA", "pepB"], ["pepC", "pepD"], ["pepE"]] compare_clusters_dsVlist(clusters, expected_clusters) - assert clusters['pepE'] == NOISE + assert clusters["pepE"] == NOISE # Config 5 clusters = peptide_clusters_from_dendograms_( - dendogram, - n_clusters = 3, - min_peptides_per_cluster=1) - expected_clusters = [['pepA', 'pepB'], ['pepC', 'pepD'], ['pepE']] + dendogram, n_clusters=3, min_peptides_per_cluster=1 + ) + expected_clusters = [["pepA", "pepB"], ["pepC", "pepD"], ["pepE"]] compare_clusters_dsVlist(clusters, expected_clusters) # Config 6 clusters = peptide_clusters_from_dendograms_( - dendogram, - n_clusters = 3, - min_peptides_per_cluster=2) - expected_clusters = [['pepA', 'pepB', 'pepC', 'pepD', 'pepE']] + dendogram, n_clusters=3, min_peptides_per_cluster=2 + ) + expected_clusters = [["pepA", "pepB", "pepC", "pepD", "pepE"]] compare_clusters_dsVlist(clusters, expected_clusters) assert clusters.nunique() == 1 assert clusters.iloc[0] == NOISE + @pytest.fixture def prot_clust_ann(): - '''Get protein-level peptide cluster annotations from COPF mouse tissue df.''' + """Get protein-level peptide cluster annotations from COPF mouse + tissue df.""" clusts_ann_path = DATA_DIR / ( - 'mouse_tissue/traces_annotation_cluster-assignment_rcopf.tsv' - ) - clusts_ann = pd.read_csv(clusts_ann_path, sep='\t', header=0) + "mouse_tissue/traces_annotation_cluster-assignment_rcopf.tsv" + ) + clusts_ann = pd.read_csv(clusts_ann_path, sep="\t", header=0) return clusts_ann -def test_peptide_clusters_from_dendograms_vs_rcopf_(prot_dends, prot_clust_ann): - ''' - Test protein-level peptide_clusters_from_dendograms_() - on an rCOPF-derived reference dataset. - ''' + +def test_peptide_clusters_from_dendograms_vs_rcopf_( + prot_dends, prot_clust_ann +): + """Test protein-level peptide_clusters_from_dendograms_() on an + rCOPF-derived reference dataset.""" cluster_ann_ref_df = copy.deepcopy(prot_clust_ann) # Compute protein-level cluster annotations cluster_ann = {} for prot, dend in prot_dends.items(): dend_upd = copy.deepcopy(dend) - dend_upd['type'] = 'sklearn_agglomerative_clustering' + dend_upd["type"] = "sklearn_agglomerative_clustering" clusters = peptide_clusters_from_dendograms_( - dend_upd, - n_clusters=2, - min_peptides_per_cluster=2, - noise=NOISE - ) + dend_upd, n_clusters=2, min_peptides_per_cluster=2, noise=NOISE + ) cluster_ann[prot] = clusters - + # Format reference cluster annotations - cluster_ann_ref_df = cluster_ann_ref_df.set_index('id') - prot_ids = cluster_ann_ref_df['protein_id'].unique() + cluster_ann_ref_df = cluster_ann_ref_df.set_index("id") + prot_ids = cluster_ann_ref_df["protein_id"].unique() cluster_ann_ref = {} for p in prot_ids: - clusters = cluster_ann_ref_df[cluster_ann_ref_df['protein_id'] == p] - clusters_map = clusters['cluster'].to_dict() + clusters = cluster_ann_ref_df[cluster_ann_ref_df["protein_id"] == p] + clusters_map = clusters["cluster"].to_dict() clusters_map_inv = ListDict() for pep, clust in clusters_map.items(): @@ -521,66 +519,68 @@ def test_peptide_clusters_from_dendograms_vs_rcopf_(prot_dends, prot_clust_ann): @pytest.fixture def trace_annotation_proteoform_scores(): - '''Get protein-level proteoform annotations from COPF mouse tissue dataset.''' + """Get protein-level proteoform annotations from COPF mouse tissue + dataset.""" annotation_path = DATA_DIR / ( - 'mouse_tissue/trace_annotation_proteoform-scores_rcopf.tsv' - ) - clusts_ann = pd.read_csv(annotation_path, sep='\t', header=0) + "mouse_tissue/trace_annotation_proteoform-scores_rcopf.tsv" + ) + clusts_ann = pd.read_csv(annotation_path, sep="\t", header=0) return clusts_ann + def test_proteoform_scores_vs_rcopf_( traces_corrs, prot_clust_ann, fraction_annotation, trace_annotation_proteoform_scores, summary_func=np.mean, - ): +): n_fractions = len(fraction_annotation) columns = [ - 'protein_id', - 'proteoform_score', - 'proteoform_score_z', - 'proteoform_score_dz', - 'proteoform_score_pval', - ] + "protein_id", + "proteoform_score", + "proteoform_score_z", + "proteoform_score_dz", + "proteoform_score_pval", + ] proteoform_scores_list = [] - for prot, corrs in traces_corrs.groupby('protein_id'): + for prot, corrs in traces_corrs.groupby("protein_id"): corrs_mat = reconstruct_corrs_df_symmetric_from_long_df( - corrs, - var_a_col='pepA', - var_b_col='pepB', - corr_col='PCC') + corrs, var_a_col="pepA", var_b_col="pepB", corr_col="PCC" + ) - clusters = prot_clust_ann[prot_clust_ann['protein_id'] == prot] - clusters = clusters.set_index('id')['cluster'] + clusters = prot_clust_ann[prot_clust_ann["protein_id"] == prot] + clusters = clusters.set_index("id")["cluster"] clusters[clusters == 100] = NOISE scores = proteoform_scores_( - corrs_mat, - clusters, - n_fractions, - summary_func=np.mean) + corrs_mat, clusters, n_fractions, summary_func=np.mean + ) - scores_entry = {column:value for column, value in zip(columns[1:5], scores)} - scores_entry['protein_id'] = prot + scores_entry = { + column: value for column, value in zip(columns[1:5], scores) + } + scores_entry["protein_id"] = prot scores_entry = pd.DataFrame([scores_entry]) proteoform_scores_list.append(scores_entry) proteoform_scores = pd.concat(proteoform_scores_list, ignore_index=True) - proteoform_scores = proteoform_scores.loc[:, columns].set_index('protein_id') + proteoform_scores = proteoform_scores.loc[:, columns].set_index( + "protein_id" + ) # Format reference proteoforms proteoform_scores_ref = trace_annotation_proteoform_scores[columns] proteoform_scores_ref = proteoform_scores_ref.drop_duplicates() - proteoform_scores_ref = proteoform_scores_ref.set_index('protein_id') + proteoform_scores_ref = proteoform_scores_ref.set_index("protein_id") # Compare pfs_idx = set(proteoform_scores.index) @@ -588,13 +588,23 @@ def test_proteoform_scores_vs_rcopf_( assert len(pfs_idx.symmetric_difference(pfs_ref_idx)) == 0 assert len(pfs_idx) == len(pfs_ref_idx) - assert len(set(proteoform_scores.columns).symmetric_difference(set(proteoform_scores_ref.columns))) == 0 + assert ( + len( + set(proteoform_scores.columns).symmetric_difference( + set(proteoform_scores_ref.columns) + ) + ) + == 0 + ) - proteoform_scores = proteoform_scores.loc[proteoform_scores_ref.index,proteoform_scores_ref.columns] + proteoform_scores = proteoform_scores.loc[ + proteoform_scores_ref.index, proteoform_scores_ref.columns + ] assert np.allclose( proteoform_scores, proteoform_scores_ref, rtol=0, atol=1e-12, - equal_nan=True) + equal_nan=True, + ) diff --git a/tests/utils/helpers.py b/tests/utils/helpers.py index 3bbba95..f8d2e80 100644 --- a/tests/utils/helpers.py +++ b/tests/utils/helpers.py @@ -5,6 +5,7 @@ from proteopy.utils.copf import reconstruct_corrs_df_symmetric_from_long_df + def test_reconstruct_corrs_df_symmetric_from_long_df(): # labels: a-c @@ -13,19 +14,22 @@ def test_reconstruct_corrs_df_symmetric_from_long_df(): # [ x, 1 , x ], ==> [ 0.1, 1 , 0.4], # [ x 0.4, 1 ]] [ 0.5, 0.4, 1 ]] - df = pd.DataFrame({ - 'colA': ['a', 'a', 'b', 'c', 'c'], - 'colB': ['b', 'c', 'b', 'b', 'c'], - 'value':[0.1, 0.5, 1, 0.4, 1] - }) - - df_expected = pd.DataFrame({ - 'a': [1, 0.1, 0.5], - 'b': [0.1, 1, 0.4], - 'c':[0.5, 0.4, 1] - }, index=['a', 'b', 'c']) - - df_reconstructed = reconstruct_corrs_df_symmetric_from_long_df(df, 'colA', 'colB', 2) + df = pd.DataFrame( + { + "colA": ["a", "a", "b", "c", "c"], + "colB": ["b", "c", "b", "b", "c"], + "value": [0.1, 0.5, 1, 0.4, 1], + } + ) + + df_expected = pd.DataFrame( + {"a": [1, 0.1, 0.5], "b": [0.1, 1, 0.4], "c": [0.5, 0.4, 1]}, + index=["a", "b", "c"], + ) + + df_reconstructed = reconstruct_corrs_df_symmetric_from_long_df( + df, "colA", "colB", 2 + ) assert np.isclose(df_reconstructed, df_expected, atol=1e-4).all().all() assert all(df_reconstructed.index == df_reconstructed.columns) @@ -37,35 +41,34 @@ def transform_dendogram_merge_arr_r2py(merge_arr: list): merge_new = np.where(merge_new > 0, merge_new + n_samples, merge_new) merge_new = np.abs(merge_new) - merge_new = merge_new - 1 # 1-based -> 0-based order + merge_new = merge_new - 1 # 1-based -> 0-based order return merge_new def transform_dendogram_r2py(dendogram: dict): - ''' + """ Parameters: ----------- dendogram: dict Dictionary with the following structure: {..., merge = [[int, int], ...]} - ''' + """ - if not 'merge' in dendogram.keys(): - raise ValueError('Dendogram slot missing!') + if "merge" not in dendogram.keys(): + raise ValueError("Dendogram slot missing!") - merge = dendogram['merge'] + merge = dendogram["merge"] merge_new = transform_dendogram_merge_arr_r2py(merge) dendogram_new = copy.deepcopy(dendogram) - dendogram_new['merge'] = merge_new.tolist() + dendogram_new["merge"] = merge_new.tolist() return dendogram_new def remap_dendogram_leaf_order(dendogram: dict, ref_labels: list): - ''' - Remap nodes in dendogram['merge'] using a reference label order. - + """Remap nodes in dendogram['merge'] using a reference label order. + Parameters: ----------- - dendogram: dict @@ -73,31 +76,33 @@ def remap_dendogram_leaf_order(dendogram: dict, ref_labels: list): - merge: np.ndarray of shape (n_samples-1, 2) - heights: list of length n_samples - ref_annotation: list of labels in desired new leaf order - + Returns: -------- - dendogram with updated node indices remapped to match ref_annotation order - ''' - orig_labels = dendogram['labels'] + """ + orig_labels = dendogram["labels"] n_samples = len(orig_labels) - assert set(orig_labels) == set(ref_labels), f'orig_labels: {orig_labels},\nref_labels: {ref_labels}' + assert set(orig_labels) == set( + ref_labels + ), f"orig_labels: {orig_labels},\nref_labels: {ref_labels}" assert len(orig_labels) == len(ref_labels) - assert len(orig_labels) == len(dendogram['merge']) + 1 + assert len(orig_labels) == len(dendogram["merge"]) + 1 + + merge_arr = np.array(dendogram["merge"]) - merge_arr = np.array(dendogram['merge']) - # Mapping from original index to ref index orig_label_to_index = {label: i for i, label in enumerate(orig_labels)} ref_label_to_index = {label: i for i, label in enumerate(ref_labels)} - + # Create remapping array leaf_map = np.zeros(n_samples, dtype=int) for label in orig_labels: orig_idx = orig_label_to_index[label] ref_idx = ref_label_to_index[label] leaf_map[orig_idx] = ref_idx - + # Now remap only values < n_leaves merge_remapped = merge_arr.copy() @@ -108,37 +113,43 @@ def remap_dendogram_leaf_order(dendogram: dict, ref_labels: list): # Replace old merge dendogram_remapped = copy.deepcopy(dendogram) - dendogram_remapped['labels'] = ref_labels - dendogram_remapped['merge'] = merge_remapped.tolist() - + dendogram_remapped["labels"] = ref_labels + dendogram_remapped["merge"] = merge_remapped.tolist() + return dendogram_remapped -def check_dendogram_equality(dend, dend_ref, rel_tolerance=None, abs_tolerance=None): - ''' +def check_dendogram_equality( + dend, dend_ref, rel_tolerance=None, abs_tolerance=None +): + """ Note: To choose the tolerances view API: pytest.approx - ''' + """ - keys = ('labels', 'merge', 'heights') + keys = ("labels", "merge", "heights") # Correct dict keys - assert all([key in dend_ref.keys() for key in keys]), f'dend.keys: {list(dend.keys())}\nkeys:{keys}' - assert all([key in dend.keys() for key in keys]), f'dend.keys: {list(dend.keys())}\nkeys:{keys}' + assert all( + [key in dend_ref.keys() for key in keys] + ), f"dend.keys: {list(dend.keys())}\nkeys:{keys}" + assert all( + [key in dend.keys() for key in keys] + ), f"dend.keys: {list(dend.keys())}\nkeys:{keys}" # Equal labels - labels_ref = dend_ref['labels'] - labels = dend['labels'] + labels_ref = dend_ref["labels"] + labels = dend["labels"] assert labels_ref == labels # Equal merge arrays - merge_arr_ref = dend_ref['merge'] - merge_arr = dend['merge'] + merge_arr_ref = dend_ref["merge"] + merge_arr = dend["merge"] for i, (pair_ref, pair) in enumerate(zip(merge_arr, merge_arr_ref)): - assert pair_ref == pair or pair_ref == pair[::-1], f'{i}' + assert pair_ref == pair or pair_ref == pair[::-1], f"{i}" # Equal heights - heights_ref = dend_ref['heights'] - heights = dend['heights'] + heights_ref = dend_ref["heights"] + heights = dend["heights"] assert heights == approx(heights_ref, rel=rel_tolerance, abs=abs_tolerance) diff --git a/tests/utils/test_anndata.py b/tests/utils/test_anndata.py index bf3fcfe..616291f 100644 --- a/tests/utils/test_anndata.py +++ b/tests/utils/test_anndata.py @@ -42,8 +42,7 @@ def test_peptide_id_must_be_unique(self): proteins = ["PROT_A", "PROT_B"] with pytest.warns(UserWarning, match="Variable names are not unique"): adata = AnnData( - np.arange(4).reshape(2, 2), - var=pd.DataFrame(index=peptides) + np.arange(4).reshape(2, 2), var=pd.DataFrame(index=peptides) ) adata.var["peptide_id"] = peptides adata.var["protein_id"] = proteins @@ -120,8 +119,7 @@ def test_protein_id_must_be_unique(self): proteins = ["PROT_A", "PROT_A"] with pytest.warns(UserWarning, match="Variable names are not unique"): adata = AnnData( - np.arange(4).reshape(2, 2), - var=pd.DataFrame(index=proteins) + np.arange(4).reshape(2, 2), var=pd.DataFrame(index=proteins) ) adata.var["protein_id"] = proteins @@ -155,7 +153,8 @@ def test_sample_id_matching_obs_names_passes(self): adata = AnnData( np.arange(4).reshape(2, 2), obs=pd.DataFrame( - {"sample_id": obs_names}, index=obs_names, + {"sample_id": obs_names}, + index=obs_names, ), var=pd.DataFrame(index=proteins), ) @@ -207,7 +206,8 @@ def test_nan_in_peptide_id_returns_false(self): adata = AnnData( np.arange(4).reshape(2, 2), obs=pd.DataFrame( - {"sample_id": obs_names}, index=obs_names, + {"sample_id": obs_names}, + index=obs_names, ), var=pd.DataFrame(index=peptides), ) @@ -228,7 +228,8 @@ def test_nan_in_protein_id_peptide_level_returns_false(self): adata = AnnData( np.arange(4).reshape(2, 2), obs=pd.DataFrame( - {"sample_id": obs_names}, index=obs_names, + {"sample_id": obs_names}, + index=obs_names, ), var=pd.DataFrame(index=peptides), ) @@ -249,7 +250,8 @@ def test_nan_in_protein_id_protein_level_returns_false(self): adata = AnnData( np.arange(4).reshape(2, 2), obs=pd.DataFrame( - {"sample_id": obs_names}, index=obs_names, + {"sample_id": obs_names}, + index=obs_names, ), var=pd.DataFrame(index=proteins), ) @@ -271,14 +273,16 @@ def test_layers_missing_key_returns_false(self): adata = AnnData( np.arange(4).reshape(2, 2), obs=pd.DataFrame( - {"sample_id": obs_names}, index=obs_names, + {"sample_id": obs_names}, + index=obs_names, ), var=pd.DataFrame(index=proteins), ) adata.var["protein_id"] = proteins result = is_proteodata( - adata, layers="nonexistent", + adata, + layers="nonexistent", ) assert result == (False, None) @@ -299,7 +303,8 @@ def test_layers_with_infinite_values_returns_false(self): adata = AnnData( X, obs=pd.DataFrame( - {"sample_id": obs_names}, index=obs_names, + {"sample_id": obs_names}, + index=obs_names, ), var=pd.DataFrame(index=proteins), ) @@ -328,7 +333,8 @@ def test_layers_valid_passes(self): adata = AnnData( X, obs=pd.DataFrame( - {"sample_id": obs_names}, index=obs_names, + {"sample_id": obs_names}, + index=obs_names, ), var=pd.DataFrame(index=proteins), ) @@ -336,7 +342,8 @@ def test_layers_valid_passes(self): adata.layers["raw"] = X.copy() assert is_proteodata( - adata, layers="raw", + adata, + layers="raw", ) == (True, "protein") def test_layers_multiple_keys(self): @@ -346,7 +353,8 @@ def test_layers_multiple_keys(self): adata = AnnData( X, obs=pd.DataFrame( - {"sample_id": obs_names}, index=obs_names, + {"sample_id": obs_names}, + index=obs_names, ), var=pd.DataFrame(index=proteins), ) @@ -375,7 +383,8 @@ def test_check_proteodata_propagates_layers(self): adata = AnnData( X, obs=pd.DataFrame( - {"sample_id": obs_names}, index=obs_names, + {"sample_id": obs_names}, + index=obs_names, ), var=pd.DataFrame(index=proteins), ) @@ -383,7 +392,8 @@ def test_check_proteodata_propagates_layers(self): adata.layers["raw"] = X.copy() assert check_proteodata( - adata, layers="raw", + adata, + layers="raw", ) == (True, "protein") bad_layer = X.copy() diff --git a/tests/utils/test_data_structures.py b/tests/utils/test_data_structures.py index 9a0b1ae..6633dec 100644 --- a/tests/utils/test_data_structures.py +++ b/tests/utils/test_data_structures.py @@ -1,125 +1,139 @@ import pytest from proteopy.utils.data_structures import BinaryClusterTree, ListDict + def test_ListDict(): ld = ListDict() - ld['a'].append(0) - assert ld['a'] == [0] + ld["a"].append(0) + assert ld["a"] == [0] + + ld["a"].append(1) + assert ld["a"] == [0, 1] - ld['a'].append(1) - assert ld['a'] == [0,1] + ld["a"].extend(["a", "b"]) + assert ld["a"] == [0, 1, "a", "b"] - ld['a'].extend(['a', 'b']) - assert ld['a'] == [0,1,'a', 'b'] + ld["exception"] = 100 + assert isinstance(ld["exception"], int) + assert ld["exception"] == 100 - ld['exception'] = 100 - assert isinstance(ld['exception'], int) - assert ld['exception'] == 100 @pytest.fixture def example_tree(): constructor = { - 'type': 'sklearn_agglomerative_clustering', - 'labels': ['label_0', 'label_1', 'label_2', 'label_3'], - 'merge': [[2, 3], [4, 0], [1, 5]], - 'heights': [0.1, 0.3, 0.8] + "type": "sklearn_agglomerative_clustering", + "labels": ["label_0", "label_1", "label_2", "label_3"], + "merge": [[2, 3], [4, 0], [1, 5]], + "heights": [0.1, 0.3, 0.8], } return BinaryClusterTree(constructor) + def test_tree_structure(example_tree): - labels = ['label_0', 'label_1', 'label_2', 'label_3'] + labels = ["label_0", "label_1", "label_2", "label_3"] root = example_tree.root assert example_tree.size == 7 assert example_tree.labels == labels - assert example_tree.get_labels([0,1,2,3]) == labels + assert example_tree.get_labels([0, 1, 2, 3]) == labels assert example_tree.count_leaves() == 4 assert root.value == 6 assert root.height == 0.8 - + assert root.left.value == 1 assert root.left.height == 0.8 assert root.left.is_leaf() - + assert root.right.value == 5 assert root.right.height == 0.3 assert not root.right.is_leaf() - + node5 = root.right assert node5.left.value == 4 assert node5.left.height == 0.1 assert not node5.left.is_leaf() - + assert node5.right.value == 0 assert node5.right.height == 0.3 assert node5.right.is_leaf() - + node4 = node5.left assert node4.left.value == 2 assert node4.left.height == 0.1 assert node4.left.is_leaf() - + assert node4.right.value == 3 assert node4.right.height == 0.1 assert node4.right.is_leaf() + def test_count_leaves(example_tree): assert example_tree.count_leaves() == 4 - + root = example_tree.root node5 = root.right assert BinaryClusterTree._count_leaves(node5) == 3 # Leaves: 0,2,3 - + node4 = node5.left assert BinaryClusterTree._count_leaves(node4) == 2 # Leaves: 2,3 + def test_cut_k1(example_tree): df = example_tree.cut(1, use_labels=True) - assert all([i in ['label_0', 'label_1', 'label_2', 'label_3'] for i in df.index]) + assert all( + [i in ["label_0", "label_1", "label_2", "label_3"] for i in df.index] + ) assert len(df) == 4 assert df.nunique() == 1 assert df.iloc[0] == 6 # Root cluster + def test_cut_k2(example_tree): df = example_tree.cut(2, use_labels=True) - assert all([i in ['label_0', 'label_1', 'label_2', 'label_3'] for i in df.index]) + assert all( + [i in ["label_0", "label_1", "label_2", "label_3"] for i in df.index] + ) assert len(df) == 4 assert df.nunique() == 2 - + cluster1 = df[df == 1] assert cluster1.iloc[0] == 1 - - cluster5 = df[df.index != 'label_1'] + + cluster5 = df[df.index != "label_1"] assert all(cluster5 == 5) + def test_cut_k3(example_tree): # use_labels=True df = example_tree.cut(3, use_labels=True) - assert all([i in ['label_0', 'label_1', 'label_2', 'label_3'] for i in df.index]) + assert all( + [i in ["label_0", "label_1", "label_2", "label_3"] for i in df.index] + ) assert len(df) == 4 assert df.nunique() == 3 - - assert df[df.index == 'label_0'].iloc[0] == 0 - assert df[df.index == 'label_1'].iloc[0] == 1 - cluster4 = df[df.index.isin(['label_2', 'label_3'])] + + assert df[df.index == "label_0"].iloc[0] == 0 + assert df[df.index == "label_1"].iloc[0] == 1 + cluster4 = df[df.index.isin(["label_2", "label_3"])] assert all(cluster4 == 4) # use_labels=False df = example_tree.cut(3, use_labels=False) - assert all([i in [0,1,2,3] for i in df.index]) + assert all([i in [0, 1, 2, 3] for i in df.index]) assert len(df) == 4 assert df.nunique() == 3 - + assert df[df.index == 0].iloc[0] == 0 assert df[df.index == 1].iloc[0] == 1 - cluster4 = df[df.index.isin([2,3])] + cluster4 = df[df.index.isin([2, 3])] assert all(cluster4 == 4) + def test_cut_k4(example_tree): # use_labels=False - labels = [0,1,2,3] + labels = [0, 1, 2, 3] df = example_tree.cut(4, use_labels=False) print(df) assert all([i in labels for i in df.index]) @@ -131,7 +145,7 @@ def test_cut_k4(example_tree): assert label == cluster_id # use_labels=True - labels = ['label_0', 'label_1', 'label_2', 'label_3'] + labels = ["label_0", "label_1", "label_2", "label_3"] df = example_tree.cut(4, use_labels=True) assert all([i in labels for i in df.index]) assert len(df) == 4 @@ -139,4 +153,4 @@ def test_cut_k4(example_tree): for label in labels: cluster_id = df[df.index == label].iloc[0] - assert label == f'label_{cluster_id}' + assert label == f"label_{cluster_id}" diff --git a/tests/utils/test_parsers.py b/tests/utils/test_parsers.py index d77c068..c5691c2 100644 --- a/tests/utils/test_parsers.py +++ b/tests/utils/test_parsers.py @@ -1,6 +1,6 @@ -""" -Tests for proteopy.utils.parsers.parse_stat_test_varm_slot function. -""" +"""Tests for proteopy.utils.parsers.parse_stat_test_varm_slot +function.""" + import numpy as np import pandas as pd import pytest @@ -13,7 +13,8 @@ class TestParseStatTestVarmSlot: """Tests for parse_stat_test_varm_slot function.""" def test_parse_welch_two_group_no_layer(self): - """Test parsing a Welch's t-test two-group slot without layer.""" + """Test parsing a Welch's t-test two-group slot without + layer.""" slot = "welch;condition;treated_vs_control" result = parse_stat_test_varm_slot(slot) @@ -58,12 +59,12 @@ def test_parse_with_layer(self): assert result["layer"] == "raw_intensities" def test_layer_resolution_with_adata(self): - """Test that layer is resolved to original name when adata provided.""" + """Test that layer is resolved to original name when adata + provided.""" # Create AnnData with a layer that has spaces proteins = ["PROT_A", "PROT_B"] adata = AnnData( - np.arange(4).reshape(2, 2), - var=pd.DataFrame(index=proteins) + np.arange(4).reshape(2, 2), var=pd.DataFrame(index=proteins) ) adata.var["protein_id"] = proteins adata.layers["Raw Intensities"] = np.arange(4).reshape(2, 2) @@ -136,7 +137,8 @@ def test_vs_rest_missing_group_raises(self): parse_stat_test_varm_slot(slot) def test_sanitized_group_names(self): - """Test parsing with sanitized group names containing underscores.""" + """Test parsing with sanitized group names containing + underscores.""" slot = "welch;sample_condition;Group_A_vs_Group_B" result = parse_stat_test_varm_slot(slot)