UKHD-NP · leventetn · May 29, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/proteopy/ann/__init__.py b/proteopy/ann/__init__.py
@@ -1,2 +1,3 @@
 from .base_anndata import var, obs, samples
 from .proteins import proteins_from_csv
+from .contaminants import contaminants
diff --git a/proteopy/ann/contaminants.py b/proteopy/ann/contaminants.py
@@ -0,0 +1,183 @@
+"""
+Annotation helpers for marking contaminant variables in an AnnData.
+"""
+
+import os
+import warnings
+from collections.abc import Callable
+
+from anndata import AnnData
+
+from proteopy.utils.anndata import check_proteodata, is_proteodata
+from proteopy.utils.parsers import read_protein_ids
+
+
+def _print_contaminant_summary(
+    adata: AnnData,
+    mask_values,
+    protein_key: str,
+    key_added: str,
+) -> None:
+    """Print how many variables were flagged, worded per data level."""
+    _, level = is_proteodata(adata)
+    n_flagged = int(mask_values.sum())
+    if level == "peptide":
+        # Count distinct protein IDs only when something is flagged.
+        n_prot = 0
+        if n_flagged:
+            flagged_ids = adata.var.loc[mask_values, protein_key]
+            n_prot = int(flagged_ids.nunique())
+        message = (
+            f"Annotated {n_flagged} peptides from {n_prot} "
+            f"contaminating proteins at adata.var['{key_added}']."
+        )
+    elif level == "protein":
+        message = (
+            f"Annotated {n_flagged} contaminating proteins at "
+            f"adata.var['{key_added}']."
+        )
+    else:
+        message = (
+            f"Annotated {n_flagged} contaminating variables at "
+            f"adata.var['{key_added}']."
+        )
+    print(message)
+
+
+def contaminants(
+    adata: AnnData,
+    contaminant_path: str | os.PathLike,
+    *,
+    protein_key: str = "protein_id",
+    key_added: str = "is_contaminant",
+    header_parser: Callable[[str], str] | None = None,
+    has_header: bool = True,
+    inplace: bool = True,
+    verbose: bool = False,
+) -> AnnData | None:
+    """
+    Annotate contaminant variables by flagging them in ``adata.var``.
+
+    Reads protein identifiers from a contaminant list (FASTA / CSV /
+    TSV) and writes a boolean column ``adata.var[key_added]`` that is
+    ``True`` where ``adata.var[protein_key]`` matches an entry in the
+    list. Unlike :func:`proteopy.pp.remove_contaminants`, the
+    contaminants are kept in the AnnData and only flagged, so the
+    annotation can be used for downstream filtering decisions, QC
+    plots, or contaminant-aware normalization.
+
+    Parameters
+    ----------
+    adata : AnnData
+        :class:`~anndata.AnnData` annotated data matrix.
+    contaminant_path : str | os.PathLike
+        Path to the contaminant list. Supported formats: FASTA
+        (``.fasta`` / ``.fa`` / ``.faa``), CSV (``.csv``), TSV
+        (``.tsv``). See :func:`proteopy.utils.parsers.read_protein_ids`
+        for parsing details.
+    protein_key : str, optional
+        Column in ``adata.var`` holding protein identifiers used for
+        matching. Defaults to ``"protein_id"``.
+    key_added : str, optional
+        Name of the boolean column written to ``adata.var``. Defaults
+        to ``"is_contaminant"``.
+    header_parser : callable, optional
+        Function to extract protein IDs from FASTA headers. Defaults
+        to splitting the header on ``"|"`` and returning the second
+        element, falling back to the full header. Ignored (with a
+        warning) for tabular formats.
+    has_header : bool, optional
+        For CSV / TSV files, whether the first row is a header line.
+        Set to ``False`` for plain single-column ID lists.
+    inplace : bool, optional
+        If ``True`` (default), annotate ``adata`` in place and return
+        ``None``; otherwise annotate and return a copy.
+    verbose : bool, optional
+        If ``True``, print a summary of the flagged variables after
+        annotation. Defaults to ``False``.
+
+    Returns
+    -------
+    AnnData or None
+        ``None`` when ``inplace=True``; otherwise the annotated copy
+        of ``adata``.
+
+    Raises
+    ------
+    KeyError
+        If ``protein_key`` is not a column of ``adata.var``.
+
+    Warns
+    -----
+    UserWarning
+        Emitted when ``key_added`` already exists in ``adata.var``
+        (the column is overwritten). All warnings raised by
+        :func:`~proteopy.utils.parsers.read_protein_ids` (empty /
+        non-string parsed IDs, no IDs parsed, parser ignored for
+        tabular files) are propagated.
+
+    See Also
+    --------
+    proteopy.pp.remove_contaminants : Remove rather than annotate.
+    proteopy.utils.parsers.read_protein_ids : Underlying file reader.
+
+    Examples
+    --------
+    Flag two of three proteins as contaminants using an inline FASTA list.
+    The default header parser extracts the accession from the second
+    pipe-separated field (Swiss-Prot style):
+
+    >>> import tempfile
+    >>> from pathlib import Path
+    >>> import numpy as np
+    >>> import pandas as pd
+    >>> from anndata import AnnData
+    >>> import proteopy as pr
+    >>> with tempfile.TemporaryDirectory() as d:
+    ...     fasta = Path(d) / "contaminants.fasta"
+    ...     _ = fasta.write_text(
+    ...         ">sp|P00001|HUMAN_A\\nACDEF\\n"
+    ...         ">sp|P00002|HUMAN_B\\nGHIKL\\n"
+    ...     )
+    ...     adata = AnnData(
+    ...         X=np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]),
+    ...         obs=pd.DataFrame(
+    ...             {"sample_id": ["s1", "s2"]}, index=["s1", "s2"],
+    ...         ),
+    ...         var=pd.DataFrame(
+    ...             {"protein_id": ["P00001", "P00002", "P00003"]},
+    ...             index=["P00001", "P00002", "P00003"],
+    ...         ),
+    ...     )
+    ...     pr.ann.contaminants(adata, fasta, verbose=True)
+    ...     print(int(adata.var["is_contaminant"].sum()))
+    Annotated 2 contaminating proteins at adata.var['is_contaminant'].
+    2
+    """
+    check_proteodata(adata)
+
+    if protein_key not in adata.var.columns:
+        raise KeyError(f"`protein_key`='{protein_key}' not found in adata.var")
+    if key_added in adata.var.columns:
+        warnings.warn(
+            f"`key_added`='{key_added}' already exists in adata.var; "
+            "overwriting.",
+            UserWarning,
+            stacklevel=2,  # attribute the warning to the caller's line
+        )
+
+    adata_target = adata if inplace else adata.copy()
+
+    contaminant_ids = read_protein_ids(
+        contaminant_path, header_parser=header_parser, has_header=has_header,
+    )
+
+    mask = adata_target.var[protein_key].isin(contaminant_ids)
+    flags = mask.astype(bool).values
+    adata_target.var[key_added] = flags
+
+    if verbose:
+        _print_contaminant_summary(adata_target, flags, protein_key, key_added)
+
+    check_proteodata(adata_target)  # re-validate after writing the column
+    return None if inplace else adata_target
diff --git a/proteopy/download/contaminants.py b/proteopy/download/contaminants.py
@@ -74,6 +74,7 @@ def _format_fasta(
             destination_path,
             "w",
             encoding="utf-8",
+            newline="\n",
         ) as dest,
     ):
         for line in src: