breimanntools · breimanntools · Jul 3, 2026 · Jun 30, 2026 · Jul 1, 2026 · Jul 1, 2026
diff --git a/aaanalysis/__init__.py b/aaanalysis/__init__.py
@@ -1,4 +1,4 @@
-from .data_handling import (load_dataset, load_scales, load_features,
+from .data_handling import (load_dataset, load_scales, load_features, get_labels,
                             read_fasta, to_fasta,
                             SequencePreprocessor,
                             EmbeddingPreprocessor,
@@ -14,6 +14,8 @@
                       comp_per_protein_ap, comp_detection_metrics,
                       comp_bootstrap_ci, comp_smooth_scores)
 from .config import options
+from ._constants import (COLOR_SAMPLES_POS, COLOR_SAMPLES_NEG,
+                        COLOR_SAMPLES_UNL, COLOR_SAMPLES_REL_NEG)
 
 from importlib.metadata import version as _version, PackageNotFoundError
 
@@ -28,6 +30,7 @@
     "load_dataset",
     "load_scales",
     "load_features",
+    "get_labels",
     "read_fasta",
     "to_fasta",
     "SequencePreprocessor",
@@ -72,6 +75,10 @@
     "comp_detection_metrics",
     "comp_bootstrap_ci",
     "comp_smooth_scores",
+    "COLOR_SAMPLES_POS",
+    "COLOR_SAMPLES_NEG",
+    "COLOR_SAMPLES_UNL",
+    "COLOR_SAMPLES_REL_NEG",
     "options"
 ]
 

diff --git a/aaanalysis/_constants.py b/aaanalysis/_constants.py
@@ -478,6 +478,15 @@ def _folder_path(super_folder, folder_name):
 COLOR_NEG = "#ad4570"   # (173,69,112)
 COLOR_REL_NEG = "#ad9745" # (173, 151, 69)
 
+# Public, named aliases for the canonical sample-group colors (positive / negative /
+# unlabeled / reliable-negative). They mirror the ``DICT_COLOR["SAMPLES_*"]`` entries
+# exactly, so users can reference a named constant (``aa.COLOR_SAMPLES_POS``) instead
+# of indexing ``plot_get_cdict("DICT_COLOR")`` by string key.
+COLOR_SAMPLES_POS = COLOR_POS
+COLOR_SAMPLES_NEG = COLOR_NEG
+COLOR_SAMPLES_UNL = COLOR_UNL
+COLOR_SAMPLES_REL_NEG = COLOR_REL_NEG
+
 DICT_COLOR = {"SHAP_POS": COLOR_SHAP_POS,
               "SHAP_NEG": COLOR_SHAP_NEG,
               "FEAT_POS": COLOR_FEAT_POS,

diff --git a/aaanalysis/data_handling/__init__.py b/aaanalysis/data_handling/__init__.py
@@ -1,8 +1,8 @@
 """
 Data loading and sequence/embedding preprocessing — the package's data entry point.
 
-Public objects: load_dataset, load_scales, load_features, read_fasta, to_fasta,
-SequencePreprocessor, EmbeddingPreprocessor, combine_dict_nums.
+Public objects: load_dataset, load_scales, load_features, get_labels, read_fasta,
+to_fasta, SequencePreprocessor, EmbeddingPreprocessor, combine_dict_nums.
 Produces the core data objects the rest of the pipeline consumes: ``load_dataset``
 yields ``df_seq``, ``load_scales`` yields ``df_scales`` (fed to
 ``feature_engineering.AAclust`` / ``CPP``), ``load_features`` yields a reference
@@ -17,6 +17,7 @@
 from ._load_dataset import load_dataset
 from ._load_scales import load_scales
 from ._load_features import load_features
+from ._get_labels import get_labels
 from ._read_fasta import read_fasta
 from ._to_fasta import to_fasta
 from ._seq_preproc import SequencePreprocessor
@@ -27,6 +28,7 @@
     "load_dataset",
     "load_scales",
     "load_features",
+    "get_labels",
     "read_fasta",
     "to_fasta",
     "SequencePreprocessor",

diff --git a/aaanalysis/data_handling/_get_labels.py b/aaanalysis/data_handling/_get_labels.py
@@ -0,0 +1,70 @@
+"""
+This is a script for the frontend of the get_labels function, deriving a binary
+label vector from a sequence DataFrame's label column.
+"""
+from typing import Any
+import numpy as np
+import pandas as pd
+
+import aaanalysis.utils as ut
+
+
+# I Helper Functions
+def check_match_df_positive_label(df=None, col_label=None, positive_label=None) -> None:
+    """Check that the positive label value is present in the label column."""
+    present = set(df[col_label].tolist())
+    if positive_label not in present:
+        raise ValueError(f"'positive_label' ({positive_label}) is not among the values of "
+                         f"column '{col_label}' ({sorted(present, key=str)}).")
+
+
+# II Main Functions
+def get_labels(df: pd.DataFrame,
+               positive_label: Any = 1,
+               col_label: str = "label",
+               ) -> np.ndarray:
+    """
+    Derive a binary ``int`` label vector from a column of a sequence DataFrame.
+
+    Maps the value flagged as positive (``positive_label``) onto ``1`` and every other
+    value onto ``0``, the binary encoding consumed across the package (e.g. by
+    :meth:`CPP.run`, :class:`TreeModel`, and the ``labels`` argument of most tools).
+    This is the single-call form of the recurring ``(df[col] == x).astype(int).to_numpy()``
+    expression.
+
+    .. versionadded:: 1.1.0
+
+    Parameters
+    ----------
+    df : pd.DataFrame, shape (n_samples, n_seq_info)
+        Sequence DataFrame (``df_seq``) containing the label column ``col_label``.
+    positive_label : int or str, default=1
+        Value in ``col_label`` marking the positive class. All rows equal to it become
+        ``1``; all remaining rows become ``0``. Must be present in ``col_label``.
+    col_label : str, default='label'
+        Name of the column holding the (multi-value or already binary) labels.
+
+    Returns
+    -------
+    labels : np.ndarray, shape (n_samples,)
+        Binary ``int`` label vector (``1`` = positive, ``0`` = otherwise), row-aligned
+        to ``df``.
+
+    Notes
+    -----
+    * The result equals ``(df[col_label] == positive_label).astype(int).to_numpy()``.
+    * Pass the resulting vector directly as the ``labels`` argument of CPP, TreeModel,
+      or other tools. For Positive-Unlabeled mining do not binarize: keep the ``1`` (positive) /
+      ``2`` (unlabeled) markers as ``labels``, or pass ``X_pos`` / ``X_unlabeled`` to :meth:`dPULearn.fit`.
+
+    Examples
+    --------
+    .. include:: examples/get_labels.rst
+    """
+    # Check input
+    ut.check_str(name="col_label", val=col_label, accept_none=False)
+    ut.check_df(name="df", df=df, cols_required=col_label)
+    check_match_df_positive_label(df=df, col_label=col_label, positive_label=positive_label)
+    # Derive binary int label vector
+    labels = (df[col_label] == positive_label).astype(int).to_numpy()
+    return labels
diff --git a/aaanalysis/pu_learning/_dpulearn.py b/aaanalysis/pu_learning/_dpulearn.py
@@ -133,6 +133,15 @@ def check_match_X_X_neg(X=None, X_neg=None) -> None:
         raise ValueError(f"'n_features' does not match between 'X' (n={n_features}) and 'X_neg' (n={n_features_neg})")
 
 
+def check_match_X_pos_X_unlabeled(X_pos=None, X_unlabeled=None) -> None:
+    """Check that positive and unlabeled feature matrices share the same feature dimension."""
+    n_features_pos = X_pos.shape[1]
+    n_features_unl = X_unlabeled.shape[1]
+    if n_features_pos != n_features_unl:
+        raise ValueError(f"'n_features' does not match between 'X_pos' (n={n_features_pos}) and "
+                         f"'X_unlabeled' (n={n_features_unl})")
+
+
 # II Main Functions
 class dPULearn(Wrapper):
     """
@@ -210,11 +219,14 @@ def __init__(self,
         # Output parameters (will be set during model fitting)
         self.labels_ = None
         self.df_pu_ = None
+        self.mask_neg_ = None
 
     # Main method
     def fit(self,
-            X: ut.ArrayLike2D,
-            labels: ut.ArrayLike1D,
+            X: Optional[ut.ArrayLike2D] = None,
+            labels: Optional[ut.ArrayLike1D] = None,
+            X_pos: Optional[ut.ArrayLike2D] = None,
+            X_unlabeled: Optional[ut.ArrayLike2D] = None,
             label_pos: int = 1,
             label_unl: int = 2,
             label_neg: Optional[int] = None,
@@ -239,15 +251,30 @@ def fit(self,
 
         .. versionadded:: 0.1.0
 
+        There are two input modes (provide exactly one): pass ``X`` + ``labels`` (a single feature
+        matrix with per-sample markers), or — for the common positives-vs-unlabeled setup — pass the
+        two matrices ``X_pos`` and ``X_unlabeled`` separately, which are stacked internally with the
+        package markers. Either way, after fitting :attr:`dPULearn.mask_neg_` is the boolean mask of
+        reliable negatives (over ``X_unlabeled`` in the split mode, over ``X`` otherwise).
+
         Parameters
         ----------
-        X : array-like, shape (n_samples, n_features)
-            Feature matrix. `Rows` typically correspond to proteins and `columns` to features.
-        labels : array-like, shape (n_samples,)
+        X : array-like, shape (n_samples, n_features), optional
+            Feature matrix. `Rows` typically correspond to proteins and `columns` to features. Provide
+            ``X`` + ``labels``, or ``X_pos`` + ``X_unlabeled`` (exactly one of the two modes).
+        labels : array-like, shape (n_samples,), optional
             Dataset labels of samples in ``X``. Must contain the positive marker (``label_pos``) and the
             unlabeled marker (``label_unl``); pre-labeled negatives (``label_neg``) are optional. By
             default positives are ``1`` and unlabeled are ``2``; set ``label_unl=0`` to pass the standard
             ``{0, 1}`` encoding directly (``0`` = unlabeled, ``1`` = positive).
+        X_pos : array-like, shape (n_pos, n_features), optional
+            Feature matrix of the positive samples (split-input mode). Provided together with
+            ``X_unlabeled`` instead of ``X`` + ``labels``; the two are stacked and marked internally
+            (positives ``label_pos``, unlabeled ``label_unl``), so no manual label vector is needed.
+        X_unlabeled : array-like, shape (n_unl, n_features), optional
+            Feature matrix of the unlabeled candidate pool (split-input mode). Must have the same number
+            of features as ``X_pos``. After fitting, :attr:`dPULearn.mask_neg_` is a boolean mask over its
+            rows marking the identified reliable negatives.
         label_pos : int, default=1
             Value marking positive samples in ``labels``. Must be present.
         label_unl : int, default=2
@@ -322,6 +349,24 @@ def fit(self,
         --------
         .. include:: examples/dpul_fit.rst
         """
+        # Resolve the input mode: (X, labels) or the positives/unlabeled split. In the split
+        # mode, stack X_pos over X_unlabeled and build the label vector internally with the
+        # package markers, so the caller does not hand-roll the vstack + 1/2 vector + slice.
+        split_mode = X_pos is not None or X_unlabeled is not None
+        n_pos = None
+        if split_mode:
+            if X is not None or labels is not None:
+                raise ValueError("Pass either 'X'/'labels' or 'X_pos'/'X_unlabeled', not both.")
+            if X_pos is None or X_unlabeled is None:
+                raise ValueError("'X_pos' and 'X_unlabeled' must both be given for the split-input mode.")
+            X_pos = ut.check_X(X=X_pos, X_name="X_pos", min_n_samples=1)
+            X_unlabeled = ut.check_X(X=X_unlabeled, X_name="X_unlabeled", min_n_samples=1)
+            check_match_X_pos_X_unlabeled(X_pos=X_pos, X_unlabeled=X_unlabeled)
+            n_pos = X_pos.shape[0]
+            X = np.vstack([X_pos, X_unlabeled])
+            labels = np.array([label_pos] * n_pos + [label_unl] * X_unlabeled.shape[0])
+        elif X is None or labels is None:
+            raise ValueError("'X' and 'labels' are required (or pass 'X_pos' + 'X_unlabeled').")
         # Check input
         X = ut.check_X(X=X)
         check_match_labels_markers(label_pos=label_pos, label_unl=label_unl, label_neg=label_neg)
@@ -353,9 +398,11 @@ def fit(self,
         # Identify most far away negatives in PCA compressed feature space
         else:
             new_labels, df_pu = get_neg_via_pca(**args, n_components=n_components, **self._model_kwargs)
-        # Set new labels
+        # Set new labels + the reliable-negative mask. In the split-input mode the mask is over
+        # the rows of X_unlabeled (True = mined reliable negative); otherwise over all rows of X.
         self.labels_ = np.asarray(new_labels)
         self.df_pu_ = df_pu
+        self.mask_neg_ = self.labels_[n_pos:] == 0 if n_pos is not None else self.labels_ == 0
         return self
 
     @staticmethod

diff --git a/docs/_cheatsheet/content.py b/docs/_cheatsheet/content.py
@@ -188,6 +188,7 @@
          ("Load benchmark sequences", "load_dataset(name) → df_seq", None),
          ("Load AAontology scales", "load_scales() → df_scales", None),
          ("Load precomputed features", "load_features(name) → df_feat", None),
+         ("Binary labels from df column", "get_labels(df, positive_label) → labels", None, "v1.1"),
          ("Read / write FASTA", "read_fasta(file) → df_seq", None),
          ("Cluster redundant homologs", "filter_seq(df_seq) → df_clust  [pro]", None),
      ]},
@@ -221,6 +222,7 @@
     {"name": "Modeling & Explainability", "tag": "PU · classify · SHAP",
      "rows": [
          ("Train with positives + unlabeled data", "dPULearn().fit(X, labels)  [Wrapper]", None),
+         ("Mine reliable negatives (mask)", "dPULearn().fit(X_pos=, X_unlabeled=).mask_neg_ → mask", None, "v1.1"),
          ("Train + RFE + MC importance", "TreeModel().fit(X, labels)  [Wrapper]", None),
          ("Per-feature / sample SHAP impact", "ShapModel().fit(X, labels)  [pro]", None),
      ]},

diff --git a/docs/source/index/release_notes.rst b/docs/source/index/release_notes.rst
@@ -35,6 +35,9 @@ Added
   per-residue PTM and functional-site annotations and encodes them into tensors
   (``fetch_uniprot``, ``ingest``, ``register_feature``, ``encode``, ``build_scales``,
   ``build_cat``, ``to_df_seq``).
+- :func:`~aaanalysis.get_labels`: Derives a binary ``int`` label vector from a sequence DataFrame's
+  label column (``positive_label`` mapped to ``1``, everything else to ``0``) — the
+  single-call form of the recurring ``(df[col] == x).astype(int).to_numpy()`` expression.
 - :func:`~aaanalysis.combine_dict_nums`: Concatenates per-residue tensors (embedding / structure /
   annotation) along the feature axis into one combined :meth:`~aaanalysis.CPP.run_num` input.
 
@@ -132,6 +137,16 @@ Added
   switches the pre-computed prediction per P1 (feature map + structure restyle) with no kernel,
   keeping the column-residue linking (warned past 40 sites, hard-capped at 200).
 
+**PU Learning**
+
+- **dPULearn.fit — positives/unlabeled split input**: for the common positive / unlabeled
+  setup, ``fit`` now accepts ``X_pos`` and ``X_unlabeled`` separately (an alternative to
+  ``X`` + ``labels``), so they no longer have to be stacked by hand with a ``1`` / ``2`` label
+  vector. After fitting, the new ``dPULearn.mask_neg_`` attribute holds the **boolean mask
+  of reliable negatives** — over the rows of ``X_unlabeled`` in the split mode (exactly the
+  manual ``labels_[len(X_pos):] == 0`` result), over all rows of ``X`` otherwise. ``fit`` still
+  returns ``self`` and the existing ``fit(X, labels=...)`` path is unchanged.
+
 **Sequence Analysis**
 
 - :class:`~aaanalysis.AAWindowSampler`: Samples fixed-length sequence windows for PU-learning and
@@ -191,6 +206,10 @@ Added
 
 - :func:`~aaanalysis.plot_rank`: Standalone per-protein max-score-vs-rank scatter with group coloring and
   optional threshold lines (pairs with the new ``aa.metrics`` functions).
+- **COLOR_SAMPLES_POS / COLOR_SAMPLES_NEG / COLOR_SAMPLES_UNL / COLOR_SAMPLES_REL_NEG**:
+  Public, named constants for the canonical sample-group colors (positive / negative /
+  unlabeled / reliable-negative). They equal the ``plot_get_cdict("DICT_COLOR")["SAMPLES_*"]``
+  values exactly, so a named constant replaces indexing the color dict by string key.
 
 **Golden Pipelines**