Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion aaanalysis/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .data_handling import (load_dataset, load_scales, load_features,
from .data_handling import (load_dataset, load_scales, load_features, get_labels,
read_fasta, to_fasta,
SequencePreprocessor,
EmbeddingPreprocessor,
Expand All @@ -14,6 +14,8 @@
comp_per_protein_ap, comp_detection_metrics,
comp_bootstrap_ci, comp_smooth_scores)
from .config import options
from ._constants import (COLOR_SAMPLES_POS, COLOR_SAMPLES_NEG,
COLOR_SAMPLES_UNL, COLOR_SAMPLES_REL_NEG)

from importlib.metadata import version as _version, PackageNotFoundError

Expand All @@ -28,6 +30,7 @@
"load_dataset",
"load_scales",
"load_features",
"get_labels",
"read_fasta",
"to_fasta",
"SequencePreprocessor",
Expand Down Expand Up @@ -72,6 +75,10 @@
"comp_detection_metrics",
"comp_bootstrap_ci",
"comp_smooth_scores",
"COLOR_SAMPLES_POS",
"COLOR_SAMPLES_NEG",
"COLOR_SAMPLES_UNL",
"COLOR_SAMPLES_REL_NEG",
"options"
]

Expand Down
9 changes: 9 additions & 0 deletions aaanalysis/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,15 @@ def _folder_path(super_folder, folder_name):
COLOR_NEG = "#ad4570" # (173,69,112)
COLOR_REL_NEG = "#ad9745" # (173, 151, 69)

# Public, named aliases for the canonical sample-group colors (positive / negative /
# unlabeled / reliable-negative). They mirror the ``DICT_COLOR["SAMPLES_*"]`` entries
# exactly, so users can reference a named constant (``aa.COLOR_SAMPLES_POS``) instead
# of indexing ``plot_get_cdict("DICT_COLOR")`` by string key.
COLOR_SAMPLES_POS = COLOR_POS
COLOR_SAMPLES_NEG = COLOR_NEG
COLOR_SAMPLES_UNL = COLOR_UNL
COLOR_SAMPLES_REL_NEG = COLOR_REL_NEG

DICT_COLOR = {"SHAP_POS": COLOR_SHAP_POS,
"SHAP_NEG": COLOR_SHAP_NEG,
"FEAT_POS": COLOR_FEAT_POS,
Expand Down
6 changes: 4 additions & 2 deletions aaanalysis/data_handling/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""
Data loading and sequence/embedding preprocessing — the package's data entry point.

Public objects: load_dataset, load_scales, load_features, read_fasta, to_fasta,
SequencePreprocessor, EmbeddingPreprocessor, combine_dict_nums.
Public objects: load_dataset, load_scales, load_features, get_labels, read_fasta,
to_fasta, SequencePreprocessor, EmbeddingPreprocessor, combine_dict_nums.
Produces the core data objects the rest of the pipeline consumes: ``load_dataset``
yields ``df_seq``, ``load_scales`` yields ``df_scales`` (fed to
``feature_engineering.AAclust`` / ``CPP``), ``load_features`` yields a reference
Expand All @@ -17,6 +17,7 @@
from ._load_dataset import load_dataset
from ._load_scales import load_scales
from ._load_features import load_features
from ._get_labels import get_labels
from ._read_fasta import read_fasta
from ._to_fasta import to_fasta
from ._seq_preproc import SequencePreprocessor
Expand All @@ -27,6 +28,7 @@
"load_dataset",
"load_scales",
"load_features",
"get_labels",
"read_fasta",
"to_fasta",
"SequencePreprocessor",
Expand Down
70 changes: 70 additions & 0 deletions aaanalysis/data_handling/_get_labels.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""
This is a script for the frontend of the get_labels function, deriving a binary
label vector from a sequence DataFrame's label column.
"""
from typing import Any
import numpy as np
import pandas as pd

import aaanalysis.utils as ut


# I Helper Functions
def check_match_df_positive_label(df=None, col_label=None, positive_label=None) -> None:
    """
    Ensure that ``positive_label`` occurs at least once in ``df[col_label]``.

    Raises
    ------
    ValueError
        If ``positive_label`` is not among the distinct values of the label column.
        The message lists the observed values, sorted by their string representation.
    """
    # Distinct values of the label column (as plain Python objects via ``tolist``)
    observed_values = {*df[col_label].tolist()}
    if positive_label in observed_values:
        return
    # Sort by ``str`` so that mixed-type columns (e.g. int and str) can still be ordered
    listed_values = sorted(observed_values, key=str)
    raise ValueError(f"'positive_label' ({positive_label}) is not among the values of "
                     f"column '{col_label}' ({listed_values}).")


# II Main Functions
def get_labels(df: pd.DataFrame,
               positive_label: Any = 1,
               col_label: str = "label",
               ) -> np.ndarray:
    """
    Binarize a label column of a sequence DataFrame into a ``0``/``1`` ``int`` vector.

    Every row whose ``col_label`` value equals ``positive_label`` is encoded as ``1`` and every
    other row as ``0``. This is the binary encoding consumed across the package (e.g. by
    :meth:`CPP.run`, :class:`TreeModel`, and the ``labels`` argument of most tools), and the
    function replaces the recurring ``(df[col] == x).astype(int).to_numpy()`` one-liner.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    df : pd.DataFrame, shape (n_samples, n_seq_info)
        Sequence DataFrame (``df_seq``) that contains the label column ``col_label``.
    positive_label : int or str, default=1
        Value in ``col_label`` that marks the positive class. Rows equal to it are encoded as
        ``1``, all remaining rows as ``0``. Must occur in ``col_label``.
    col_label : str, default='label'
        Name of the column holding the (multi-value or already binary) labels.

    Returns
    -------
    labels : np.ndarray, shape (n_samples,)
        Binary ``int`` label vector (``1`` = positive, ``0`` = otherwise), row-aligned
        to ``df``.

    Notes
    -----
    * The result equals ``(df[col_label] == positive_label).astype(int).to_numpy()``.
    * The returned vector can be passed directly as the ``labels`` argument of CPP, TreeModel,
      or other tools. For Positive-Unlabeled mining keep the package ``1`` (positive) /
      ``2`` (unlabeled) markers instead and pass ``X_pos`` / ``X_unlabeled`` to :meth:`dPULearn.fit`.

    Examples
    --------
    .. include:: examples/get_labels.rst
    """
    # Validate arguments: column name first, then the DataFrame (which must contain that
    # column), then that the positive value actually occurs in the column
    ut.check_str(name="col_label", val=col_label, accept_none=False)
    ut.check_df(name="df", df=df, cols_required=col_label)
    check_match_df_positive_label(df=df, col_label=col_label, positive_label=positive_label)
    # Boolean mask of positive rows, cast to int (True -> 1, False -> 0)
    is_positive = df[col_label] == positive_label
    return is_positive.astype(int).to_numpy()
59 changes: 53 additions & 6 deletions aaanalysis/pu_learning/_dpulearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,15 @@ def check_match_X_X_neg(X=None, X_neg=None) -> None:
raise ValueError(f"'n_features' does not match between 'X' (n={n_features}) and 'X_neg' (n={n_features_neg})")


def check_match_X_pos_X_unlabeled(X_pos=None, X_unlabeled=None) -> None:
    """
    Ensure that ``X_pos`` and ``X_unlabeled`` have the same number of columns (features).

    Raises
    ------
    ValueError
        If the second dimension of ``X_pos`` differs from that of ``X_unlabeled``.
    """
    # Only the feature dimension (axis 1) has to agree; the number of rows may differ
    _, n_feat_pos = X_pos.shape[:2]
    _, n_feat_unl = X_unlabeled.shape[:2]
    if n_feat_pos == n_feat_unl:
        return
    raise ValueError(f"'n_features' does not match between 'X_pos' (n={n_feat_pos}) and "
                     f"'X_unlabeled' (n={n_feat_unl})")


# II Main Functions
class dPULearn(Wrapper):
"""
Expand Down Expand Up @@ -210,11 +219,14 @@ def __init__(self,
# Output parameters (will be set during model fitting)
self.labels_ = None
self.df_pu_ = None
self.mask_neg_ = None

# Main method
def fit(self,
X: ut.ArrayLike2D,
labels: ut.ArrayLike1D,
X: Optional[ut.ArrayLike2D] = None,
labels: Optional[ut.ArrayLike1D] = None,
X_pos: Optional[ut.ArrayLike2D] = None,
X_unlabeled: Optional[ut.ArrayLike2D] = None,
label_pos: int = 1,
label_unl: int = 2,
label_neg: Optional[int] = None,
Expand All @@ -239,15 +251,30 @@ def fit(self,

.. versionadded:: 0.1.0

There are two input modes (provide exactly one): pass ``X`` + ``labels`` (a single feature
matrix with per-sample markers), or — for the common positives-vs-unlabeled setup — pass the
two matrices ``X_pos`` and ``X_unlabeled`` separately, which are stacked internally with the
package markers. Either way, after fitting :attr:`dPULearn.mask_neg_` is the boolean mask of
reliable negatives (over ``X_unlabeled`` in the split mode, over ``X`` otherwise).

Parameters
----------
X : array-like, shape (n_samples, n_features)
Feature matrix. `Rows` typically correspond to proteins and `columns` to features.
labels : array-like, shape (n_samples,)
X : array-like, shape (n_samples, n_features), optional
Feature matrix. `Rows` typically correspond to proteins and `columns` to features. Provide
``X`` + ``labels``, or ``X_pos`` + ``X_unlabeled`` (exactly one of the two modes).
labels : array-like, shape (n_samples,), optional
Dataset labels of samples in ``X``. Must contain the positive marker (``label_pos``) and the
unlabeled marker (``label_unl``); pre-labeled negatives (``label_neg``) are optional. By
default positives are ``1`` and unlabeled are ``2``; set ``label_unl=0`` to pass the standard
``{0, 1}`` encoding directly (``0`` = unlabeled, ``1`` = positive).
X_pos : array-like, shape (n_pos, n_features), optional
Feature matrix of the positive samples (split-input mode). Provided together with
``X_unlabeled`` instead of ``X`` + ``labels``; the two are stacked and marked internally
(positives ``label_pos``, unlabeled ``label_unl``), so no manual label vector is needed.
X_unlabeled : array-like, shape (n_unl, n_features), optional
Feature matrix of the unlabeled candidate pool (split-input mode). Must have the same number
of features as ``X_pos``. After fitting, :attr:`dPULearn.mask_neg_` is a boolean mask over its
rows marking the identified reliable negatives.
label_pos : int, default=1
Value marking positive samples in ``labels``. Must be present.
label_unl : int, default=2
Expand Down Expand Up @@ -322,6 +349,24 @@ def fit(self,
--------
.. include:: examples/dpul_fit.rst
"""
# Resolve the input mode: (X, labels) or the positives/unlabeled split. In the split
# mode, stack X_pos over X_unlabeled and build the label vector internally with the
# package markers, so the caller does not hand-roll the vstack + 1/2 vector + slice.
split_mode = X_pos is not None or X_unlabeled is not None
n_pos = None
if split_mode:
if X is not None or labels is not None:
raise ValueError("Pass either 'X'/'labels' or 'X_pos'/'X_unlabeled', not both.")
if X_pos is None or X_unlabeled is None:
raise ValueError("'X_pos' and 'X_unlabeled' must both be given for the split-input mode.")
X_pos = ut.check_X(X=X_pos, X_name="X_pos", min_n_samples=1)
X_unlabeled = ut.check_X(X=X_unlabeled, X_name="X_unlabeled", min_n_samples=1)
check_match_X_pos_X_unlabeled(X_pos=X_pos, X_unlabeled=X_unlabeled)
n_pos = X_pos.shape[0]
X = np.vstack([X_pos, X_unlabeled])
labels = np.array([label_pos] * n_pos + [label_unl] * X_unlabeled.shape[0])
elif X is None or labels is None:
raise ValueError("'X' and 'labels' are required (or pass 'X_pos' + 'X_unlabeled').")
# Check input
X = ut.check_X(X=X)
check_match_labels_markers(label_pos=label_pos, label_unl=label_unl, label_neg=label_neg)
Expand Down Expand Up @@ -353,9 +398,11 @@ def fit(self,
# Identify most far away negatives in PCA compressed feature space
else:
new_labels, df_pu = get_neg_via_pca(**args, n_components=n_components, **self._model_kwargs)
# Set new labels
# Set new labels + the reliable-negative mask. In the split-input mode the mask is over
# the rows of X_unlabeled (True = mined reliable negative); otherwise over all rows of X.
self.labels_ = np.asarray(new_labels)
self.df_pu_ = df_pu
self.mask_neg_ = self.labels_[n_pos:] == 0 if n_pos is not None else self.labels_ == 0
return self

@staticmethod
Expand Down
2 changes: 2 additions & 0 deletions docs/_cheatsheet/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@
("Load benchmark sequences", "load_dataset(name) → df_seq", None),
("Load AAontology scales", "load_scales() → df_scales", None),
("Load precomputed features", "load_features(name) → df_feat", None),
("Binary labels from df column", "get_labels(df, positive_label) → labels", None, "v1.1"),
("Read / write FASTA", "read_fasta(file) → df_seq", None),
("Cluster redundant homologs", "filter_seq(df_seq) → df_clust [pro]", None),
]},
Expand Down Expand Up @@ -221,6 +222,7 @@
{"name": "Modeling & Explainability", "tag": "PU · classify · SHAP",
"rows": [
("Train with positives + unlabeled data", "dPULearn().fit(X, labels) [Wrapper]", None),
("Mine reliable negatives (mask)", "dPULearn().fit(X_pos=, X_unlabeled=).mask_neg_ → mask", None, "v1.1"),
("Train + RFE + MC importance", "TreeModel().fit(X, labels) [Wrapper]", None),
("Per-feature / sample SHAP impact", "ShapModel().fit(X, labels) [pro]", None),
]},
Expand Down
19 changes: 19 additions & 0 deletions docs/source/index/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ Added
per-residue PTM and functional-site annotations and encodes them into tensors
(``fetch_uniprot``, ``ingest``, ``register_feature``, ``encode``, ``build_scales``,
``build_cat``, ``to_df_seq``).
- :func:`~aaanalysis.combine_dict_nums`: Concatenates per-residue tensors (embedding / structure /
  annotation) along the feature axis into one combined :meth:`~aaanalysis.CPP.run_num` input.
- :func:`~aaanalysis.get_labels`: Derives a binary ``int`` label vector from a sequence DataFrame's
  label column (``positive_label`` mapped to ``1``, everything else to ``0``) — the
  single-call form of the recurring ``(df[col] == x).astype(int).to_numpy()`` expression.

Expand Down Expand Up @@ -132,6 +137,16 @@ Added
switches the pre-computed prediction per P1 (feature map + structure restyle) with no kernel,
keeping the column-residue linking (warned past 40 sites, hard-capped at 200).

**PU Learning**

- **dPULearn.fit — positives/unlabeled split input**: for the common positive / unlabeled
setup, ``fit`` now accepts ``X_pos`` and ``X_unlabeled`` separately (an alternative to
``X`` + ``labels``) instead of stacking them by hand and building a ``1`` / ``2`` label
vector. After fitting, the new ``dPULearn.mask_neg_`` attribute holds the **boolean mask
of reliable negatives** — over the rows of ``X_unlabeled`` in the split mode, over ``X``
otherwise (equal to the manual ``labels_[len(X_pos):] == 0`` result exactly). ``fit`` still
returns ``self`` and the existing ``fit(X, labels=...)`` path is unchanged.

**Sequence Analysis**

- :class:`~aaanalysis.AAWindowSampler`: Samples fixed-length sequence windows for PU-learning and
Expand Down Expand Up @@ -191,6 +206,10 @@ Added

- :func:`~aaanalysis.plot_rank`: Standalone per-protein max-score-vs-rank scatter with group coloring and
optional threshold lines (pairs with the new ``aa.metrics`` functions).
- **COLOR_SAMPLES_POS / COLOR_SAMPLES_NEG / COLOR_SAMPLES_UNL / COLOR_SAMPLES_REL_NEG**:
Public, named constants for the canonical sample-group colors (positive / negative /
unlabeled / reliable-negative). They equal the ``plot_get_cdict("DICT_COLOR")["SAMPLES_*"]``
values exactly, so a named constant replaces indexing the color dict by string key.

**Golden Pipelines**

Expand Down
Loading
Loading