Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 37 additions & 24 deletions aaanalysis/feature_engineering/_backend/check_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,21 +27,32 @@ def _get_max_pos_split(split=None):
return n_max


def _get_max_pos_split_kws(split_kws=None):
"""Get maximum position required for splits basd on split_kws"""
list_n_max = []
def _get_split_kws_requirements(split_kws=None):
"""Get per-split-type minimum part length required by split_kws.

Each split type imposes a minimum length on every sequence part: a ``Segment``
needs at least ``n_split_max`` residues (it cannot be split into more pieces than
it has residues), a ``Pattern`` at least ``len_max`` residues, and a
``PeriodicPattern`` at least its first step (``steps[0]``). Returns a dict
``{split_type: (required_len, param_name)}`` so callers can name the exact
parameter that binds the requirement.
"""
reqs = {}
if ut.STR_SEGMENT in split_kws:
n_max = split_kws[ut.STR_SEGMENT]["n_split_max"]
list_n_max.append(n_max)
reqs[ut.STR_SEGMENT] = (split_kws[ut.STR_SEGMENT]["n_split_max"], "n_split_max")
if ut.STR_PATTERN in split_kws:
n_max = split_kws[ut.STR_PATTERN]["len_max"]
list_n_max.append(n_max)
reqs[ut.STR_PATTERN] = (split_kws[ut.STR_PATTERN]["len_max"], "len_max")
if ut.STR_PERIODIC_PATTERN in split_kws:
n_max = split_kws[ut.STR_PERIODIC_PATTERN]["steps"][0]
list_n_max.append(n_max)
if len(list_n_max) == 0:
reqs[ut.STR_PERIODIC_PATTERN] = (split_kws[ut.STR_PERIODIC_PATTERN]["steps"][0], "steps[0]")
return reqs


def _get_max_pos_split_kws(split_kws=None):
"""Get maximum position required for splits basd on split_kws"""
reqs = _get_split_kws_requirements(split_kws=split_kws)
if len(reqs) == 0:
raise ValueError(f"Wrong 'split_kws' ({split_kws})")
n_max = max(list_n_max)
n_max = max(val for val, _ in reqs.values())
return n_max


Expand Down Expand Up @@ -374,20 +385,22 @@ def check_match_df_parts_features(df_parts=None, features=None):
def check_match_df_parts_split_kws(df_parts=None, split_kws=None):
"""Check if df_parts and split_kws match regarding the sequence size"""
n_max = _get_max_pos_split_kws(split_kws=split_kws)
reqs = _get_split_kws_requirements(split_kws=split_kws)
# Name the split type(s) whose required length binds 'n_max' so the message can point at the
# exact parameter to lower (or split type to drop) rather than only reporting the number.
driver = "/".join(f"{st} ({param}={val})" for st, (val, param) in reqs.items() if val == n_max)
for part in list(df_parts):
if any(df_parts[part.lower()].map(len) < n_max):
mask = df_parts[part.lower()].map(len) < n_max
list_seq = df_parts[mask][part.lower()].to_list()
if len(list_seq) == 1:
seq = list_seq[0]
raise ValueError(
f"'{part}' part contains too short sequence ('{seq}', n={len(seq)})"
f"\n for '{split_kws}' split_kws (n_max={n_max})")
else:
seq = list_seq[0]
raise ValueError(
f"For split_kws (n_max={n_max}): '{split_kws}',"
f"\n following '{part}' part contains too short sequences (e.g., '{seq}', n={len(seq)}).")
lengths = df_parts[part.lower()].map(len)
if any(lengths < n_max):
list_seq = df_parts[lengths < n_max][part.lower()].to_list()
seq = list_seq[0]
count = "a sequence" if len(list_seq) == 1 else f"{len(list_seq)} sequences"
raise ValueError(
f"'{part}' part contains {count} too short (e.g. '{seq}', n={len(seq)}) for the "
f"{driver} split length (n_max={n_max})."
f"\n For free peptides with no flanking context, use Segment-only splits or reduce "
f"'len_max'/'n_split_max' (via SequenceFeature.get_split_kws),"
f"\n or add flanking context (jmd_n/jmd_c).")


# Check df_scales & df_cat
Expand Down
84 changes: 78 additions & 6 deletions aaanalysis/pipe/_find_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
more models.
"""
from typing import Optional, List, Tuple, Union, Dict
import warnings
import numpy as np
import pandas as pd
from matplotlib.axes import Axes
Expand Down Expand Up @@ -79,9 +80,56 @@
(ut.STR_SEGMENT, ut.STR_PERIODIC_PATTERN): "p2",
(ut.STR_SEGMENT, ut.STR_PATTERN, ut.STR_PERIODIC_PATTERN): "p1+p2"}
# Levers a power user may pin via the bounded ``kws`` dict (unknown keys raise).
_KWS_KEYS = {"n_explain", "n_split_max", "n_filter", "n_jmd", "simplify_strategy",
_KWS_KEYS = {"n_explain", "n_split_max", "n_filter", "n_jmd", "len_max", "simplify_strategy",
"max_cor", "max_overlap"}
_LIST_MODELS = [ut.MODEL_SVM, ut.MODEL_RF, ut.MODEL_LOG_REG]
# The first (default) PeriodicPattern step; a part must be at least this long to carry one.
_PERIODIC_STEP0 = 3


def _split_kws_for(split_types=None, n_split_max=15, len_max=15):
"""Build ``split_kws`` from the frontend ``SequenceFeature.get_split_kws``.

Threads the Segment ``n_split_max`` and Pattern ``len_max`` levers through the public
front door so ``find_features`` can request shorter splits (needed for free peptides).
"""
return SequenceFeature.get_split_kws(split_types=list(split_types),
n_split_max=n_split_max, len_max=len_max)


def _fit_split_kws_to_parts(split_types=None, n_split_max=15, len_max=15, df_parts=None):
"""Adapt the requested split config to the shortest sequence part (free-peptide safety net).

A part of length ``L`` can only carry a ``Segment`` with ``n_split_max <= L``, a ``Pattern``
with ``len_max <= L``, and a ``PeriodicPattern`` with ``steps[0] (=3) <= L``. When the shortest
part is too short for the requested config, ``Pattern`` / ``PeriodicPattern`` are dropped and the
``Segment`` ``n_split_max`` is clamped so the run still works (``Segment``-only at minimum). A
single ``UserWarning`` names what changed. For parts long enough for the requested config nothing
is dropped or clamped, so the built ``split_kws`` is byte-identical to the requested one.
"""
min_len = int(min(df_parts[c].map(len).min() for c in df_parts.columns))
kept, changes = list(split_types), []
if ut.STR_PERIODIC_PATTERN in kept and min_len < _PERIODIC_STEP0:
kept.remove(ut.STR_PERIODIC_PATTERN)
changes.append(f"dropped 'PeriodicPattern' (needs >= {_PERIODIC_STEP0} residues)")
if ut.STR_PATTERN in kept and min_len < len_max:
kept.remove(ut.STR_PATTERN)
changes.append(f"dropped 'Pattern' (len_max={len_max} > shortest part n={min_len})")
fitted_n_split_max = n_split_max
if ut.STR_SEGMENT not in kept:
# Never leave zero split types; Segment is the universal fallback (works down to n=1).
kept.insert(0, ut.STR_SEGMENT)
if n_split_max > min_len:
fitted_n_split_max = min_len
changes.append(f"clamped Segment 'n_split_max' {n_split_max} -> {min_len}")
split_kws = _split_kws_for(split_types=kept, n_split_max=fitted_n_split_max, len_max=len_max)
if changes:
warnings.warn(
f"'find_features': the shortest sequence part (n={min_len}) is too short for the "
f"requested splits; {'; '.join(changes)}. This keeps the run working on free peptides / "
f"short parts. Set 'kws' (n_split_max / len_max / n_jmd) or add flanking context to "
f"control this.", UserWarning)
return split_kws


def _resolve_model(model, random_state=None):
Expand Down Expand Up @@ -158,6 +206,9 @@ def _resolve_config(search="balanced", kws=None):
"n_jmd_vals": list(mode["n_jmd_vals"]),
"simplify_strategy": mode["simplify_strategy"],
"max_cor": 0.5, "max_overlap": 0.5,
# Pattern span (default 15 = the SequenceFeature.get_split_kws default). A power user lowers
# it via kws["len_max"] to request shorter Pattern splits on short / free-peptide parts.
"len_max": 15,
}
if kws is not None:
ut.check_dict(name="kws", val=kws)
Expand All @@ -169,6 +220,8 @@ def _resolve_config(search="balanced", kws=None):
cfg["sweep_scales"] = False
if "n_split_max" in kws:
cfg["n_split_max_vals"] = [kws["n_split_max"]]
if "len_max" in kws:
cfg["len_max"] = kws["len_max"]
if "n_filter" in kws:
cfg["n_filter_vals"] = [kws["n_filter"]]
if "n_jmd" in kws:
Expand Down Expand Up @@ -286,8 +339,13 @@ def find_features(labels: ut.ArrayLike1D,
Cross-validation scoring metric(s). A list triggers multi-objective Pareto selection.
kws : dict, optional
Bounded power-user overrides; each pins a swept lever to a single value (unknown keys raise).
Recognized keys: ``n_explain``, ``n_split_max``, ``n_filter``, ``n_jmd`` (the symmetric JMD
length ``jmd_n_len = jmd_c_len``), ``simplify_strategy``, ``max_cor``, ``max_overlap``.
Recognized keys: ``n_explain``, ``n_split_max`` (max ``Segment`` splits), ``len_max`` (max
``Pattern`` span), ``n_filter``, ``n_jmd`` (the symmetric JMD length ``jmd_n_len =
jmd_c_len``), ``simplify_strategy``, ``max_cor``, ``max_overlap``. For **free peptides / short
parts** (no flanking context), pass ``kws={"n_jmd": 0}`` so no JMD is carved out; the split
config then auto-fits to the shortest part (``Pattern`` / ``PeriodicPattern`` are dropped and
``n_split_max`` is clamped, with a ``UserWarning``). Lower ``n_split_max`` / ``len_max``
yourself to control which splits are used.
subcategories : list of str, optional
AAontology subcategories to restrict the scale sets to. If ``None``, all scales of the grade.
top_n : int, optional
Expand Down Expand Up @@ -416,7 +474,15 @@ def _run_fast(sf=None, labels=None, df_seq=None, cfg=None, simplify=True, models
df_scales = _load_scale_spec(spec, subcategories=subcategories)
if df_scales is None:
raise ValueError(f"'subcategories' ({subcategories}) should be names that match a scale.")
cpp = CPP(df_parts=df_parts, df_scales=df_scales, random_state=random_state, verbose=verbose)
# Thread the requested Segment n_split_max / Pattern len_max through to the split_kws (was
# always the default before, silently ignoring kws), and auto-fit to the shortest part so a
# free peptide / short part drops Pattern-type splits + clamps n_split_max instead of hard
# erroring. For long-enough parts this is byte-identical to the default split_kws.
split_kws = _fit_split_kws_to_parts(split_types=split_types,
n_split_max=cfg["n_split_max_vals"][0],
len_max=cfg["len_max"], df_parts=df_parts)
cpp = CPP(df_parts=df_parts, df_scales=df_scales, split_kws=split_kws,
random_state=random_state, verbose=verbose)
df_feat = cpp.run(labels=labels, label_test=label_test, label_ref=label_ref,
n_filter=cfg["n_filter_vals"][0], max_cor=cfg["max_cor"],
max_overlap=cfg["max_overlap"], n_jobs=n_jobs)
Expand Down Expand Up @@ -459,7 +525,8 @@ def _grid_stage(sf=None, df_seq=None, parts=None, split_sets=None, n_split_vals=
raise ValueError(f"'subcategories' ({subcategories}) should match at least one scale.")
params_parts = {"list_parts": [list(p) for p in parts],
"jmd_n_len": list(n_jmd_vals), "jmd_c_len": list(n_jmd_vals)}
params_split = {"split_types": [list(s) for s in split_sets], "n_split_max": list(n_split_vals)}
params_split = {"split_types": [list(s) for s in split_sets], "n_split_max": list(n_split_vals),
"len_max": cfg["len_max"]}
params_cpp = {"n_filter": list(n_filters), "label_test": label_test, "label_ref": label_ref,
"max_cor": cfg["max_cor"], "max_overlap": cfg["max_overlap"]}
cppg = CPPGrid(df_seq=df_seq, labels=labels, random_state=random_state, verbose=verbose,
Expand Down Expand Up @@ -556,7 +623,12 @@ def _run_search(sf=None, labels=None, df_seq=None, cfg=None, simplify=True, mode
base = _cv_scores(X_win, labels, models=models, cv=cv, metrics=metrics, random_state=random_state)
rows3 = []
if simplify:
cpp_win = CPP(df_parts=df_parts_win, df_scales=df_scales_win,
# Rebuild the winner's split_kws (Segment n_split_max / Pattern len_max) so the CPP used for
# simplify validates against short / free-peptide parts. simplify operates on the existing
# df_feat (it never reads split_kws), so this does not change the result for normal parts.
split_kws_win = _split_kws_for(split_types=win2["split_types"],
n_split_max=win2["n_split_max"], len_max=cfg["len_max"])
cpp_win = CPP(df_parts=df_parts_win, df_scales=df_scales_win, split_kws=split_kws_win,
random_state=random_state, verbose=verbose)
df_simpl = cpp_win.simplify(df_feat=df_feat_win, labels=labels,
strategy=cfg["simplify_strategy"], ml_cv=cv,
Expand Down
17 changes: 17 additions & 0 deletions docs/source/index/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,23 @@ Changed
full-path import such as ``from aaanalysis.protein_design import SeqMut`` must become
``from aaanalysis.protein_engineering import SeqMut``.

Fixed
~~~~~

- **CPP splits on free peptides / short parts (#338)**: ``aap.find_features`` and the
``Pattern`` / ``PeriodicPattern`` splits were unusable on free peptides with no flanking
context (the linear-epitope case). ``find_features(search="fast")`` and its Stage-3
simplify step ignored the requested / winning split configuration and always used the
default (``len_max=15``, ``n_split_max=15``), so any target region shorter than ~15
residues raised. The bounded ``kws`` dict now accepts ``len_max`` (and actually honors
``n_split_max``) so shorter ``Pattern`` / ``Segment`` splits can be requested; the fast
path auto-fits the split configuration to the shortest part — dropping
``Pattern`` / ``PeriodicPattern`` and clamping ``n_split_max`` with a ``UserWarning`` — so
free peptides run out of the box; and the too-short-part ``ValueError`` now names the
binding split length and how to fix it (Segment-only splits, lower
``len_max`` / ``n_split_max``, or add ``jmd_n`` / ``jmd_c`` context). Results for flanked
inputs are unchanged.


Version 1.0 (Stable Version)
--------------------------------
Expand Down
36 changes: 36 additions & 0 deletions tests/unit/cpp_tests/test_check_feature_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,42 @@ def test_match_df_parts_split_kws_too_short(self):
check_match_df_parts_split_kws(
df_parts=self._df_parts(["AC"]), split_kws=kws)

def test_match_df_parts_split_kws_message_is_actionable(self):
# #338: a free peptide too short for the default splits must get a message that states the
# real cause (which split length binds n_max) AND how to fix it.
kws = {"Segment": {"n_split_min": 1, "n_split_max": 15},
"Pattern": {"len_max": 15, "n_max": 4, "n_min": 2, "steps": [3, 4]},
"PeriodicPattern": {"steps": [3, 4]}}
with pytest.raises(ValueError) as exc:
check_match_df_parts_split_kws(df_parts=self._df_parts(["PQFTIFGT"]), split_kws=kws)
msg = str(exc.value)
# Cause: names the binding split type + its parameter and the offending length.
assert "n_max=15" in msg and "len_max=15" in msg and "n=8" in msg
# Fix: points at the concrete remedies.
assert "Segment-only" in msg
assert "len_max" in msg and "n_split_max" in msg
assert "jmd_n" in msg and "jmd_c" in msg

def test_match_df_parts_split_kws_names_binding_split_type(self):
# Segment n_split_max is the sole binding requirement here -> named as the driver.
kws = {"Segment": {"n_split_min": 1, "n_split_max": 12}}
with pytest.raises(ValueError, match=r"Segment \(n_split_max=12\)"):
check_match_df_parts_split_kws(df_parts=self._df_parts(["ACDEF"]), split_kws=kws)

def test_match_df_parts_split_kws_segment_only_short_ok(self):
# Positive: Segment-only with n_split_max <= part length passes (free-peptide path).
kws = {"Segment": {"n_split_min": 1, "n_split_max": 8}}
assert check_match_df_parts_split_kws(
df_parts=self._df_parts(["PQFTIFGT", "AIVMWFLL"]), split_kws=kws) is None

def test_match_df_parts_split_kws_reduced_len_max_ok(self):
# Positive: reducing len_max/n_split_max lets short parts pass with all split types.
kws = {"Segment": {"n_split_min": 1, "n_split_max": 8},
"Pattern": {"len_max": 8, "n_max": 4, "n_min": 2, "steps": [3, 4]},
"PeriodicPattern": {"steps": [3, 4]}}
assert check_match_df_parts_split_kws(
df_parts=self._df_parts(["PQFTIFGT", "AIVMWFLL"]), split_kws=kws) is None

def test_match_df_parts_df_scales_missing_char(self):
# 'B' is not a canonical AA in df_scales index -> missing char, no gaps
order = "ACDEFGHIKLMNPQRSTVWY"
Expand Down
Loading
Loading