From b755e5c7315004bbff884db8d4e6250853faf1b5 Mon Sep 17 00:00:00 2001 From: Stephan Breimann Date: Sat, 4 Jul 2026 16:11:44 +0200 Subject: [PATCH] fix(cpp): make CPP splits usable on free peptides with no flanks (#338) Pattern/PeriodicPattern splits and aap.find_features were unusable on free peptides with no flanking context (linear-epitope case): the default split config requires each part to be >= 15 residues, so any target region shorter than ~15 aa raised an opaque "too short" ValueError. - Actionable error: check_match_df_parts_split_kws now names the binding split length (which split type / parameter drives n_max) and the concrete fix (Segment-only splits, lower len_max/n_split_max, or add jmd_n/jmd_c context). - Honor kws: find_features 'kws' now accepts 'len_max' and actually threads 'n_split_max'/'len_max' into the split config for the fast path (previously ignored) and the CPPGrid search stages, so shorter Pattern/Segment splits can be requested. - Auto-fit (fast path): the split config auto-fits to the shortest part, dropping Pattern/PeriodicPattern and clamping n_split_max with a UserWarning, so free peptides run out of the box. Byte-identical when parts are long enough. - Search Stage 3: the simplify CPP now uses the winner's split_kws instead of the default, which previously hard-errored on short parts even though the grid had gracefully soft-dropped the non-fitting configs. Results for flanked inputs are unchanged (byte-identical). Adds unit tests for the actionable message, Segment-only / reduced-len_max short-part paths, the auto-fit helper (drop/clamp + warning), and free-peptide fast/balanced runs. 
Co-Authored-By: Claude Fable 5 --- .../_backend/check_feature.py | 61 +++++++----- aaanalysis/pipe/_find_features.py | 84 ++++++++++++++-- docs/source/index/release_notes.rst | 17 ++++ .../cpp_tests/test_check_feature_backend.py | 36 +++++++ .../unit/pipe_tests/test_aap_find_features.py | 96 ++++++++++++++++++- 5 files changed, 263 insertions(+), 31 deletions(-) diff --git a/aaanalysis/feature_engineering/_backend/check_feature.py b/aaanalysis/feature_engineering/_backend/check_feature.py index e2f8f314..f0e65ef3 100644 --- a/aaanalysis/feature_engineering/_backend/check_feature.py +++ b/aaanalysis/feature_engineering/_backend/check_feature.py @@ -27,21 +27,32 @@ def _get_max_pos_split(split=None): return n_max -def _get_max_pos_split_kws(split_kws=None): - """Get maximum position required for splits basd on split_kws""" - list_n_max = [] +def _get_split_kws_requirements(split_kws=None): + """Get per-split-type minimum part length required by split_kws. + + Each split type imposes a minimum length on every sequence part: a ``Segment`` + needs at least ``n_split_max`` residues (it cannot be split into more pieces than + it has residues), a ``Pattern`` at least ``len_max`` residues, and a + ``PeriodicPattern`` at least its first step (``steps[0]``). Returns a dict + ``{split_type: (required_len, param_name)}`` so callers can name the exact + parameter that binds the requirement. 
+ """ + reqs = {} if ut.STR_SEGMENT in split_kws: - n_max = split_kws[ut.STR_SEGMENT]["n_split_max"] - list_n_max.append(n_max) + reqs[ut.STR_SEGMENT] = (split_kws[ut.STR_SEGMENT]["n_split_max"], "n_split_max") if ut.STR_PATTERN in split_kws: - n_max = split_kws[ut.STR_PATTERN]["len_max"] - list_n_max.append(n_max) + reqs[ut.STR_PATTERN] = (split_kws[ut.STR_PATTERN]["len_max"], "len_max") if ut.STR_PERIODIC_PATTERN in split_kws: - n_max = split_kws[ut.STR_PERIODIC_PATTERN]["steps"][0] - list_n_max.append(n_max) - if len(list_n_max) == 0: + reqs[ut.STR_PERIODIC_PATTERN] = (split_kws[ut.STR_PERIODIC_PATTERN]["steps"][0], "steps[0]") + return reqs + + +def _get_max_pos_split_kws(split_kws=None): + """Get maximum position required for splits based on split_kws""" + reqs = _get_split_kws_requirements(split_kws=split_kws) + if len(reqs) == 0: raise ValueError(f"Wrong 'split_kws' ({split_kws})") - n_max = max(list_n_max) + n_max = max(val for val, _ in reqs.values()) return n_max @@ -374,20 +385,22 @@ def check_match_df_parts_features(df_parts=None, features=None): def check_match_df_parts_split_kws(df_parts=None, split_kws=None): """Check if df_parts and split_kws match regarding the sequence size""" n_max = _get_max_pos_split_kws(split_kws=split_kws) + reqs = _get_split_kws_requirements(split_kws=split_kws) + # Name the split type(s) whose required length binds 'n_max' so the message can point at the + # exact parameter to lower (or split type to drop) rather than only reporting the number. 
+ driver = "/".join(f"{st} ({param}={val})" for st, (val, param) in reqs.items() if val == n_max) for part in list(df_parts): - if any(df_parts[part.lower()].map(len) < n_max): - mask = df_parts[part.lower()].map(len) < n_max - list_seq = df_parts[mask][part.lower()].to_list() - if len(list_seq) == 1: - seq = list_seq[0] - raise ValueError( - f"'{part}' part contains too short sequence ('{seq}', n={len(seq)})" - f"\n for '{split_kws}' split_kws (n_max={n_max})") - else: - seq = list_seq[0] - raise ValueError( - f"For split_kws (n_max={n_max}): '{split_kws}'," - f"\n following '{part}' part contains too short sequences (e.g., '{seq}', n={len(seq)}).") + lengths = df_parts[part.lower()].map(len) + if any(lengths < n_max): + list_seq = df_parts[lengths < n_max][part.lower()].to_list() + seq = list_seq[0] + count = "a sequence" if len(list_seq) == 1 else f"{len(list_seq)} sequences" + raise ValueError( + f"'{part}' part contains {count} too short (e.g. '{seq}', n={len(seq)}) for the " + f"{driver} split length (n_max={n_max})." + f"\n For free peptides with no flanking context, use Segment-only splits or reduce " + f"'len_max'/'n_split_max' (via SequenceFeature.get_split_kws)," + f"\n or add flanking context (jmd_n/jmd_c).") # Check df_scales & df_cat diff --git a/aaanalysis/pipe/_find_features.py b/aaanalysis/pipe/_find_features.py index c4f578fb..cbf900f7 100644 --- a/aaanalysis/pipe/_find_features.py +++ b/aaanalysis/pipe/_find_features.py @@ -8,6 +8,7 @@ more models. """ from typing import Optional, List, Tuple, Union, Dict +import warnings import numpy as np import pandas as pd from matplotlib.axes import Axes @@ -79,9 +80,56 @@ (ut.STR_SEGMENT, ut.STR_PERIODIC_PATTERN): "p2", (ut.STR_SEGMENT, ut.STR_PATTERN, ut.STR_PERIODIC_PATTERN): "p1+p2"} # Levers a power user may pin via the bounded ``kws`` dict (unknown keys raise). 
-_KWS_KEYS = {"n_explain", "n_split_max", "n_filter", "n_jmd", "simplify_strategy", +_KWS_KEYS = {"n_explain", "n_split_max", "n_filter", "n_jmd", "len_max", "simplify_strategy", "max_cor", "max_overlap"} _LIST_MODELS = [ut.MODEL_SVM, ut.MODEL_RF, ut.MODEL_LOG_REG] +# The first (default) PeriodicPattern step; a part must be at least this long to carry one. +_PERIODIC_STEP0 = 3 + + +def _split_kws_for(split_types=None, n_split_max=15, len_max=15): + """Build ``split_kws`` from the frontend ``SequenceFeature.get_split_kws``. + + Threads the Segment ``n_split_max`` and Pattern ``len_max`` levers through the public + front door so ``find_features`` can request shorter splits (needed for free peptides). + """ + return SequenceFeature.get_split_kws(split_types=list(split_types), + n_split_max=n_split_max, len_max=len_max) + + +def _fit_split_kws_to_parts(split_types=None, n_split_max=15, len_max=15, df_parts=None): + """Adapt the requested split config to the shortest sequence part (free-peptide safety net). + + A part of length ``L`` can only carry a ``Segment`` with ``n_split_max <= L``, a ``Pattern`` + with ``len_max <= L``, and a ``PeriodicPattern`` with ``steps[0] (=3) <= L``. When the shortest + part is too short for the requested config, ``Pattern`` / ``PeriodicPattern`` are dropped and the + ``Segment`` ``n_split_max`` is clamped so the run still works (``Segment``-only at minimum). A + single ``UserWarning`` names what changed. For parts long enough for the requested config nothing + is dropped or clamped, so the built ``split_kws`` is byte-identical to the requested one. 
+ """ + min_len = int(min(df_parts[c].map(len).min() for c in df_parts.columns)) + kept, changes = list(split_types), [] + if ut.STR_PERIODIC_PATTERN in kept and min_len < _PERIODIC_STEP0: + kept.remove(ut.STR_PERIODIC_PATTERN) + changes.append(f"dropped 'PeriodicPattern' (needs >= {_PERIODIC_STEP0} residues)") + if ut.STR_PATTERN in kept and min_len < len_max: + kept.remove(ut.STR_PATTERN) + changes.append(f"dropped 'Pattern' (len_max={len_max} > shortest part n={min_len})") + fitted_n_split_max = n_split_max + if ut.STR_SEGMENT not in kept: + # Never leave zero split types; Segment is the universal fallback (works down to n=1). + kept.insert(0, ut.STR_SEGMENT) + if n_split_max > min_len: + fitted_n_split_max = min_len + changes.append(f"clamped Segment 'n_split_max' {n_split_max} -> {min_len}") + split_kws = _split_kws_for(split_types=kept, n_split_max=fitted_n_split_max, len_max=len_max) + if changes: + warnings.warn( + f"'find_features': the shortest sequence part (n={min_len}) is too short for the " + f"requested splits; {'; '.join(changes)}. This keeps the run working on free peptides / " + f"short parts. Set 'kws' (n_split_max / len_max / n_jmd) or add flanking context to " + f"control this.", UserWarning) + return split_kws def _resolve_model(model, random_state=None): @@ -158,6 +206,9 @@ def _resolve_config(search="balanced", kws=None): "n_jmd_vals": list(mode["n_jmd_vals"]), "simplify_strategy": mode["simplify_strategy"], "max_cor": 0.5, "max_overlap": 0.5, + # Pattern span (default 15 = the SequenceFeature.get_split_kws default). A power user lowers + # it via kws["len_max"] to request shorter Pattern splits on short / free-peptide parts. 
+ "len_max": 15, } if kws is not None: ut.check_dict(name="kws", val=kws) @@ -169,6 +220,8 @@ def _resolve_config(search="balanced", kws=None): cfg["sweep_scales"] = False if "n_split_max" in kws: cfg["n_split_max_vals"] = [kws["n_split_max"]] + if "len_max" in kws: + cfg["len_max"] = kws["len_max"] if "n_filter" in kws: cfg["n_filter_vals"] = [kws["n_filter"]] if "n_jmd" in kws: @@ -286,8 +339,13 @@ def find_features(labels: ut.ArrayLike1D, Cross-validation scoring metric(s). A list triggers multi-objective Pareto selection. kws : dict, optional Bounded power-user overrides; each pins a swept lever to a single value (unknown keys raise). - Recognized keys: ``n_explain``, ``n_split_max``, ``n_filter``, ``n_jmd`` (the symmetric JMD - length ``jmd_n_len = jmd_c_len``), ``simplify_strategy``, ``max_cor``, ``max_overlap``. + Recognized keys: ``n_explain``, ``n_split_max`` (max ``Segment`` splits), ``len_max`` (max + ``Pattern`` span), ``n_filter``, ``n_jmd`` (the symmetric JMD length ``jmd_n_len = + jmd_c_len``), ``simplify_strategy``, ``max_cor``, ``max_overlap``. For **free peptides / short + parts** (no flanking context), pass ``kws={"n_jmd": 0}`` so no JMD is carved out; the split + config then auto-fits to the shortest part (``Pattern`` / ``PeriodicPattern`` are dropped and + ``n_split_max`` is clamped, with a ``UserWarning``). Lower ``n_split_max`` / ``len_max`` + yourself to control which splits are used. subcategories : list of str, optional AAontology subcategories to restrict the scale sets to. If ``None``, all scales of the grade. 
top_n : int, optional @@ -416,7 +474,15 @@ def _run_fast(sf=None, labels=None, df_seq=None, cfg=None, simplify=True, models df_scales = _load_scale_spec(spec, subcategories=subcategories) if df_scales is None: raise ValueError(f"'subcategories' ({subcategories}) should be names that match a scale.") - cpp = CPP(df_parts=df_parts, df_scales=df_scales, random_state=random_state, verbose=verbose) + # Thread the requested Segment n_split_max / Pattern len_max through to the split_kws (was + # always the default before, silently ignoring kws), and auto-fit to the shortest part so a + # free peptide / short part drops Pattern-type splits + clamps n_split_max instead of hard + # erroring. For long-enough parts this is byte-identical to the default split_kws. + split_kws = _fit_split_kws_to_parts(split_types=split_types, + n_split_max=cfg["n_split_max_vals"][0], + len_max=cfg["len_max"], df_parts=df_parts) + cpp = CPP(df_parts=df_parts, df_scales=df_scales, split_kws=split_kws, + random_state=random_state, verbose=verbose) df_feat = cpp.run(labels=labels, label_test=label_test, label_ref=label_ref, n_filter=cfg["n_filter_vals"][0], max_cor=cfg["max_cor"], max_overlap=cfg["max_overlap"], n_jobs=n_jobs) @@ -459,7 +525,8 @@ def _grid_stage(sf=None, df_seq=None, parts=None, split_sets=None, n_split_vals= raise ValueError(f"'subcategories' ({subcategories}) should match at least one scale.") params_parts = {"list_parts": [list(p) for p in parts], "jmd_n_len": list(n_jmd_vals), "jmd_c_len": list(n_jmd_vals)} - params_split = {"split_types": [list(s) for s in split_sets], "n_split_max": list(n_split_vals)} + params_split = {"split_types": [list(s) for s in split_sets], "n_split_max": list(n_split_vals), + "len_max": cfg["len_max"]} params_cpp = {"n_filter": list(n_filters), "label_test": label_test, "label_ref": label_ref, "max_cor": cfg["max_cor"], "max_overlap": cfg["max_overlap"]} cppg = CPPGrid(df_seq=df_seq, labels=labels, random_state=random_state, verbose=verbose, @@ 
-556,7 +623,12 @@ def _run_search(sf=None, labels=None, df_seq=None, cfg=None, simplify=True, mode base = _cv_scores(X_win, labels, models=models, cv=cv, metrics=metrics, random_state=random_state) rows3 = [] if simplify: - cpp_win = CPP(df_parts=df_parts_win, df_scales=df_scales_win, + # Rebuild the winner's split_kws (Segment n_split_max / Pattern len_max) so the CPP used for + # simplify validates against short / free-peptide parts. simplify operates on the existing + # df_feat (it never reads split_kws), so this does not change the result for normal parts. + split_kws_win = _split_kws_for(split_types=win2["split_types"], + n_split_max=win2["n_split_max"], len_max=cfg["len_max"]) + cpp_win = CPP(df_parts=df_parts_win, df_scales=df_scales_win, split_kws=split_kws_win, random_state=random_state, verbose=verbose) df_simpl = cpp_win.simplify(df_feat=df_feat_win, labels=labels, strategy=cfg["simplify_strategy"], ml_cv=cv, diff --git a/docs/source/index/release_notes.rst b/docs/source/index/release_notes.rst index ed7d1583..9b91e2c4 100644 --- a/docs/source/index/release_notes.rst +++ b/docs/source/index/release_notes.rst @@ -363,6 +363,23 @@ Changed full-path import such as ``from aaanalysis.protein_design import SeqMut`` must become ``from aaanalysis.protein_engineering import SeqMut``. +Fixed +~~~~~ + +- **CPP splits on free peptides / short parts (#338)**: ``aap.find_features`` and the + ``Pattern`` / ``PeriodicPattern`` splits were unusable on free peptides with no flanking + context (the linear-epitope case). ``find_features(search="fast")`` and the Stage-3 + simplify step of the staged search ignored the requested / winning split configuration and always used the + default (``len_max=15``, ``n_split_max=15``), so any target region shorter than ~15 + residues raised a ``ValueError``. 
The bounded ``kws`` dict now accepts ``len_max`` (and actually honors + ``n_split_max``) so shorter ``Pattern`` / ``Segment`` splits can be requested; the fast + path auto-fits the split configuration to the shortest part — dropping + ``Pattern`` / ``PeriodicPattern`` and clamping ``n_split_max`` with a ``UserWarning`` — so + free peptides run out of the box; and the too-short-part ``ValueError`` now names the + binding split length and how to fix it (Segment-only splits, lower + ``len_max`` / ``n_split_max``, or add ``jmd_n`` / ``jmd_c`` context). Results for flanked + inputs are unchanged. + Version 1.0 (Stable Version) -------------------------------- diff --git a/tests/unit/cpp_tests/test_check_feature_backend.py b/tests/unit/cpp_tests/test_check_feature_backend.py index 73227a0c..942c73ad 100644 --- a/tests/unit/cpp_tests/test_check_feature_backend.py +++ b/tests/unit/cpp_tests/test_check_feature_backend.py @@ -251,6 +251,42 @@ def test_match_df_parts_split_kws_too_short(self): check_match_df_parts_split_kws( df_parts=self._df_parts(["AC"]), split_kws=kws) + def test_match_df_parts_split_kws_message_is_actionable(self): + # #338: a free peptide too short for the default splits must get a message that states the + # real cause (which split length binds n_max) AND how to fix it. + kws = {"Segment": {"n_split_min": 1, "n_split_max": 15}, + "Pattern": {"len_max": 15, "n_max": 4, "n_min": 2, "steps": [3, 4]}, + "PeriodicPattern": {"steps": [3, 4]}} + with pytest.raises(ValueError) as exc: + check_match_df_parts_split_kws(df_parts=self._df_parts(["PQFTIFGT"]), split_kws=kws) + msg = str(exc.value) + # Cause: names the binding split type + its parameter and the offending length. + assert "n_max=15" in msg and "len_max=15" in msg and "n=8" in msg + # Fix: points at the concrete remedies. 
+ assert "Segment-only" in msg + assert "len_max" in msg and "n_split_max" in msg + assert "jmd_n" in msg and "jmd_c" in msg + + def test_match_df_parts_split_kws_names_binding_split_type(self): + # Segment n_split_max is the sole binding requirement here -> named as the driver. + kws = {"Segment": {"n_split_min": 1, "n_split_max": 12}} + with pytest.raises(ValueError, match=r"Segment \(n_split_max=12\)"): + check_match_df_parts_split_kws(df_parts=self._df_parts(["ACDEF"]), split_kws=kws) + + def test_match_df_parts_split_kws_segment_only_short_ok(self): + # Positive: Segment-only with n_split_max <= part length passes (free-peptide path). + kws = {"Segment": {"n_split_min": 1, "n_split_max": 8}} + assert check_match_df_parts_split_kws( + df_parts=self._df_parts(["PQFTIFGT", "AIVMWFLL"]), split_kws=kws) is None + + def test_match_df_parts_split_kws_reduced_len_max_ok(self): + # Positive: reducing len_max/n_split_max lets short parts pass with all split types. + kws = {"Segment": {"n_split_min": 1, "n_split_max": 8}, + "Pattern": {"len_max": 8, "n_max": 4, "n_min": 2, "steps": [3, 4]}, + "PeriodicPattern": {"steps": [3, 4]}} + assert check_match_df_parts_split_kws( + df_parts=self._df_parts(["PQFTIFGT", "AIVMWFLL"]), split_kws=kws) is None + def test_match_df_parts_df_scales_missing_char(self): # 'B' is not a canonical AA in df_scales index -> missing char, no gaps order = "ACDEFGHIKLMNPQRSTVWY" diff --git a/tests/unit/pipe_tests/test_aap_find_features.py b/tests/unit/pipe_tests/test_aap_find_features.py index cee5dc92..d5160218 100644 --- a/tests/unit/pipe_tests/test_aap_find_features.py +++ b/tests/unit/pipe_tests/test_aap_find_features.py @@ -1,4 +1,5 @@ """This script tests the aaanalysis.pipe.find_features() staged CPP AutoML golden pipeline.""" +import warnings import matplotlib matplotlib.use("Agg") from matplotlib.axes import Axes @@ -10,7 +11,8 @@ import aaanalysis.pipe as aap from aaanalysis.pipe._find_features import (_resolve_config, _resolve_models, 
_load_scale_spec, _cv_scores, _pareto_mask, _axis_impact, _MODES, - _PART_SETS, _SPLIT_TYPE_SETS) + _PART_SETS, _SPLIT_TYPE_SETS, _KWS_KEYS, + _fit_split_kws_to_parts, _split_kws_for) aa.options["verbose"] = False @@ -22,6 +24,13 @@ # kws that shrink a search to a tiny Stage-1 grid (one scale, one n_split) so tests stay fast. SMALL = {"n_explain": 30, "n_split_max": 15} +# #338: short free peptides (linear epitopes) with NO flanking context (8 aa each). +_FREE_SEQS = ["PQFTIFGT", "AIVMWFLL", "GKKRTLSN", "DDECWQPT", "MNPQRSTV", "LLIIVVAA", + "KKRPWWFT", "SSTTNNQQ", "WWYYFFTT", "RRKKHHDD"] +df_seq_free = pd.DataFrame({"entry": [f"P{i}" for i in range(len(_FREE_SEQS))], + "sequence": _FREE_SEQS}) +labels_free = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0] + def _explicit_fast(random_state=0): """The explicit single-CPP chain that find_features(search='fast') mirrors byte-for-byte.""" @@ -328,6 +337,37 @@ def test_fast_ax_eval_empty(self): random_state=0, n_jobs=1) assert isinstance(ax, Axes) and ax.eval == [] + # #338: free peptides with no flanking context must be usable end to end. + def test_fast_free_peptides_auto_fits_and_warns(self): + # search='fast' with n_jmd=0 (no flanks): the split config auto-fits to the short parts + # (Pattern dropped, n_split_max clamped) with a UserWarning, and still returns features. + with pytest.warns(UserWarning, match="too short"): + df_feat, _, df_eval = aap.find_features( + labels_free, df_seq=df_seq_free, search="fast", plot=False, + kws={"n_jmd": 0}, random_state=0, n_jobs=1) + assert len(df_feat) > 0 and len(df_eval) == 1 + + def test_fast_free_peptides_explicit_kws_runs(self): + # The len_max / n_split_max kws are honored (threaded into the split_kws) so a user can + # request shorter Pattern / Segment splits on free peptides; the run returns features. 
+ with warnings.catch_warnings(): + warnings.simplefilter("ignore") + df_feat, _, _ = aap.find_features( + labels_free, df_seq=df_seq_free, search="fast", plot=False, + kws={"n_jmd": 0, "n_split_max": 8, "len_max": 8}, random_state=0, n_jobs=1) + assert len(df_feat) > 0 + + @pytest.mark.slow + def test_balanced_free_peptides_runs(self): + # The staged search reaches Stage 3 (simplify) on free peptides without hard-erroring. + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + df_feat, _, df_eval = aap.find_features( + labels_free, df_seq=df_seq_free, search="balanced", plot=False, + kws={"n_jmd": 0}, random_state=0, n_jobs=1) + assert len(df_feat) > 0 + assert int(df_eval["is_selected"].sum()) == 1 + @pytest.mark.slow def test_balanced_ax_eval_publication_figures(self): from matplotlib.figure import Figure @@ -365,6 +405,60 @@ def test_resolve_config_unknown_kws_raises(self): with pytest.raises(ValueError): _resolve_config(search="balanced", kws={"bogus": 1}) + # #338: len_max lever + auto-fit for free peptides / short parts + def test_kws_keys_includes_len_max(self): + assert "len_max" in _KWS_KEYS + + def test_resolve_config_default_len_max(self): + # Default Pattern span is 15 for every grade (matches SequenceFeature.get_split_kws). + for mode in _MODES: + assert _resolve_config(search=mode)["len_max"] == 15 + + def test_resolve_config_kws_pins_len_max(self): + assert _resolve_config(search="balanced", kws={"len_max": 8})["len_max"] == 8 + + def test_split_kws_for_threads_levers(self): + skw = _split_kws_for(split_types=_SPLIT_TYPE_SETS[-1], n_split_max=6, len_max=7) + assert skw["Segment"]["n_split_max"] == 6 + assert skw["Pattern"]["len_max"] == 7 + + def test_fit_split_kws_long_parts_byte_identical(self): + # Parts long enough for the default config -> no drop / clamp / warning, default split_kws. 
+ df_parts = pd.DataFrame({"tmd": ["ACDEFGHIKLMNPQRSTVWY"] * 3}) + with warnings.catch_warnings(): + warnings.simplefilter("error") # any warning would fail here + skw = _fit_split_kws_to_parts(split_types=_SPLIT_TYPE_SETS[-1], n_split_max=15, + len_max=15, df_parts=df_parts) + assert set(skw) == {"Segment", "Pattern", "PeriodicPattern"} + assert skw["Segment"]["n_split_max"] == 15 and skw["Pattern"]["len_max"] == 15 + + def test_fit_split_kws_short_parts_drops_pattern_and_clamps_segment(self): + df_parts = pd.DataFrame({"tmd": ["PQFTIFGT", "AIVMWFLL"]}) # n=8 + with pytest.warns(UserWarning, match="too short"): + skw = _fit_split_kws_to_parts(split_types=_SPLIT_TYPE_SETS[-1], n_split_max=15, + len_max=15, df_parts=df_parts) + # Pattern (len_max=15 > 8) dropped; Segment kept but clamped to 8; PeriodicPattern (3<=8) kept. + assert "Pattern" not in skw + assert skw["Segment"]["n_split_max"] == 8 + assert "PeriodicPattern" in skw + + def test_fit_split_kws_always_keeps_segment(self): + # Even a 2-residue part keeps a (clamped) Segment so the run never has zero split types. + df_parts = pd.DataFrame({"tmd": ["AC", "DE"]}) + with pytest.warns(UserWarning): + skw = _fit_split_kws_to_parts(split_types=_SPLIT_TYPE_SETS[-1], n_split_max=15, + len_max=15, df_parts=df_parts) + assert "Segment" in skw and skw["Segment"]["n_split_max"] == 2 + + def test_fit_split_kws_no_warn_when_config_fits(self): + # A user who already lowered n_split_max / len_max to fit the parts gets no warning. + df_parts = pd.DataFrame({"tmd": ["PQFTIFGT", "AIVMWFLL"]}) # n=8 + with warnings.catch_warnings(): + warnings.simplefilter("error") + skw = _fit_split_kws_to_parts(split_types=_SPLIT_TYPE_SETS[-1], n_split_max=8, + len_max=8, df_parts=df_parts) + assert set(skw) == {"Segment", "Pattern", "PeriodicPattern"} + def test_resolve_models_list(self): models = _resolve_models(["svm", "rf"], random_state=0) assert len(models) == 2