From b755e5c7315004bbff884db8d4e6250853faf1b5 Mon Sep 17 00:00:00 2001
From: Stephan Breimann <stephanbreimann@gmail.com>
Date: Sat, 4 Jul 2026 16:11:44 +0200
Subject: [PATCH] fix(cpp): make CPP splits usable on free peptides with no
 flanks (#338)

Pattern/PeriodicPattern splits and aap.find_features were unusable on free
peptides with no flanking context (linear-epitope case): the default split
config requires each part to be >= 15 residues, so any target region shorter
than ~15 aa raised an opaque "too short" ValueError.

- Actionable error: check_match_df_parts_split_kws now names the binding split
  length (which split type / parameter drives n_max) and the concrete fix
  (Segment-only splits, lower len_max/n_split_max, or add jmd_n/jmd_c context).
- Honor kws: find_features 'kws' now accepts 'len_max' and actually threads
  'n_split_max'/'len_max' into the split config for the fast path (previously
  ignored) and the CPPGrid search stages, so shorter Pattern/Segment splits can
  be requested.
- Auto-fit (fast path): the split config auto-fits to the shortest part,
  dropping Pattern/PeriodicPattern and clamping n_split_max with a UserWarning,
  so free peptides run out of the box. Byte-identical when parts are long enough.
- Search Stage 3: the simplify CPP now uses the winner's split_kws instead of
  the default, which previously hard-errored on short parts even though the grid
  had gracefully soft-dropped the non-fitting configs.

Results for flanked inputs are unchanged (byte-identical). Adds unit tests for
the actionable message, Segment-only / reduced-len_max short-part paths, the
auto-fit helper (drop/clamp + warning), and free-peptide fast/balanced runs.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .../_backend/check_feature.py                 | 61 +++++++-----
 aaanalysis/pipe/_find_features.py             | 84 ++++++++++++++--
 docs/source/index/release_notes.rst           | 17 ++++
 .../cpp_tests/test_check_feature_backend.py   | 36 +++++++
 .../unit/pipe_tests/test_aap_find_features.py | 96 ++++++++++++++++++-
 5 files changed, 263 insertions(+), 31 deletions(-)

diff --git a/aaanalysis/feature_engineering/_backend/check_feature.py b/aaanalysis/feature_engineering/_backend/check_feature.py
index e2f8f314..f0e65ef3 100644
--- a/aaanalysis/feature_engineering/_backend/check_feature.py
+++ b/aaanalysis/feature_engineering/_backend/check_feature.py
@@ -27,21 +27,32 @@ def _get_max_pos_split(split=None):
     return n_max
 
 
-def _get_max_pos_split_kws(split_kws=None):
-    """Get maximum position required for splits basd on split_kws"""
-    list_n_max = []
+def _get_split_kws_requirements(split_kws=None):
+    """Get per-split-type minimum part length required by split_kws.
+
+    Each split type imposes a minimum length on every sequence part: a ``Segment``
+    needs at least ``n_split_max`` residues (it cannot be split into more pieces than
+    it has residues), a ``Pattern`` at least ``len_max`` residues, and a
+    ``PeriodicPattern`` at least its first step (``steps[0]``). Returns a dict
+    ``{split_type: (required_len, param_name)}`` so callers can name the exact
+    parameter that binds the requirement.
+    """
+    reqs = {}
     if ut.STR_SEGMENT in split_kws:
-        n_max = split_kws[ut.STR_SEGMENT]["n_split_max"]
-        list_n_max.append(n_max)
+        reqs[ut.STR_SEGMENT] = (split_kws[ut.STR_SEGMENT]["n_split_max"], "n_split_max")
     if ut.STR_PATTERN in split_kws:
-        n_max = split_kws[ut.STR_PATTERN]["len_max"]
-        list_n_max.append(n_max)
+        reqs[ut.STR_PATTERN] = (split_kws[ut.STR_PATTERN]["len_max"], "len_max")
     if ut.STR_PERIODIC_PATTERN in split_kws:
-        n_max = split_kws[ut.STR_PERIODIC_PATTERN]["steps"][0]
-        list_n_max.append(n_max)
-    if len(list_n_max) == 0:
+        reqs[ut.STR_PERIODIC_PATTERN] = (split_kws[ut.STR_PERIODIC_PATTERN]["steps"][0], "steps[0]")
+    return reqs
+
+
+def _get_max_pos_split_kws(split_kws=None):
+    """Get maximum position required for splits based on split_kws"""
+    reqs = _get_split_kws_requirements(split_kws=split_kws)
+    if len(reqs) == 0:
         raise ValueError(f"Wrong 'split_kws' ({split_kws})")
-    n_max = max(list_n_max)
+    n_max = max(val for val, _ in reqs.values())
     return n_max
 
 
@@ -374,20 +385,22 @@ def check_match_df_parts_features(df_parts=None, features=None):
 def check_match_df_parts_split_kws(df_parts=None, split_kws=None):
     """Check if df_parts and split_kws match regarding the sequence size"""
     n_max = _get_max_pos_split_kws(split_kws=split_kws)
+    reqs = _get_split_kws_requirements(split_kws=split_kws)
+    # Name the split type(s) whose required length binds 'n_max' so the message can point at the
+    # exact parameter to lower (or split type to drop) rather than only reporting the number.
+    driver = "/".join(f"{st} ({param}={val})" for st, (val, param) in reqs.items() if val == n_max)
     for part in list(df_parts):
-        if any(df_parts[part.lower()].map(len) < n_max):
-            mask = df_parts[part.lower()].map(len) < n_max
-            list_seq = df_parts[mask][part.lower()].to_list()
-            if len(list_seq) == 1:
-                seq = list_seq[0]
-                raise ValueError(
-                    f"'{part}' part contains too short sequence ('{seq}', n={len(seq)})"
-                    f"\n  for '{split_kws}' split_kws (n_max={n_max})")
-            else:
-                seq = list_seq[0]
-                raise ValueError(
-                    f"For split_kws (n_max={n_max}): '{split_kws}',"
-                    f"\n  following '{part}' part contains too short sequences (e.g., '{seq}', n={len(seq)}).")
+        lengths = df_parts[part.lower()].map(len)
+        if any(lengths < n_max):
+            list_seq = df_parts[lengths < n_max][part.lower()].to_list()
+            seq = list_seq[0]
+            count = "a sequence" if len(list_seq) == 1 else f"{len(list_seq)} sequences"
+            raise ValueError(
+                f"'{part}' part contains {count} too short (e.g. '{seq}', n={len(seq)}) for the "
+                f"{driver} split length (n_max={n_max})."
+                f"\n  For free peptides with no flanking context, use Segment-only splits or reduce "
+                f"'len_max'/'n_split_max' (via SequenceFeature.get_split_kws),"
+                f"\n  or add flanking context (jmd_n/jmd_c).")
 
 
 # Check df_scales & df_cat
diff --git a/aaanalysis/pipe/_find_features.py b/aaanalysis/pipe/_find_features.py
index c4f578fb..cbf900f7 100644
--- a/aaanalysis/pipe/_find_features.py
+++ b/aaanalysis/pipe/_find_features.py
@@ -8,6 +8,7 @@
 more models.
 """
 from typing import Optional, List, Tuple, Union, Dict
+import warnings
 import numpy as np
 import pandas as pd
 from matplotlib.axes import Axes
@@ -79,9 +80,56 @@
                  (ut.STR_SEGMENT, ut.STR_PERIODIC_PATTERN): "p2",
                  (ut.STR_SEGMENT, ut.STR_PATTERN, ut.STR_PERIODIC_PATTERN): "p1+p2"}
 # Levers a power user may pin via the bounded ``kws`` dict (unknown keys raise).
-_KWS_KEYS = {"n_explain", "n_split_max", "n_filter", "n_jmd", "simplify_strategy",
+_KWS_KEYS = {"n_explain", "n_split_max", "n_filter", "n_jmd", "len_max", "simplify_strategy",
              "max_cor", "max_overlap"}
 _LIST_MODELS = [ut.MODEL_SVM, ut.MODEL_RF, ut.MODEL_LOG_REG]
+# The first (default) PeriodicPattern step; a part must be at least this long to carry one.
+_PERIODIC_STEP0 = 3
+
+
+def _split_kws_for(split_types=None, n_split_max=15, len_max=15):
+    """Build ``split_kws`` from the frontend ``SequenceFeature.get_split_kws``.
+
+    Threads the Segment ``n_split_max`` and Pattern ``len_max`` levers through the public
+    front door so ``find_features`` can request shorter splits (needed for free peptides).
+    """
+    return SequenceFeature.get_split_kws(split_types=list(split_types),
+                                         n_split_max=n_split_max, len_max=len_max)
+
+
+def _fit_split_kws_to_parts(split_types=None, n_split_max=15, len_max=15, df_parts=None):
+    """Adapt the requested split config to the shortest sequence part (free-peptide safety net).
+
+    A part of length ``L`` can only carry a ``Segment`` with ``n_split_max <= L``, a ``Pattern``
+    with ``len_max <= L``, and a ``PeriodicPattern`` with ``steps[0] (=3) <= L``. When the shortest
+    part is too short for the requested config, ``Pattern`` / ``PeriodicPattern`` are dropped and the
+    ``Segment`` ``n_split_max`` is clamped so the run still works (``Segment``-only at minimum). A
+    single ``UserWarning`` names what changed. For parts long enough for the requested config nothing
+    is dropped or clamped, so the built ``split_kws`` is byte-identical to the requested one.
+    """
+    min_len = int(min(df_parts[c].map(len).min() for c in df_parts.columns))
+    kept, changes = list(split_types), []
+    if ut.STR_PERIODIC_PATTERN in kept and min_len < _PERIODIC_STEP0:
+        kept.remove(ut.STR_PERIODIC_PATTERN)
+        changes.append(f"dropped 'PeriodicPattern' (needs >= {_PERIODIC_STEP0} residues)")
+    if ut.STR_PATTERN in kept and min_len < len_max:
+        kept.remove(ut.STR_PATTERN)
+        changes.append(f"dropped 'Pattern' (len_max={len_max} > shortest part n={min_len})")
+    fitted_n_split_max = n_split_max
+    if ut.STR_SEGMENT not in kept:
+        # Never leave zero split types; Segment is the universal fallback (works down to n=1).
+        kept.insert(0, ut.STR_SEGMENT)
+    if n_split_max > min_len:
+        fitted_n_split_max = min_len
+        changes.append(f"clamped Segment 'n_split_max' {n_split_max} -> {min_len}")
+    split_kws = _split_kws_for(split_types=kept, n_split_max=fitted_n_split_max, len_max=len_max)
+    if changes:
+        warnings.warn(
+            f"'find_features': the shortest sequence part (n={min_len}) is too short for the "
+            f"requested splits; {'; '.join(changes)}. This keeps the run working on free peptides / "
+            f"short parts. Set 'kws' (n_split_max / len_max / n_jmd) or add flanking context to "
+            f"control this.", UserWarning)
+    return split_kws
 
 
 def _resolve_model(model, random_state=None):
@@ -158,6 +206,9 @@ def _resolve_config(search="balanced", kws=None):
         "n_jmd_vals": list(mode["n_jmd_vals"]),
         "simplify_strategy": mode["simplify_strategy"],
         "max_cor": 0.5, "max_overlap": 0.5,
+        # Pattern span (default 15 = the SequenceFeature.get_split_kws default). A power user lowers
+        # it via kws["len_max"] to request shorter Pattern splits on short / free-peptide parts.
+        "len_max": 15,
     }
     if kws is not None:
         ut.check_dict(name="kws", val=kws)
@@ -169,6 +220,8 @@ def _resolve_config(search="balanced", kws=None):
             cfg["sweep_scales"] = False
         if "n_split_max" in kws:
             cfg["n_split_max_vals"] = [kws["n_split_max"]]
+        if "len_max" in kws:
+            cfg["len_max"] = kws["len_max"]
         if "n_filter" in kws:
             cfg["n_filter_vals"] = [kws["n_filter"]]
         if "n_jmd" in kws:
@@ -286,8 +339,13 @@ def find_features(labels: ut.ArrayLike1D,
         Cross-validation scoring metric(s). A list triggers multi-objective Pareto selection.
     kws : dict, optional
         Bounded power-user overrides; each pins a swept lever to a single value (unknown keys raise).
-        Recognized keys: ``n_explain``, ``n_split_max``, ``n_filter``, ``n_jmd`` (the symmetric JMD
-        length ``jmd_n_len = jmd_c_len``), ``simplify_strategy``, ``max_cor``, ``max_overlap``.
+        Recognized keys: ``n_explain``, ``n_split_max`` (max ``Segment`` splits), ``len_max`` (max
+        ``Pattern`` span), ``n_filter``, ``n_jmd`` (the symmetric JMD length ``jmd_n_len =
+        jmd_c_len``), ``simplify_strategy``, ``max_cor``, ``max_overlap``. For **free peptides / short
+        parts** (no flanking context), pass ``kws={"n_jmd": 0}`` so no JMD is carved out; with
+        ``search="fast"`` the split config then auto-fits to the shortest part (``Pattern`` /
+        ``PeriodicPattern`` are dropped and ``n_split_max`` is clamped as needed, with a
+        ``UserWarning``). Lower ``n_split_max`` / ``len_max`` yourself to control which splits are used.
     subcategories : list of str, optional
         AAontology subcategories to restrict the scale sets to. If ``None``, all scales of the grade.
     top_n : int, optional
@@ -416,7 +474,15 @@ def _run_fast(sf=None, labels=None, df_seq=None, cfg=None, simplify=True, models
     df_scales = _load_scale_spec(spec, subcategories=subcategories)
     if df_scales is None:
         raise ValueError(f"'subcategories' ({subcategories}) should be names that match a scale.")
-    cpp = CPP(df_parts=df_parts, df_scales=df_scales, random_state=random_state, verbose=verbose)
+    # Thread the requested Segment n_split_max / Pattern len_max through to the split_kws (was
+    # always the default before, silently ignoring kws), and auto-fit to the shortest part so a
+    # free peptide / short part drops Pattern-type splits + clamps n_split_max instead of hard
+    # erroring. For long-enough parts this is byte-identical to the default split_kws.
+    split_kws = _fit_split_kws_to_parts(split_types=split_types,
+                                        n_split_max=cfg["n_split_max_vals"][0],
+                                        len_max=cfg["len_max"], df_parts=df_parts)
+    cpp = CPP(df_parts=df_parts, df_scales=df_scales, split_kws=split_kws,
+              random_state=random_state, verbose=verbose)
     df_feat = cpp.run(labels=labels, label_test=label_test, label_ref=label_ref,
                       n_filter=cfg["n_filter_vals"][0], max_cor=cfg["max_cor"],
                       max_overlap=cfg["max_overlap"], n_jobs=n_jobs)
@@ -459,7 +525,8 @@ def _grid_stage(sf=None, df_seq=None, parts=None, split_sets=None, n_split_vals=
         raise ValueError(f"'subcategories' ({subcategories}) should match at least one scale.")
     params_parts = {"list_parts": [list(p) for p in parts],
                     "jmd_n_len": list(n_jmd_vals), "jmd_c_len": list(n_jmd_vals)}
-    params_split = {"split_types": [list(s) for s in split_sets], "n_split_max": list(n_split_vals)}
+    params_split = {"split_types": [list(s) for s in split_sets], "n_split_max": list(n_split_vals),
+                    "len_max": cfg["len_max"]}
     params_cpp = {"n_filter": list(n_filters), "label_test": label_test, "label_ref": label_ref,
                   "max_cor": cfg["max_cor"], "max_overlap": cfg["max_overlap"]}
     cppg = CPPGrid(df_seq=df_seq, labels=labels, random_state=random_state, verbose=verbose,
@@ -556,7 +623,12 @@ def _run_search(sf=None, labels=None, df_seq=None, cfg=None, simplify=True, mode
     base = _cv_scores(X_win, labels, models=models, cv=cv, metrics=metrics, random_state=random_state)
     rows3 = []
     if simplify:
-        cpp_win = CPP(df_parts=df_parts_win, df_scales=df_scales_win,
+        # Rebuild the winner's split_kws (Segment n_split_max / Pattern len_max) so the CPP used for
+        # simplify validates against short / free-peptide parts. simplify operates on the existing
+        # df_feat (it never reads split_kws), so this does not change the result for normal parts.
+        split_kws_win = _split_kws_for(split_types=win2["split_types"],
+                                       n_split_max=win2["n_split_max"], len_max=cfg["len_max"])
+        cpp_win = CPP(df_parts=df_parts_win, df_scales=df_scales_win, split_kws=split_kws_win,
                       random_state=random_state, verbose=verbose)
         df_simpl = cpp_win.simplify(df_feat=df_feat_win, labels=labels,
                                     strategy=cfg["simplify_strategy"], ml_cv=cv,
diff --git a/docs/source/index/release_notes.rst b/docs/source/index/release_notes.rst
index ed7d1583..9b91e2c4 100644
--- a/docs/source/index/release_notes.rst
+++ b/docs/source/index/release_notes.rst
@@ -363,6 +363,23 @@ Changed
   full-path import such as ``from aaanalysis.protein_design import SeqMut`` must become
   ``from aaanalysis.protein_engineering import SeqMut``.
 
+Fixed
+~~~~~
+
+- **CPP splits on free peptides / short parts (#338)**: ``aap.find_features`` and the
+  ``Pattern`` / ``PeriodicPattern`` splits were unusable on free peptides with no flanking
+  context (the linear-epitope case). ``find_features(search="fast")`` and the staged search's
+  Stage-3 simplify step ignored the requested / winning split configuration and always used the
+  default (``len_max=15``, ``n_split_max=15``), so any target region shorter than ~15
+  residues raised. The bounded ``kws`` dict now accepts ``len_max`` (and actually honors
+  ``n_split_max``) so shorter ``Pattern`` / ``Segment`` splits can be requested; the fast
+  path auto-fits the split configuration to the shortest part — dropping
+  ``Pattern`` / ``PeriodicPattern`` and clamping ``n_split_max`` with a ``UserWarning`` — so
+  free peptides run out of the box; and the too-short-part ``ValueError`` now names the
+  binding split length and how to fix it (Segment-only splits, lower
+  ``len_max`` / ``n_split_max``, or add ``jmd_n`` / ``jmd_c`` context). Results for flanked
+  inputs are unchanged.
+
 
 Version 1.0 (Stable Version)
 --------------------------------
diff --git a/tests/unit/cpp_tests/test_check_feature_backend.py b/tests/unit/cpp_tests/test_check_feature_backend.py
index 73227a0c..942c73ad 100644
--- a/tests/unit/cpp_tests/test_check_feature_backend.py
+++ b/tests/unit/cpp_tests/test_check_feature_backend.py
@@ -251,6 +251,42 @@ def test_match_df_parts_split_kws_too_short(self):
             check_match_df_parts_split_kws(
                 df_parts=self._df_parts(["AC"]), split_kws=kws)
 
+    def test_match_df_parts_split_kws_message_is_actionable(self):
+        # #338: a free peptide too short for the default splits must get a message that states the
+        # real cause (which split length binds n_max) AND how to fix it.
+        kws = {"Segment": {"n_split_min": 1, "n_split_max": 15},
+               "Pattern": {"len_max": 15, "n_max": 4, "n_min": 2, "steps": [3, 4]},
+               "PeriodicPattern": {"steps": [3, 4]}}
+        with pytest.raises(ValueError) as exc:
+            check_match_df_parts_split_kws(df_parts=self._df_parts(["PQFTIFGT"]), split_kws=kws)
+        msg = str(exc.value)
+        # Cause: names the binding split type + its parameter and the offending length.
+        assert "n_max=15" in msg and "len_max=15" in msg and "n=8" in msg
+        # Fix: points at the concrete remedies.
+        assert "Segment-only" in msg
+        assert "len_max" in msg and "n_split_max" in msg
+        assert "jmd_n" in msg and "jmd_c" in msg
+
+    def test_match_df_parts_split_kws_names_binding_split_type(self):
+        # Segment n_split_max is the sole binding requirement here -> named as the driver.
+        kws = {"Segment": {"n_split_min": 1, "n_split_max": 12}}
+        with pytest.raises(ValueError, match=r"Segment \(n_split_max=12\)"):
+            check_match_df_parts_split_kws(df_parts=self._df_parts(["ACDEF"]), split_kws=kws)
+
+    def test_match_df_parts_split_kws_segment_only_short_ok(self):
+        # Positive: Segment-only with n_split_max <= part length passes (free-peptide path).
+        kws = {"Segment": {"n_split_min": 1, "n_split_max": 8}}
+        assert check_match_df_parts_split_kws(
+            df_parts=self._df_parts(["PQFTIFGT", "AIVMWFLL"]), split_kws=kws) is None
+
+    def test_match_df_parts_split_kws_reduced_len_max_ok(self):
+        # Positive: reducing len_max/n_split_max lets short parts pass with all split types.
+        kws = {"Segment": {"n_split_min": 1, "n_split_max": 8},
+               "Pattern": {"len_max": 8, "n_max": 4, "n_min": 2, "steps": [3, 4]},
+               "PeriodicPattern": {"steps": [3, 4]}}
+        assert check_match_df_parts_split_kws(
+            df_parts=self._df_parts(["PQFTIFGT", "AIVMWFLL"]), split_kws=kws) is None
+
     def test_match_df_parts_df_scales_missing_char(self):
         # 'B' is not a canonical AA in df_scales index -> missing char, no gaps
         order = "ACDEFGHIKLMNPQRSTVWY"
diff --git a/tests/unit/pipe_tests/test_aap_find_features.py b/tests/unit/pipe_tests/test_aap_find_features.py
index cee5dc92..d5160218 100644
--- a/tests/unit/pipe_tests/test_aap_find_features.py
+++ b/tests/unit/pipe_tests/test_aap_find_features.py
@@ -1,4 +1,5 @@
 """This script tests the aaanalysis.pipe.find_features() staged CPP AutoML golden pipeline."""
+import warnings
 import matplotlib
 matplotlib.use("Agg")
 from matplotlib.axes import Axes
@@ -10,7 +11,8 @@
 import aaanalysis.pipe as aap
 from aaanalysis.pipe._find_features import (_resolve_config, _resolve_models, _load_scale_spec,
                                             _cv_scores, _pareto_mask, _axis_impact, _MODES,
-                                            _PART_SETS, _SPLIT_TYPE_SETS)
+                                            _PART_SETS, _SPLIT_TYPE_SETS, _KWS_KEYS,
+                                            _fit_split_kws_to_parts, _split_kws_for)
 
 aa.options["verbose"] = False
 
@@ -22,6 +24,13 @@
 # kws that shrink a search to a tiny Stage-1 grid (one scale, one n_split) so tests stay fast.
 SMALL = {"n_explain": 30, "n_split_max": 15}
 
+# #338: short free peptides (linear epitopes) with NO flanking context (8 aa each).
+_FREE_SEQS = ["PQFTIFGT", "AIVMWFLL", "GKKRTLSN", "DDECWQPT", "MNPQRSTV", "LLIIVVAA",
+              "KKRPWWFT", "SSTTNNQQ", "WWYYFFTT", "RRKKHHDD"]
+df_seq_free = pd.DataFrame({"entry": [f"P{i}" for i in range(len(_FREE_SEQS))],
+                            "sequence": _FREE_SEQS})
+labels_free = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
+
 
 def _explicit_fast(random_state=0):
     """The explicit single-CPP chain that find_features(search='fast') mirrors byte-for-byte."""
@@ -328,6 +337,37 @@ def test_fast_ax_eval_empty(self):
                                      random_state=0, n_jobs=1)
         assert isinstance(ax, Axes) and ax.eval == []
 
+    # #338: free peptides with no flanking context must be usable end to end.
+    def test_fast_free_peptides_auto_fits_and_warns(self):
+        # search='fast' with n_jmd=0 (no flanks): the split config auto-fits to the short parts
+        # (Pattern dropped, n_split_max clamped) with a UserWarning, and still returns features.
+        with pytest.warns(UserWarning, match="too short"):
+            df_feat, _, df_eval = aap.find_features(
+                labels_free, df_seq=df_seq_free, search="fast", plot=False,
+                kws={"n_jmd": 0}, random_state=0, n_jobs=1)
+        assert len(df_feat) > 0 and len(df_eval) == 1
+
+    def test_fast_free_peptides_explicit_kws_runs(self):
+        # The len_max / n_split_max kws are honored (threaded into the split_kws) so a user can
+        # request shorter Pattern / Segment splits on free peptides; the run returns features.
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            df_feat, _, _ = aap.find_features(
+                labels_free, df_seq=df_seq_free, search="fast", plot=False,
+                kws={"n_jmd": 0, "n_split_max": 8, "len_max": 8}, random_state=0, n_jobs=1)
+        assert len(df_feat) > 0
+
+    @pytest.mark.slow
+    def test_balanced_free_peptides_runs(self):
+        # The staged search reaches Stage 3 (simplify) on free peptides without hard-erroring.
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            df_feat, _, df_eval = aap.find_features(
+                labels_free, df_seq=df_seq_free, search="balanced", plot=False,
+                kws={"n_jmd": 0}, random_state=0, n_jobs=1)
+        assert len(df_feat) > 0
+        assert int(df_eval["is_selected"].sum()) == 1
+
     @pytest.mark.slow
     def test_balanced_ax_eval_publication_figures(self):
         from matplotlib.figure import Figure
@@ -365,6 +405,60 @@ def test_resolve_config_unknown_kws_raises(self):
         with pytest.raises(ValueError):
             _resolve_config(search="balanced", kws={"bogus": 1})
 
+    # #338: len_max lever + auto-fit for free peptides / short parts
+    def test_kws_keys_includes_len_max(self):
+        assert "len_max" in _KWS_KEYS
+
+    def test_resolve_config_default_len_max(self):
+        # Default Pattern span is 15 for every grade (matches SequenceFeature.get_split_kws).
+        for mode in _MODES:
+            assert _resolve_config(search=mode)["len_max"] == 15
+
+    def test_resolve_config_kws_pins_len_max(self):
+        assert _resolve_config(search="balanced", kws={"len_max": 8})["len_max"] == 8
+
+    def test_split_kws_for_threads_levers(self):
+        skw = _split_kws_for(split_types=_SPLIT_TYPE_SETS[-1], n_split_max=6, len_max=7)
+        assert skw["Segment"]["n_split_max"] == 6
+        assert skw["Pattern"]["len_max"] == 7
+
+    def test_fit_split_kws_long_parts_byte_identical(self):
+        # Parts long enough for the default config -> no drop / clamp / warning, default split_kws.
+        df_parts = pd.DataFrame({"tmd": ["ACDEFGHIKLMNPQRSTVWY"] * 3})
+        with warnings.catch_warnings():
+            warnings.simplefilter("error")  # any warning would fail here
+            skw = _fit_split_kws_to_parts(split_types=_SPLIT_TYPE_SETS[-1], n_split_max=15,
+                                          len_max=15, df_parts=df_parts)
+        assert set(skw) == {"Segment", "Pattern", "PeriodicPattern"}
+        assert skw["Segment"]["n_split_max"] == 15 and skw["Pattern"]["len_max"] == 15
+
+    def test_fit_split_kws_short_parts_drops_pattern_and_clamps_segment(self):
+        df_parts = pd.DataFrame({"tmd": ["PQFTIFGT", "AIVMWFLL"]})  # n=8
+        with pytest.warns(UserWarning, match="too short"):
+            skw = _fit_split_kws_to_parts(split_types=_SPLIT_TYPE_SETS[-1], n_split_max=15,
+                                          len_max=15, df_parts=df_parts)
+        # Pattern (len_max=15 > 8) dropped; Segment kept but clamped to 8; PeriodicPattern (3<=8) kept.
+        assert "Pattern" not in skw
+        assert skw["Segment"]["n_split_max"] == 8
+        assert "PeriodicPattern" in skw
+
+    def test_fit_split_kws_always_keeps_segment(self):
+        # Even a 2-residue part keeps a (clamped) Segment so the run never has zero split types.
+        df_parts = pd.DataFrame({"tmd": ["AC", "DE"]})
+        with pytest.warns(UserWarning):
+            skw = _fit_split_kws_to_parts(split_types=_SPLIT_TYPE_SETS[-1], n_split_max=15,
+                                          len_max=15, df_parts=df_parts)
+        assert "Segment" in skw and skw["Segment"]["n_split_max"] == 2
+
+    def test_fit_split_kws_no_warn_when_config_fits(self):
+        # A user who already lowered n_split_max / len_max to fit the parts gets no warning.
+        df_parts = pd.DataFrame({"tmd": ["PQFTIFGT", "AIVMWFLL"]})  # n=8
+        with warnings.catch_warnings():
+            warnings.simplefilter("error")
+            skw = _fit_split_kws_to_parts(split_types=_SPLIT_TYPE_SETS[-1], n_split_max=8,
+                                          len_max=8, df_parts=df_parts)
+        assert set(skw) == {"Segment", "Pattern", "PeriodicPattern"}
+
     def test_resolve_models_list(self):
         models = _resolve_models(["svm", "rf"], random_state=0)
         assert len(models) == 2