breimanntools · breimanntools · Jul 3, 2026 · Jun 30, 2026 · Jul 1, 2026 · Jul 1, 2026
diff --git a/aaanalysis/feature_engineering/_backend/cpp/sequence_feature.py b/aaanalysis/feature_engineering/_backend/cpp/sequence_feature.py
@@ -136,6 +136,50 @@ def get_df_feat_(features=None, df_parts=None, labels=None,
     return df
 
 
+# Scale-composition baseline featurization
+def get_scale_composition_(df_parts=None, df_scales=None):
+    """Per-sequence scale average over the concatenated sequence parts (vectorized).
+
+    For each row of ``df_parts`` the part strings are concatenated into one span; every
+    residue that is not a (single-letter) index label of ``df_scales`` (gaps ``'-'`` and
+    any other non-canonical symbol) is dropped, and the remaining residues' scale rows are
+    averaged column-wise into one value per scale, giving the ``(n_seq, n_scales)`` matrix.
+    A row whose span has no scored residue (empty / all-non-canonical) becomes all-``NaN``;
+    ``n_kept`` reports the number of averaged residues per row so the frontend can warn.
+
+    Implementation: all residues of all spans are flattened into one byte array and mapped
+    to scale rows via a 256-entry lookup table; a single ``np.bincount`` tallies a small
+    per-sequence residue-count matrix ``(n_seq, n_letters)``, and one BLAS matmul against
+    the scale matrix yields the per-sequence sums — no per-sequence Python loop or
+    ``DataFrame.loc``, and the wide part scales with ``n_scales`` as a matmul, not a scatter.
+    """
+    n_seq = len(df_parts)
+    scales_arr = df_scales.to_numpy(dtype=float)                    # (n_letters, n_scales)
+    n_letters = scales_arr.shape[0]
+    # Byte -> scale-row lookup (-1 = non-scored); only single-character labels can match a residue
+    lut = np.full(256, -1, dtype=np.intp)
+    for row, aa in enumerate(df_scales.index):
+        if len(aa) == 1 and ord(aa) < 256:
+            lut[ord(aa)] = row
+    # Flatten every residue of every span, tracking its owning sequence
+    # (df_parts columns are guaranteed str by the frontend's get_df_parts)
+    spans = df_parts.agg("".join, axis=1).to_list()
+    lengths = np.fromiter((len(s) for s in spans), dtype=np.intp, count=n_seq)
+    codes = np.frombuffer("".join(spans).encode("latin-1", "replace"), dtype=np.uint8)
+    seq_ids = np.repeat(np.arange(n_seq), lengths)
+    rows = lut[codes]                                               # scale row per residue
+    valid = rows >= 0
+    seq_ids, rows = seq_ids[valid], rows[valid]
+    # Per-sequence residue-count matrix (n_seq, n_letters) from one bincount, then sums via matmul
+    counts = np.bincount(seq_ids * n_letters + rows,
+                         minlength=n_seq * n_letters).reshape(n_seq, n_letters)
+    n_kept = counts.sum(axis=1).astype(int)
+    sums = counts @ scales_arr                                     # (n_seq, n_scales)
+    with np.errstate(invalid="ignore"):
+        X = sums / n_kept[:, None]                                 # 0 / 0 -> NaN (empty rows)
+    return X, n_kept
+
+
 # Multi-class / regression label conversion
 def get_labels_ovr_(labels=None, label_test=1, label_ref=0):
     """One-vs-rest: per class, a full-length binary array (class -> test, rest -> ref)."""

diff --git a/aaanalysis/feature_engineering/_sequence_feature.py b/aaanalysis/feature_engineering/_sequence_feature.py
@@ -26,7 +26,7 @@
                                          get_positions_, get_amino_acids_,
                                          get_df_pos_, get_df_pos_parts_)
 from ._backend.cpp.sequence_feature import (get_split_kws_, get_features_, get_feature_names_,
-                                            get_feature_descriptions_, get_df_feat_,
+                                            get_feature_descriptions_, get_df_feat_, get_scale_composition_,
                                             get_labels_ovr_, get_labels_ovo_, get_labels_quantile_,
                                             get_labels_tiered_)
 from ._backend.cpp_run import _pick_feature_matrix_builder
@@ -853,6 +853,97 @@ def feature_matrix(self,
             start += n
         return list_X if batch else list_X[0]
 
+    def scale_composition(self,
+                          df_seq: pd.DataFrame,
+                          df_scales: Optional[pd.DataFrame] = None,
+                          list_parts: Optional[Union[str, List[str]]] = None,
+                          return_df: bool = False,
+                          ) -> Union[ut.ArrayLike2D, pd.DataFrame]:
+        """
+        Create the scale-composition baseline matrix for given sequences and scales.
+
+        Builds the no-positional-split **scale-based baseline** featurization: for each
+        sequence the requested Parts are concatenated into one span, and every scale is
+        averaged over the residues of that span, yielding one value per scale. The result
+        is the ``(n_seq, n_scales)`` matrix ``X`` — the sequence's mean profile in
+        scale-space, the scale-based analogue of amino-acid composition. Unlike
+        :meth:`SequenceFeature.feature_matrix`, which averages each scale over the residues
+        of a specific Part-Split, this method averages each scale over the **whole span**
+        (a single mean per scale), with no positional information.
+
+        **Application.** Use this to build a **baseline feature set** for a prediction
+        model: fit the same classifier on this scale-composition ``X`` and on a
+        :class:`CPP` :meth:`feature_matrix` and compare the scores to show how much the
+        positional Part-Split-Scale features add over a plain scale-average encoding (the
+        "scale baseline vs CPP" comparison). It is *not* a positional feature set — reach
+        for :class:`CPP` when you need where-along-the-sequence information.
+
+        .. versionadded:: 1.1.0
+
+        Parameters
+        ----------
+        df_seq : pd.DataFrame, shape (n_samples, n_seq_info)
+            DataFrame containing an ``entry`` column with unique protein identifiers and sequence information
+            in a distinct format: **Position-based**, **Part-based**, **Sequence-based**, or **Sequence-TMD-based**
+            (the same input accepted by :meth:`SequenceFeature.get_df_parts`).
+        df_scales : pd.DataFrame, shape (n_letters, n_scales), optional
+            DataFrame of scales with letters (amino acids) as index. Default from :meth:`load_scales`
+            unless specified in ``options['df_scales']``. The index defines which residues are scored;
+            residues absent from it are dropped before averaging.
+        list_parts : str or list of str, optional
+            Names of the sequence parts to average over (see :class:`SequenceFeature` for valid parts).
+            If ``None`` (default), the whole TMD-JMD span ``jmd_n`` + ``tmd`` + ``jmd_c`` (the ``tmd_jmd``
+            part) is used. Multiple parts are concatenated per sequence before averaging.
+        return_df : bool, default=False
+            If ``True``, return a labeled ``pd.DataFrame`` (rows indexed like ``df_parts``, one column per
+            scale) instead of a plain numpy array.
+
+        Returns
+        -------
+        X : array-like, shape (n_samples, n_scales)
+            Scale-average matrix containing, per sequence, the mean of each scale over the span residues.
+            Returned as a ``pd.DataFrame`` (scale names as columns) when ``return_df=True``.
+
+        Notes
+        -----
+        * **Missing / non-canonical residues**: a residue is averaged only if it is an index label of
+          ``df_scales``. Gap symbols (``'-'``) and any other non-canonical symbol (e.g. ``'X'``) are
+          dropped per the package convention, so the mean is taken over the scored residues only.
+        * A sequence whose span has **no** scored residue (empty span, or all residues non-canonical)
+          yields an all-``NaN`` row; a ``UserWarning`` naming the count is emitted when ``verbose=True``.
+        * This is a no-positional-split mean only; positional splits remain the job of :class:`CPP`.
+
+        See Also
+        --------
+        * :meth:`SequenceFeature.feature_matrix` for the positional Part-Split-Scale feature matrix.
+        * :meth:`SequenceFeature.get_df_parts` for the part-slicing used to build the span.
+
+        Examples
+        --------
+        .. include:: examples/sf_scale_composition.rst
+        """
+        # Load defaults
+        if df_scales is None:
+            df_scales = ut.load_default_scales()
+        # Check input
+        ut.check_df_seq(df_seq=df_seq)
+        check_df_scales(df_scales=df_scales)
+        ut.check_bool(name="return_df", val=return_df)
+        list_parts = ut.check_list_parts(list_parts=list_parts, return_default=False, accept_none=True)
+        # Resolve parts: None -> whole TMD-JMD span (jmd_n + tmd + jmd_c, the "tmd_jmd" part)
+        list_parts_used = ["tmd_jmd"] if list_parts is None else list_parts
+        # Build the sequence parts and the scale-average matrix
+        df_parts = self.get_df_parts(df_seq=df_seq, list_parts=list_parts_used)
+        X, n_kept = get_scale_composition_(df_parts=df_parts, df_scales=df_scales)
+        # Warn about rows that had no scored residue (all-NaN output)
+        n_empty = int((n_kept == 0).sum())
+        if n_empty > 0 and self.verbose:
+            warnings.warn(f"{n_empty} sequence(s) have no scored residue in the selected parts "
+                          f"(empty or all non-canonical); their scale-average row is all-NaN.")
+        if return_df:
+            return pd.DataFrame(X, index=df_parts.index, columns=list(df_scales.columns))
+        return X
+
     def prune_by_variance(self,
                           df_feat: pd.DataFrame,
                           df_parts: Optional[pd.DataFrame] = None,

diff --git a/docs/source/index/release_notes.rst b/docs/source/index/release_notes.rst
@@ -58,6 +58,15 @@ Added
 - **SequenceFeature.get_labels_quantile / get_labels_tiered**: Discretize a continuous
   target into binary ``labels`` — a single quantile cut, or a fixed positive set swept
   against stepwise-lowered negative cuts (each tier row-matched).
+- **SequenceFeature.scale_composition**: Scale-composition baseline featurizer that turns
+  sequences + scales into a ``(n_seq, n_scales)`` matrix by averaging each scale over a
+  sequence span (``list_parts=None`` → whole ``jmd_n`` + ``tmd`` + ``jmd_c``), dropping
+  missing / non-canonical residues — the sequence's mean profile in scale-space (the
+  scale-based analogue of amino-acid composition). The no-positional-split baseline to
+  compare against ``feature_matrix`` / CPP; optional ``return_df=True`` for a labeled frame.
+- **SequenceFeature.get_df_parts_from_windows**: Assemble a reference ``df_parts`` from
+  per-part window sets (e.g. ``AAWindowSampler.sample_synthetic`` output).
+- **SequenceFeature.get_seq_kws**: Return one protein's ``{jmd_n_seq, tmd_seq, jmd_c_seq}``
 - :meth:`~aaanalysis.SequenceFeature.get_df_parts_from_windows`: Assemble a reference ``df_parts`` from
   per-part window sets (e.g. :meth:`~aaanalysis.AAWindowSampler.sample_synthetic` output).
 - :meth:`~aaanalysis.SequenceFeature.get_seq_kws`: Return one protein's ``{jmd_n_seq, tmd_seq, jmd_c_seq}``