From 379bf6341e384470e66d5ddc78cb14fae1131394 Mon Sep 17 00:00:00 2001 From: Stephan Breimann Date: Sat, 4 Jul 2026 15:42:30 +0200 Subject: [PATCH 1/5] fix(correctness): batch of low-risk audit fixes (defensive copies, validation gaps, messages) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Part of #342. None of these change published numeric output. - Defensive copies (shared-object mutation): load_dataset(gap) and load_features no longer return/mutate the shared lru_cache frame; get_df_feat no longer rewrites the caller's df_parts in place. - Crash-on-edge-input: read_fasta raises a clear ValueError on pre-header text (was KeyError); SeqMut.mutate re-joins scored rows on (entry, mutation) so duplicate labels across entries no longer crash; CPP.eval clamps the cluster search so a single-feature set can't hit KMeans(0). - comp_seq_sim self-similarity diagonal is 100 (the [0,100] scale), not 1. - Plotting/display: marker_size length check is reachable (ValueError, not a later IndexError); display_df row/col selector is 0..n-1; weight_bold no longer renders thinner than normal; feature-map show_only_max uses imp_bar_label_type; profile y-padding uses max(0, ...); plot_legend validation errors report the real parameter names. - Validator/message hygiene: check_metric and check_match_X_n_clusters messages corrected; load_scales name=; comp_seq_sim paren; explicit categories on previously bare warnings.warn calls. Regression tests: tests/unit/test_correctness_batch_342.py (+ updated the check_match_X_n_clusters message assertion in test_aac_branch.py). Held (not in this commit): config.py options-validation group (verbose/ random_state/name_tmd/LOKY) — CONFIRM-FIRST surface, awaiting sign-off. Deferred: sliding-window over-large-window guard — needs a silent-empty- vs-raise decision (current silent-empty behavior is test-encoded). Co-Authored-By: Claude Opus 4.8 --- aaanalysis/_utils/plotting.py | 4 +- .../data_handling/_backend/parse_fasta.py | 5 +- aaanalysis/data_handling/_load_dataset.py | 6 +- aaanalysis/data_handling/_load_features.py | 3 +- aaanalysis/data_handling/_load_scales.py | 2 +- aaanalysis/data_handling/_read_fasta.py | 4 +- aaanalysis/feature_engineering/_aaclust.py | 2 +- .../_backend/check_aaclust.py | 2 +- .../_backend/check_feature.py | 3 +- .../_backend/cpp/cpp_eval.py | 3 +- .../_backend/cpp/cpp_plot_feature_map.py | 2 +- .../_backend/cpp/cpp_plot_profile.py | 2 +- aaanalysis/plotting/_plot_legend.py | 4 +- aaanalysis/plotting/_plot_settings.py | 5 + aaanalysis/protein_engineering/_seqmut.py | 12 ++- .../seq_analysis_pro/_backend/comp_seq_sim.py | 4 +- aaanalysis/seq_analysis_pro/_comp_seq_sim.py | 2 +- aaanalysis/show_html/_display_df.py | 2 +- tests/unit/aaclust_tests/test_aac_branch.py | 2 +- tests/unit/test_correctness_batch_342.py | 97 +++++++++++++++++++ 20 files changed, 139 insertions(+), 27 deletions(-) create mode 100644 tests/unit/test_correctness_batch_342.py diff --git a/aaanalysis/_utils/plotting.py b/aaanalysis/_utils/plotting.py index 68140079b..42028aee1 100644 --- a/aaanalysis/_utils/plotting.py +++ b/aaanalysis/_utils/plotting.py @@ -137,10 +137,10 @@ def _check_marker_size(marker_size: Union[int, float, List[Union[int, float]]] = if isinstance(marker_size, (int, float)): check_number_range(name='marker_size', val=marker_size, min_val=0, accept_none=True, just_int=False) elif isinstance(marker_size, list): + if len(marker_size) != len(list_cat): + raise ValueError(f"Length must match of 'marker_size' (marker_size) and categories ({list_cat}).") for i in marker_size: check_number_range(name='marker_size', val=i, min_val=0, accept_none=True, just_int=False) - elif isinstance(marker_size, list) and len(marker_size) != len(list_cat): - raise ValueError(f"Length must match of 'marker_size' (marker_size) and categories ({list_cat}).") else: raise ValueError(f"'marker_size' has wrong data type: {type(marker_size)}") # Create marker_size list diff --git a/aaanalysis/data_handling/_backend/parse_fasta.py b/aaanalysis/data_handling/_backend/parse_fasta.py index e8ea46f68..b49cc4284 100644 --- a/aaanalysis/data_handling/_backend/parse_fasta.py +++ b/aaanalysis/data_handling/_backend/parse_fasta.py @@ -24,7 +24,10 @@ def get_entries_from_fasta(file_path=None, col_id="entry", col_seq="sequence", c if len(list_info) > 1: for i in range(1, len(list_info[1:])+1): dict_current_entry[f'info{i}'] = list_info[i] - else: + elif line: + if not dict_current_entry: + raise ValueError(f"'file_path' ('{file_path}') is not a valid FASTA file: " + f"sequence data appears before the first '>' header.") dict_current_entry[col_seq] += line if dict_current_entry: list_entries.append(dict_current_entry) diff --git a/aaanalysis/data_handling/_load_dataset.py b/aaanalysis/data_handling/_load_dataset.py index abc57d965..eb06a4609 100644 --- a/aaanalysis/data_handling/_load_dataset.py +++ b/aaanalysis/data_handling/_load_dataset.py @@ -62,7 +62,7 @@ def post_check_df_seq(df_seq=None, n=None, name=None) -> None: f"\nThis maximum value depends on the filtering settings used." # Validation of sequence and domain datasets if n is not None and len(df_seq) != n*2: - warnings.warn(warning_message) + warnings.warn(warning_message, UserWarning) # Helper functions @@ -256,7 +256,9 @@ def load_dataset(name: str = "Overview", # Load overview table if name == "Overview": return ut.read_csv_cached(FOLDER_BENCHMARKS + f"Overview.{ut.STR_FILE_TYPE}").copy() - df = ut.read_csv_cached(FOLDER_BENCHMARKS + name + f".{ut.STR_FILE_TYPE}") + # Copy the cached frame (like the Overview branch) so downstream filtering / + # non-canonical-AA substitution can't mutate the shared cache in place + df = ut.read_csv_cached(FOLDER_BENCHMARKS + name + f".{ut.STR_FILE_TYPE}").copy() # Filter data if min_len is not None: n_before = len(df) diff --git a/aaanalysis/data_handling/_load_features.py b/aaanalysis/data_handling/_load_features.py index eebcc3c52..0ebc69954 100644 --- a/aaanalysis/data_handling/_load_features.py +++ b/aaanalysis/data_handling/_load_features.py @@ -51,4 +51,5 @@ def load_features(name: Literal["DOM_GSEC"] = "DOM_GSEC") -> pd.DataFrame: check_name(name=name) # Load features df_feat = ut.read_csv_cached(FOLDER_FEATURES + f"FEATURES_{name}.{ut.STR_FILE_TYPE}") - return df_feat + # Copy the cached frame so a caller's in-place edit can't corrupt the shared cache + return df_feat.copy() diff --git a/aaanalysis/data_handling/_load_scales.py b/aaanalysis/data_handling/_load_scales.py index 52981c433..486522eaa 100644 --- a/aaanalysis/data_handling/_load_scales.py +++ b/aaanalysis/data_handling/_load_scales.py @@ -245,7 +245,7 @@ def load_scales(name: Literal["scales", "scales_raw", "scales_cat", "scales_pc", # Check input check_name_of_scale(name=name) ut.check_bool(name="just_aaindex", val=just_aaindex) - ut.check_bool(name="unclassified_in", val=unclassified_out) + ut.check_bool(name="unclassified_out", val=unclassified_out) top60_n = check_top60_n(name=name, top60_n=top60_n) check_top_explain(name=name, top_explain_n=top_explain_n, top_explain_min_th=top_explain_min_th, top60_n=top60_n) diff --git a/aaanalysis/data_handling/_read_fasta.py b/aaanalysis/data_handling/_read_fasta.py index 2a9c07da9..c3920f66f 100644 --- a/aaanalysis/data_handling/_read_fasta.py +++ b/aaanalysis/data_handling/_read_fasta.py @@ -18,7 +18,7 @@ def post_check_unique_entries(list_entries=None, col_id=None) -> None: if len(list_duplicates) > 0: str_warning = (f"Entries from '{col_id}' should be unique. " f"\nFollowing entries are duplicated: {list_duplicates}") - warnings.warn(str_warning) + warnings.warn(str_warning, UserWarning) def post_check_col_db(df_seq=None, col_db=None, sep="|") -> None: @@ -26,7 +26,7 @@ def post_check_col_db(df_seq=None, col_db=None, sep="|") -> None: columns = list(df_seq) if col_db is not None and col_db not in columns: str_warning = f"'col_db' ('{col_db}') not in 'df_seq'. Check if 'sep' ('{sep}') is matching." - warnings.warn(str_warning) + warnings.warn(str_warning, UserWarning) def _adjust_columns(df_seq=None, col_seq=None, col_id=None, cols_info=None, col_db=None): diff --git a/aaanalysis/feature_engineering/_aaclust.py b/aaanalysis/feature_engineering/_aaclust.py index 2e20e0081..a1933e2f8 100644 --- a/aaanalysis/feature_engineering/_aaclust.py +++ b/aaanalysis/feature_engineering/_aaclust.py @@ -42,7 +42,7 @@ def check_match_X_n_clusters(X=None, n_clusters=None, accept_none=True) -> None: if n_samples < n_clusters: raise ValueError(f"n_samples={n_samples} (in 'X') should be >= 'n_clusters' ({n_clusters})") if n_unique_samples < n_clusters: - raise ValueError(f"'n_clusters' ({n_clusters}) should be >= n_unique_samples={n_unique_samples} (in 'X').") + raise ValueError(f"'n_clusters' ({n_clusters}) should be <= n_unique_samples={n_unique_samples} (in 'X').") def check_match_df_seq_X(df_seq=None, X=None) -> None: diff --git a/aaanalysis/feature_engineering/_backend/check_aaclust.py b/aaanalysis/feature_engineering/_backend/check_aaclust.py index f357d3e4e..d8c577bc8 100644 --- a/aaanalysis/feature_engineering/_backend/check_aaclust.py +++ b/aaanalysis/feature_engineering/_backend/check_aaclust.py @@ -11,7 +11,7 @@ def check_metric(metric=None): """""" if metric not in ut.LIST_METRICS: - error = f"'metric' should be None or one of following: {ut.LIST_METRICS}" + error = f"'metric' should be one of following: {ut.LIST_METRICS}" raise ValueError(error) diff --git a/aaanalysis/feature_engineering/_backend/check_feature.py b/aaanalysis/feature_engineering/_backend/check_feature.py index e2f8f3146..b0b95232c 100644 --- a/aaanalysis/feature_engineering/_backend/check_feature.py +++ b/aaanalysis/feature_engineering/_backend/check_feature.py @@ -460,7 +460,8 @@ def check_match_df_parts_df_scales(df_parts=None, df_scales=None, accept_gaps=Fa char_scales.append(ut.STR_AA_GAP) missing_char = [x for x in char_parts if x not in char_scales] # Replace gaps by default amino acid gap - if accept_gaps: + if accept_gaps and missing_char: + df_parts = df_parts.copy() # copy so we don't mutate the caller's df_parts in place for col in list(df_parts): for mc in missing_char: df_parts[col] = df_parts[col].str.replace(mc, ut.STR_AA_GAP) diff --git a/aaanalysis/feature_engineering/_backend/cpp/cpp_eval.py b/aaanalysis/feature_engineering/_backend/cpp/cpp_eval.py index f333eebf0..8c0c4dbb2 100644 --- a/aaanalysis/feature_engineering/_backend/cpp/cpp_eval.py +++ b/aaanalysis/feature_engineering/_backend/cpp/cpp_eval.py @@ -36,7 +36,8 @@ def get_min_cor(X, labels=None): def get_best_n_clusters(X=None, min_th=0.3, random_state=None): """Obtain the best number of clusters based on internal cluster minimum correlation.""" n_features, _ = X.shape - max_n = min([100, n_features-1]) + # Clamp to >= 1 so a single-feature set does not fall through to KMeans(n_clusters=0) + max_n = max(1, min(100, n_features - 1)) best_n_clusters = max_n for n_clusters in range(2, max_n): kmeans = KMeans(n_clusters=n_clusters, n_init="auto", random_state=random_state) diff --git a/aaanalysis/feature_engineering/_backend/cpp/cpp_plot_feature_map.py b/aaanalysis/feature_engineering/_backend/cpp/cpp_plot_feature_map.py index 4041e00c9..bffefbd3b 100644 --- a/aaanalysis/feature_engineering/_backend/cpp/cpp_plot_feature_map.py +++ b/aaanalysis/feature_engineering/_backend/cpp/cpp_plot_feature_map.py @@ -318,7 +318,7 @@ def plot_feature_map(df_feat=None, df_cat=None, position=position_bar, multialignment=multialignment_bar, weight_annotation="bold") - show_only_max = add_imp_bar_top != "long" + show_only_max = imp_bar_label_type != "long" args_ticks_0 = dict(show_zero=False, show_only_max=show_only_max, precision=1) ut.ticks_0(ax_br, **args_ticks_0) diff --git a/aaanalysis/feature_engineering/_backend/cpp/cpp_plot_profile.py b/aaanalysis/feature_engineering/_backend/cpp/cpp_plot_profile.py index dd645a485..7ef6eb86b 100644 --- a/aaanalysis/feature_engineering/_backend/cpp/cpp_plot_profile.py +++ b/aaanalysis/feature_engineering/_backend/cpp/cpp_plot_profile.py @@ -107,7 +107,7 @@ def _plot_profile(df_pos=None, shap_plot=False, ax=None, ax.set_xlim(x_lim) if ylim is None: ymin, ymax = ax.get_ylim() - y_space = min(0, (ymax - ymin) * 0.25) + y_space = max(0, (ymax - ymin) * 0.25) y_lim = (ymin - y_space, ymax + y_space) ax.set_ylim(y_lim) diff --git a/aaanalysis/plotting/_plot_legend.py b/aaanalysis/plotting/_plot_legend.py index 0ccfe8d87..f64354f52 100644 --- a/aaanalysis/plotting/_plot_legend.py +++ b/aaanalysis/plotting/_plot_legend.py @@ -153,10 +153,10 @@ def plot_legend(ax: Optional[Axes] = None, ut.check_number_val(name="lw", val=lw, accept_none=True, just_int=False) args_non_neg = {"labelspacing": labelspacing, "columnspacing": columnspacing, "handletextpad": handletextpad, "handlelength": handlelength, - "fontsize": fontsize, "fontsize_legend": fontsize_title} + "fontsize": fontsize, "fontsize_title": fontsize_title} for key in args_non_neg: ut.check_number_range(name=key, val=args_non_neg[key], min_val=0, accept_none=True, just_int=False) - ut.check_bool(name="add_legend", val=keep_legend, accept_none=False) + ut.check_bool(name="keep_legend", val=keep_legend, accept_none=False) # Create new legend ax = ut.plot_legend_(ax=ax, dict_color=dict_color, list_cat=list_cat, labels=labels, loc=loc, loc_out=loc_out, y=y, x=x, n_cols=n_cols, diff --git a/aaanalysis/plotting/_plot_settings.py b/aaanalysis/plotting/_plot_settings.py index d3cecf778..b8993a148 100644 --- a/aaanalysis/plotting/_plot_settings.py +++ b/aaanalysis/plotting/_plot_settings.py @@ -161,6 +161,11 @@ def plot_settings(font_scale: Union[int, float] = 1, if weight_bold: plt.rcParams["axes.labelweight"] = "bold" plt.rcParams["axes.titleweight"] = "bold" + plt.rcParams["axes.linewidth"] = 1.25 + plt.rcParams["xtick.major.width"] = 1.1 + plt.rcParams["xtick.minor.width"] = 0.9 + plt.rcParams["ytick.major.width"] = 1.1 + plt.rcParams["ytick.minor.width"] = 0.9 else: plt.rcParams["axes.labelweight"] = "normal" plt.rcParams["axes.titleweight"] = "normal" diff --git a/aaanalysis/protein_engineering/_seqmut.py b/aaanalysis/protein_engineering/_seqmut.py index 7579aa800..d89422374 100644 --- a/aaanalysis/protein_engineering/_seqmut.py +++ b/aaanalysis/protein_engineering/_seqmut.py @@ -281,12 +281,14 @@ def mutate(self, df_plan[ut.COL_TMD_STOP])] df_scored = self._delta_table(df_plan=df_plan, df_seq=df_seq, df_feat=df_feat, jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len) - df_scored = df_scored.set_index(ut.COL_MUTATION) - df_mut[ut.COL_DELTA_CPP] = df_scored.loc[df_mut[ut.COL_MUTATION], ut.COL_DELTA_CPP].to_numpy() - df_mut[ut.COL_SHIFT_SCORE] = df_scored.loc[df_mut[ut.COL_MUTATION], ut.COL_SHIFT_SCORE].to_numpy() + # build_scan_output sorts by delta_cpp, so re-join the scored rows back to the + # mutation order using (entry, mutation) — the label alone is not unique across entries. + df_scored = df_scored.set_index([ut.COL_ENTRY, ut.COL_MUTATION]) + keys = list(zip(df_mut[ut.COL_ENTRY], df_mut[ut.COL_MUTATION])) + df_mut[ut.COL_DELTA_CPP] = df_scored.loc[keys, ut.COL_DELTA_CPP].to_numpy() + df_mut[ut.COL_SHIFT_SCORE] = df_scored.loc[keys, ut.COL_SHIFT_SCORE].to_numpy() if self._model is not None: - df_mut[ut.COL_DELTA_PRED] = df_scored.loc[df_mut[ut.COL_MUTATION], - ut.COL_DELTA_PRED].to_numpy() + df_mut[ut.COL_DELTA_PRED] = df_scored.loc[keys, ut.COL_DELTA_PRED].to_numpy() return df_mut def scan(self, diff --git a/aaanalysis/seq_analysis_pro/_backend/comp_seq_sim.py b/aaanalysis/seq_analysis_pro/_backend/comp_seq_sim.py index 81545a8dc..868817065 100644 --- a/aaanalysis/seq_analysis_pro/_backend/comp_seq_sim.py +++ b/aaanalysis/seq_analysis_pro/_backend/comp_seq_sim.py @@ -37,9 +37,9 @@ def comp_pw_seq_sim_(df_seq=None): sim_score = comp_seq_sim_(seq1=seq1, seq2=seq2) df_pw_sim.at[id1, id2] = sim_score df_pw_sim.at[id2, id1] = sim_score - # Fill diagonal with 1s for self-similarity + # Fill diagonal with 100 for self-similarity (matches the [0, 100] scale of off-diagonal cells) arr = df_pw_sim.to_numpy(copy=True) - np.fill_diagonal(arr, 1) + np.fill_diagonal(arr, 100) df_pw_sim.iloc[:, :] = arr df_pw_sim = df_pw_sim.round(4) return df_pw_sim diff --git a/aaanalysis/seq_analysis_pro/_comp_seq_sim.py b/aaanalysis/seq_analysis_pro/_comp_seq_sim.py index 271022983..070f2f86b 100644 --- a/aaanalysis/seq_analysis_pro/_comp_seq_sim.py +++ b/aaanalysis/seq_analysis_pro/_comp_seq_sim.py @@ -63,7 +63,7 @@ def comp_seq_sim(seq1: Optional[str] = None, ut.check_df(name="df_seq", df=df_seq, accept_none=False, accept_nan=False, cols_required=[ut.COL_SEQ, ut.COL_ENTRY]) for entry, seq in zip(df_seq[ut.COL_ENTRY], df_seq[ut.COL_SEQ]): - ut.check_str(name=f"sequence ({entry}", val=seq, accept_none=False) + ut.check_str(name=f"sequence ({entry})", val=seq, accept_none=False) if df_seq is None: # Compute similarity seq_sim = comp_seq_sim_(seq1=seq1, seq2=seq2) diff --git a/aaanalysis/show_html/_display_df.py b/aaanalysis/show_html/_display_df.py index 13e137741..3a478c134 100644 --- a/aaanalysis/show_html/_display_df.py +++ b/aaanalysis/show_html/_display_df.py @@ -39,7 +39,7 @@ def _check_show(name="row_to_show", val=None, df=None): if val not in rows_or_columns: raise ValueError(f"'{name}' ('{val}') should be one of: {rows_or_columns}") elif isinstance(val, int): - ut.check_number_range(name=name, val=val, accept_none=True, min_val=0, max_val=n, just_int=True) + ut.check_number_range(name=name, val=val, accept_none=True, min_val=0, max_val=n - 1, just_int=True) else: raise ValueError(f"'{name}' ('{val}') should be int (<{n}) or one of following {str_row_or_column} names: {rows_or_columns}") diff --git a/tests/unit/aaclust_tests/test_aac_branch.py b/tests/unit/aaclust_tests/test_aac_branch.py index 6a38a72a5..df3ece4b2 100644 --- a/tests/unit/aaclust_tests/test_aac_branch.py +++ b/tests/unit/aaclust_tests/test_aac_branch.py @@ -39,7 +39,7 @@ def test_n_clusters_exceeds_n_unique_samples(self): X = np.array([[1.0, 2.0], [1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]) aac = aa.AAclust(verbose=False, random_state=42) - with pytest.raises(ValueError, match="should be >= n_unique_samples"): + with pytest.raises(ValueError, match="should be <= n_unique_samples"): aac.fit(X, n_clusters=5) @given(n_clusters=some.integers(min_value=2, max_value=6)) diff --git a/tests/unit/test_correctness_batch_342.py b/tests/unit/test_correctness_batch_342.py new file mode 100644 index 000000000..4bebde1e6 --- /dev/null +++ b/tests/unit/test_correctness_batch_342.py @@ -0,0 +1,97 @@ +"""Regression tests for the correctness audit batch (issue #342). + +Each test pins one fix from the batch so the defect cannot silently return. +The config.options fixes (verbose/random_state/name_tmd/LOKY) are intentionally +excluded here — config.py is a CONFIRM-FIRST surface handled separately. +""" +import numpy as np +import pandas as pd +import pytest + +import aaanalysis as aa +import aaanalysis.utils as ut + + +# --- A2: load_dataset(non_canonical_aa="gap") must not corrupt the shared cache --- +def test_load_dataset_gap_does_not_corrupt_cache(): + name = "SEQ_CAPSID" # contains non-canonical amino acids (B, U, X) + df_gap = aa.load_dataset(name=name, non_canonical_aa="gap") + assert df_gap[ut.COL_SEQ].str.contains(ut.STR_AA_GAP, regex=False).any(), \ + "test dataset must contain non-canonical AAs so the gap path is exercised" + df_keep = aa.load_dataset(name=name, non_canonical_aa="keep") + assert not df_keep[ut.COL_SEQ].str.contains(ut.STR_AA_GAP, regex=False).any(), \ + "'keep' returned gapped sequences -> the earlier 'gap' call corrupted the cache" + + +# --- A9: load_features must return a fresh copy, not the shared cached object --- +def test_load_features_returns_independent_copy(): + d1 = aa.load_features(name="DOM_GSEC") + d2 = aa.load_features(name="DOM_GSEC") + assert d1 is not d2 + d1.iloc[0, 0] = "__SENTINEL__" + d3 = aa.load_features(name="DOM_GSEC") + assert d3.iloc[0, 0] != "__SENTINEL__" + + +# --- A10: read_fasta -> clear ValueError on pre-header text; leading blank is skipped --- +def test_read_fasta_preheader_text_raises_valueerror(tmp_path): + bad = tmp_path / "bad.fasta" + bad.write_text("junk before header\n>A\nMKV\n") + with pytest.raises(ValueError): + aa.read_fasta(file_path=str(bad)) + + +def test_read_fasta_leading_blank_line_ok(tmp_path): + ok = tmp_path / "ok.fasta" + ok.write_text("\n>A\nMKV\n>B\nAAA\n") + df = aa.read_fasta(file_path=str(ok)) + assert len(df) == 2 + + +# --- A14: comp_seq_sim self-similarity diagonal on the [0, 100] scale --- +def test_comp_seq_sim_diagonal_is_100(): + pytest.importorskip("Bio") + df_seq = pd.DataFrame({ut.COL_ENTRY: ["P1", "P2"], + ut.COL_SEQ: ["ACDEFGHIKL", "ACDEFGHIKM"]}) + res = aa.comp_seq_sim(df_seq=df_seq) + diag = np.diag(np.asarray(res, dtype=float)) + assert np.allclose(diag, 100.0) + + +# --- A16: get_best_n_clusters must not return 0 for a single-feature set (KMeans(0)) --- +def test_get_best_n_clusters_single_feature(): + from aaanalysis.feature_engineering._backend.cpp.cpp_eval import get_best_n_clusters + X = np.array([[0.1, 0.2, 0.3, 0.4]]) # one feature (row) + assert get_best_n_clusters(X=X, min_th=0.3, random_state=0) >= 1 + + +# --- A17: wrong-length marker_size list -> ValueError, not a later IndexError --- +def test_check_marker_size_wrong_length_raises_valueerror(): + from aaanalysis._utils.plotting import _check_marker_size + with pytest.raises(ValueError): + _check_marker_size(marker_size=[10, 12], list_cat=["a", "b", "c"]) + assert _check_marker_size(marker_size=[10, 12, 14], list_cat=["a", "b", "c"]) == [10, 12, 14] + + +# --- A18: display_df row/col selector is 0..n-1 (off-by-one) --- +def test_display_df_out_of_bounds_selector_raises(): + df = pd.DataFrame({"x": [1, 2, 3]}) + with pytest.raises(ValueError): + aa.display_df(df=df, row_to_show=3) # valid rows are 0..2 + + +# --- A25: check_match_X_n_clusters states the correct inequality --- +def test_aaclust_n_clusters_message_uses_leq(): + from aaanalysis.feature_engineering._aaclust import check_match_X_n_clusters + X = np.array([[1.0, 2.0], [1.0, 2.0], [1.0, 2.0]]) # 3 samples, 1 unique + with pytest.raises(ValueError) as e: + check_match_X_n_clusters(X=X, n_clusters=2) # n_samples>=n_clusters, n_unique Date: Sat, 4 Jul 2026 16:41:22 +0200 Subject: [PATCH 2/5] fix(config): validate options candidates + name_tmd; undo own LOKY cap Completes the config.options group of #342 (CONFIRM-FIRST, approved): - _check_option validates the incoming verbose/random_state candidate directly instead of routing through the runtime resolvers, which read the current global and skipped validating a new value once one was set. - name_tmd is now validated like name_jmd_n / name_jmd_c (any name_* -> str). - check_n_jobs undoes only the loky CPU cap it set itself when allow_multiprocessing is re-enabled, so it is not left stuck at 1 and a user's own LOKY_MAX_CPU_COUNT is never clobbered. Tests added to tests/unit/test_correctness_batch_342.py. Co-Authored-By: Claude Opus 4.8 --- aaanalysis/config.py | 32 +++++++++++----- tests/unit/test_correctness_batch_342.py | 47 ++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 9 deletions(-) diff --git a/aaanalysis/config.py b/aaanalysis/config.py index 45bdfa667..538819aad 100644 --- a/aaanalysis/config.py +++ b/aaanalysis/config.py @@ -27,6 +27,11 @@ } +# Tracks whether allow_multiprocessing=False set the loky CPU cap, so re-enabling +# multiprocessing can undo exactly our own cap (never a user's own env setting). +_loky_capped_by_options = False + + # Check system level (option) parameters or depending on parameters def check_verbose(verbose=None): """Check if general verbosity is on or off. Adjusted based on options setting and value provided to object""" @@ -67,12 +72,18 @@ def check_n_jobs(n_jobs=None): global_n_jobs = options["n_jobs"] if global_n_jobs != "off": n_jobs = global_n_jobs + global _loky_capped_by_options allow_multiprocessing = options["allow_multiprocessing"] check_bool(name="allow_multiprocessing (options)", val=allow_multiprocessing) - # Disable multiprocessing + # Disable multiprocessing (and undo only our own cap once it is re-enabled, so the + # loky CPU count is not left stuck at 1 and a user's own env var is never clobbered) if not allow_multiprocessing: n_jobs = 1 os.environ['LOKY_MAX_CPU_COUNT'] = "1" + _loky_capped_by_options = True + elif _loky_capped_by_options: + os.environ.pop('LOKY_MAX_CPU_COUNT', None) + _loky_capped_by_options = False # Set n_jobs to maximum number of CPUs if n_jobs == -1: n_jobs = os.cpu_count() @@ -130,10 +141,13 @@ def _check_option(name_option="", option=None): """Check if option is valid""" if name_option == "verbose": if option != "off": - check_verbose(verbose=option) + # Validate the incoming candidate directly (check_verbose resolves against + # the current global and would skip validating a new value once one is set). + check_bool(name=name_option, val=option) if name_option == "random_state": if option != "off": - check_random_state(random_state=option) + check_number_range(name=name_option, val=option, min_val=0, + accept_none=True, just_int=True) if name_option == "n_jobs": if option != "off": # Concrete override: -1 (all cores) or a positive int. None is not a @@ -144,12 +158,12 @@ def _check_option(name_option="", option=None): accept_none=False, just_int=True) if name_option == "allow_multiprocessing": check_bool(name=name_option, val=option) - if "jmd" in name_option: - if "len" in name_option: - check_number_range(name=name_option, val=option, - min_val=0, accept_none=True, just_int=True) - if "name" in name_option: - check_str(name=name_option, val=option, accept_none=False) + if "jmd" in name_option and "len" in name_option: + check_number_range(name=name_option, val=option, + min_val=0, accept_none=True, just_int=True) + if "name" in name_option: + # Covers name_tmd, name_jmd_n, name_jmd_c + check_str(name=name_option, val=option, accept_none=False) if name_option == "ext_len": check_number_range(name=name_option, val=option, min_val=0, accept_none=False, just_int=True) if "df" in name_option: diff --git a/tests/unit/test_correctness_batch_342.py b/tests/unit/test_correctness_batch_342.py index 4bebde1e6..07d1ac1ba 100644 --- a/tests/unit/test_correctness_batch_342.py +++ b/tests/unit/test_correctness_batch_342.py @@ -95,3 +95,50 @@ def test_check_metric_message_no_none(): with pytest.raises(ValueError) as e: check_metric(metric="not-a-metric") assert "None" not in str(e.value) + + +# --- A5: options validation must check the incoming candidate, not the current global --- +def test_options_validation_not_bypassed_after_value_set(): + try: + aa.options["verbose"] = True + with pytest.raises(ValueError): + aa.options["verbose"] = "garbage" + assert aa.options["verbose"] is True # garbage was rejected, not stored + aa.options["random_state"] = 7 + with pytest.raises(ValueError): + aa.options["random_state"] = "garbage" + assert aa.options["random_state"] == 7 + finally: + aa.options["verbose"] = "off" + aa.options["random_state"] = "off" + + +# --- A20: name_tmd must be validated like the other name_* options --- +def test_options_name_tmd_is_validated(): + try: + with pytest.raises(ValueError): + aa.options["name_tmd"] = 123 + assert aa.options["name_tmd"] == "TMD" + finally: + aa.options["name_tmd"] = "TMD" + + +# --- A12: re-enabling allow_multiprocessing must not leave LOKY_MAX_CPU_COUNT stuck at 1 --- +def test_allow_multiprocessing_reenable_clears_loky_cap(): + import os + from aaanalysis.config import check_n_jobs + prev = os.environ.get("LOKY_MAX_CPU_COUNT") + try: + aa.options["allow_multiprocessing"] = False + check_n_jobs(n_jobs=1) + assert os.environ.get("LOKY_MAX_CPU_COUNT") == "1" + aa.options["allow_multiprocessing"] = True + check_n_jobs(n_jobs=1) + assert os.environ.get("LOKY_MAX_CPU_COUNT") != "1" + finally: + aa.options["allow_multiprocessing"] = True + check_n_jobs(n_jobs=1) # reset the internal cap flag + if prev is None: + os.environ.pop("LOKY_MAX_CPU_COUNT", None) + else: + os.environ["LOKY_MAX_CPU_COUNT"] = prev From ae6406c1f4016154e53a7f05224dbd4cc95d7d92 Mon Sep 17 00:00:00 2001 From: Stephan Breimann Date: Sat, 4 Jul 2026 17:12:48 +0200 Subject: [PATCH 3/5] fix(review): address code-review findings on the correctness batch - SeqMut.mutate: score in mutation order (build_scan_output/_delta_table gain a sort=False path) and assign positionally, so duplicate mutation rows no longer desync/crash the previous (entry, mutation) re-join. - config check_n_jobs: on allow_multiprocessing=False remember the user's prior LOKY_MAX_CPU_COUNT and restore it on re-enable (was overwritten with '1' then popped, losing the user's value). - Revert three rendered-output plot changes that don't belong in a no-output- change batch (feature_map importance-bar ticks, profile y-axis padding, weight_bold spine/tick widths); deferred to the output-changing decision set. comp_seq_sim's diagonal correction (1 -> 100 on the [0,100] scale) is kept as the sole intentional value fix (pro function, unanchored, no test pinned it). Tests: SeqMut duplicate-row regression + LOKY save/restore assertion. Co-Authored-By: Claude Opus 4.8 --- aaanalysis/config.py | 21 ++++++++++++------- .../_backend/cpp/cpp_plot_feature_map.py | 2 +- .../_backend/cpp/cpp_plot_profile.py | 2 +- aaanalysis/plotting/_plot_settings.py | 5 ----- .../_backend/seqmut/seqmut.py | 8 +++++-- aaanalysis/protein_engineering/_seqmut.py | 21 +++++++++---------- .../test_seqmut_mutate.py | 8 +++++++ tests/unit/test_correctness_batch_342.py | 21 ++++++++++++------- 8 files changed, 53 insertions(+), 35 deletions(-) diff --git a/aaanalysis/config.py b/aaanalysis/config.py index 538819aad..34fe966b0 100644 --- a/aaanalysis/config.py +++ b/aaanalysis/config.py @@ -27,9 +27,10 @@ } -# Tracks whether allow_multiprocessing=False set the loky CPU cap, so re-enabling -# multiprocessing can undo exactly our own cap (never a user's own env setting). +# Tracks whether allow_multiprocessing=False set the loky CPU cap, and the user's prior +# LOKY_MAX_CPU_COUNT value, so re-enabling multiprocessing restores it (never loses it). _loky_capped_by_options = False +_loky_prev_value = None # Check system level (option) parameters or depending on parameters @@ -72,18 +73,24 @@ def check_n_jobs(n_jobs=None): global_n_jobs = options["n_jobs"] if global_n_jobs != "off": n_jobs = global_n_jobs - global _loky_capped_by_options + global _loky_capped_by_options, _loky_prev_value allow_multiprocessing = options["allow_multiprocessing"] check_bool(name="allow_multiprocessing (options)", val=allow_multiprocessing) - # Disable multiprocessing (and undo only our own cap once it is re-enabled, so the - # loky CPU count is not left stuck at 1 and a user's own env var is never clobbered) + # Cap loky when multiprocessing is disabled, remembering the user's prior value so it is + # restored (not lost) once multiprocessing is re-enabled on the next parallel-capable call. if not allow_multiprocessing: n_jobs = 1 + if not _loky_capped_by_options: + _loky_prev_value = os.environ.get('LOKY_MAX_CPU_COUNT') + _loky_capped_by_options = True os.environ['LOKY_MAX_CPU_COUNT'] = "1" - _loky_capped_by_options = True elif _loky_capped_by_options: - os.environ.pop('LOKY_MAX_CPU_COUNT', None) + if _loky_prev_value is None: + os.environ.pop('LOKY_MAX_CPU_COUNT', None) + else: + os.environ['LOKY_MAX_CPU_COUNT'] = _loky_prev_value _loky_capped_by_options = False + _loky_prev_value = None # Set n_jobs to maximum number of CPUs if n_jobs == -1: n_jobs = os.cpu_count() diff --git a/aaanalysis/feature_engineering/_backend/cpp/cpp_plot_feature_map.py b/aaanalysis/feature_engineering/_backend/cpp/cpp_plot_feature_map.py index bffefbd3b..4041e00c9 100644 --- a/aaanalysis/feature_engineering/_backend/cpp/cpp_plot_feature_map.py +++ b/aaanalysis/feature_engineering/_backend/cpp/cpp_plot_feature_map.py @@ -318,7 +318,7 @@ def plot_feature_map(df_feat=None, df_cat=None, position=position_bar, multialignment=multialignment_bar, weight_annotation="bold") - show_only_max = imp_bar_label_type != "long" + show_only_max = add_imp_bar_top != "long" args_ticks_0 = dict(show_zero=False, show_only_max=show_only_max, precision=1) ut.ticks_0(ax_br, **args_ticks_0) diff --git a/aaanalysis/feature_engineering/_backend/cpp/cpp_plot_profile.py b/aaanalysis/feature_engineering/_backend/cpp/cpp_plot_profile.py index 7ef6eb86b..dd645a485 100644 --- a/aaanalysis/feature_engineering/_backend/cpp/cpp_plot_profile.py +++ b/aaanalysis/feature_engineering/_backend/cpp/cpp_plot_profile.py @@ -107,7 +107,7 @@ def _plot_profile(df_pos=None, shap_plot=False, ax=None, ax.set_xlim(x_lim) if ylim is None: ymin, ymax = ax.get_ylim() - y_space = max(0, (ymax - ymin) * 0.25) + y_space = min(0, (ymax - ymin) * 0.25) y_lim = (ymin - y_space, ymax + y_space) ax.set_ylim(y_lim) diff --git a/aaanalysis/plotting/_plot_settings.py b/aaanalysis/plotting/_plot_settings.py index b8993a148..d3cecf778 100644 --- a/aaanalysis/plotting/_plot_settings.py +++ b/aaanalysis/plotting/_plot_settings.py @@ -161,11 +161,6 @@ def plot_settings(font_scale: Union[int, float] = 1, if weight_bold: plt.rcParams["axes.labelweight"] = "bold" plt.rcParams["axes.titleweight"] = "bold" - plt.rcParams["axes.linewidth"] = 1.25 - plt.rcParams["xtick.major.width"] = 1.1 - plt.rcParams["xtick.minor.width"] = 0.9 - plt.rcParams["ytick.major.width"] = 1.1 - plt.rcParams["ytick.minor.width"] = 0.9 else: plt.rcParams["axes.labelweight"] = "normal" plt.rcParams["axes.titleweight"] = "normal" diff --git a/aaanalysis/protein_engineering/_backend/seqmut/seqmut.py b/aaanalysis/protein_engineering/_backend/seqmut/seqmut.py index c036e95da..c2e03341a 100644 --- a/aaanalysis/protein_engineering/_backend/seqmut/seqmut.py +++ b/aaanalysis/protein_engineering/_backend/seqmut/seqmut.py @@ -191,7 +191,7 @@ def comp_scan_scores(dX=None, mean_dif=None, weight_vec=None): def build_scan_output(df_plan=None, delta_cpp=None, shift_score=None, - delta_pred=None, wt_pred=None, wt_pred_std=None): + delta_pred=None, wt_pred=None, wt_pred_std=None, sort=True): """Assemble the tidy scan output DataFrame, sorted by descending |ΔCPP|. When ``delta_pred`` is given (a model is bound), the model prediction-shift columns @@ -209,7 +209,11 @@ def build_scan_output(df_plan=None, delta_cpp=None, shift_score=None, df_out[ut.COL_WT_PRED_STD] = wt_pred_std cols = cols + [ut.COL_DELTA_PRED, ut.COL_WT_PRED, ut.COL_WT_PRED_STD] df_out = df_out[cols] - df_out = df_out.sort_values(ut.COL_DELTA_CPP, ascending=False).reset_index(drop=True) + if sort: + df_out = df_out.sort_values(ut.COL_DELTA_CPP, ascending=False).reset_index(drop=True) + else: + # Keep df_plan row order so callers can align results positionally (no re-join). + df_out = df_out.reset_index(drop=True) return df_out diff --git a/aaanalysis/protein_engineering/_seqmut.py b/aaanalysis/protein_engineering/_seqmut.py index d89422374..cb02dac62 100644 --- a/aaanalysis/protein_engineering/_seqmut.py +++ b/aaanalysis/protein_engineering/_seqmut.py @@ -186,7 +186,7 @@ def __init__(self, # Helper methods def _delta_table(self, df_plan=None, df_seq=None, df_feat=None, jmd_n_len=10, jmd_c_len=10, - weight=None): + weight=None, sort=True): """Run the ΔCPP (+ model ΔP) engine for a mutation plan and return the scored output.""" features = list(df_feat[ut.COL_FEATURE]) mean_dif = df_feat[ut.COL_MEAN_DIF].to_numpy(dtype=float) @@ -201,8 +201,9 @@ def _delta_table(self, df_plan=None, df_seq=None, df_feat=None, jmd_n_len=10, jm X_wt=X_wt, X_mut=X_mut, wt_rows=wt_rows, model=self._model, target_class=self._target_class) return build_scan_output(df_plan=df_plan, delta_cpp=delta_cpp, shift_score=shift_score, - delta_pred=delta_pred, wt_pred=wt_pred, wt_pred_std=wt_pred_std) - return build_scan_output(df_plan=df_plan, delta_cpp=delta_cpp, shift_score=shift_score) + delta_pred=delta_pred, wt_pred=wt_pred, wt_pred_std=wt_pred_std, + sort=sort) + return build_scan_output(df_plan=df_plan, delta_cpp=delta_cpp, shift_score=shift_score, sort=sort) # Main methods def mutate(self, @@ -279,16 +280,14 @@ def mutate(self, for p, ts, te in zip(df_plan[ut.COL_POS], df_plan[ut.COL_TMD_START], df_plan[ut.COL_TMD_STOP])] + # Score in mutation order (sort=False) so results align row-for-row with df_mut; + # no label re-join, so duplicate mutation rows can never desync or crash. df_scored = self._delta_table(df_plan=df_plan, df_seq=df_seq, df_feat=df_feat, - jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len) - # build_scan_output sorts by delta_cpp, so re-join the scored rows back to the - # mutation order using (entry, mutation) — the label alone is not unique across entries. - df_scored = df_scored.set_index([ut.COL_ENTRY, ut.COL_MUTATION]) - keys = list(zip(df_mut[ut.COL_ENTRY], df_mut[ut.COL_MUTATION])) - df_mut[ut.COL_DELTA_CPP] = df_scored.loc[keys, ut.COL_DELTA_CPP].to_numpy() - df_mut[ut.COL_SHIFT_SCORE] = df_scored.loc[keys, ut.COL_SHIFT_SCORE].to_numpy() + jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len, sort=False) + df_mut[ut.COL_DELTA_CPP] = df_scored[ut.COL_DELTA_CPP].to_numpy() + df_mut[ut.COL_SHIFT_SCORE] = df_scored[ut.COL_SHIFT_SCORE].to_numpy() if self._model is not None: - df_mut[ut.COL_DELTA_PRED] = df_scored.loc[keys, ut.COL_DELTA_PRED].to_numpy() + df_mut[ut.COL_DELTA_PRED] = df_scored[ut.COL_DELTA_PRED].to_numpy() return df_mut def scan(self, diff --git a/tests/unit/protein_engineering_tests/test_seqmut_mutate.py b/tests/unit/protein_engineering_tests/test_seqmut_mutate.py index 595325799..2ee73b148 100644 --- a/tests/unit/protein_engineering_tests/test_seqmut_mutate.py +++ b/tests/unit/protein_engineering_tests/test_seqmut_mutate.py @@ -45,6 +45,14 @@ def test_without_df_feat_no_delta(self, df_seq_pos): df = aa.SeqMut().mutate(df_seq=df_seq_pos, mutations=_muts()) assert ut.COL_DELTA_CPP not in df.columns + def test_with_df_feat_duplicate_rows_no_crash(self, df_seq_pos, df_feat): + # Two identical (entry, pos, to_aa) rows share the same mutation label; the scored + # results must still align row-for-row (no non-unique re-join crash). + muts = _muts(("P1", "P1"), (12, 12), ("K", "K")) + df = aa.SeqMut().mutate(df_seq=df_seq_pos, mutations=muts, df_feat=df_feat) + assert len(df) == 2 + assert df[ut.COL_DELTA_CPP].iloc[0] == pytest.approx(df[ut.COL_DELTA_CPP].iloc[1], abs=1e-12) + def test_jmd_n_len(self, df_seq_pos, df_feat): df = aa.SeqMut().mutate(df_seq=df_seq_pos, mutations=_muts(("P1",), (12,), ("K",)), df_feat=df_feat, jmd_n_len=8) diff --git a/tests/unit/test_correctness_batch_342.py b/tests/unit/test_correctness_batch_342.py index 07d1ac1ba..38e78a6d6 100644 --- a/tests/unit/test_correctness_batch_342.py +++ b/tests/unit/test_correctness_batch_342.py @@ -123,21 +123,26 @@ def test_options_name_tmd_is_validated(): aa.options["name_tmd"] = "TMD" -# --- A12: re-enabling allow_multiprocessing must not leave LOKY_MAX_CPU_COUNT stuck at 1 --- -def test_allow_multiprocessing_reenable_clears_loky_cap(): +# --- A12: disabling then re-enabling allow_multiprocessing must restore (not lose) the +# user's own LOKY_MAX_CPU_COUNT, and never leave it stuck at "1" --- +def test_allow_multiprocessing_restores_user_loky_value(): import os - from aaanalysis.config import check_n_jobs + from aaanalysis import config as _cfg prev = os.environ.get("LOKY_MAX_CPU_COUNT") try: + # Start from a clean, enabled state (reset the module cap flag). + aa.options["allow_multiprocessing"] = True + _cfg.check_n_jobs(n_jobs=1) + os.environ["LOKY_MAX_CPU_COUNT"] = "4" # the user's own cap aa.options["allow_multiprocessing"] = False - check_n_jobs(n_jobs=1) - assert os.environ.get("LOKY_MAX_CPU_COUNT") == "1" + _cfg.check_n_jobs(n_jobs=1) + assert os.environ.get("LOKY_MAX_CPU_COUNT") == "1" # our cap while disabled aa.options["allow_multiprocessing"] = True - check_n_jobs(n_jobs=1) - assert os.environ.get("LOKY_MAX_CPU_COUNT") != "1" + _cfg.check_n_jobs(n_jobs=1) + assert os.environ.get("LOKY_MAX_CPU_COUNT") == "4" # user's value restored, not lost finally: aa.options["allow_multiprocessing"] = True - check_n_jobs(n_jobs=1) # reset the internal cap flag + _cfg.check_n_jobs(n_jobs=1) if prev is None: os.environ.pop("LOKY_MAX_CPU_COUNT", None) else: From e1db3bcf6113ca71de4980cf3fd9fa7393679332 Mon Sep 17 00:00:00 2001 From: Stephan Breimann Date: Sat, 4 Jul 2026 18:13:38 +0200 Subject: [PATCH 4/5] fix(config): don't clobber a user's LOKY_MAX_CPU_COUNT set during the disabled window Re-review follow-up: check_n_jobs only undoes its own cap on re-enable if the value is still '1'; if the user set their own LOKY_MAX_CPU_COUNT (e.g. for another loky/ joblib library) while multiprocessing was disabled, it is left untouched. Co-Authored-By: Claude Opus 4.8 --- aaanalysis/config.py | 12 ++++++++---- tests/unit/test_correctness_batch_342.py | 23 +++++++++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/aaanalysis/config.py b/aaanalysis/config.py index 34fe966b0..7407c511d 100644 --- a/aaanalysis/config.py +++ b/aaanalysis/config.py @@ -85,10 +85,14 @@ def check_n_jobs(n_jobs=None): _loky_capped_by_options = True os.environ['LOKY_MAX_CPU_COUNT'] = "1" elif _loky_capped_by_options: - if _loky_prev_value is None: - os.environ.pop('LOKY_MAX_CPU_COUNT', None) - else: - os.environ['LOKY_MAX_CPU_COUNT'] = _loky_prev_value + # Only undo our own cap if it is still in place. If the user set their own + # LOKY_MAX_CPU_COUNT (e.g. for another loky/joblib library) while multiprocessing + # was disabled, the value is no longer "1" -> leave it untouched. + if os.environ.get('LOKY_MAX_CPU_COUNT') == "1": + if _loky_prev_value is None: + os.environ.pop('LOKY_MAX_CPU_COUNT', None) + else: + os.environ['LOKY_MAX_CPU_COUNT'] = _loky_prev_value _loky_capped_by_options = False _loky_prev_value = None # Set n_jobs to maximum number of CPUs diff --git a/tests/unit/test_correctness_batch_342.py b/tests/unit/test_correctness_batch_342.py index 38e78a6d6..e5f631924 100644 --- a/tests/unit/test_correctness_batch_342.py +++ b/tests/unit/test_correctness_batch_342.py @@ -147,3 +147,26 @@ def test_allow_multiprocessing_restores_user_loky_value(): os.environ.pop("LOKY_MAX_CPU_COUNT", None) else: os.environ["LOKY_MAX_CPU_COUNT"] = prev + + +# --- A12 (cont.): a value the user sets DURING the disabled window must not be clobbered --- +def test_allow_multiprocessing_reenable_preserves_user_change_during_disable(): + import os + from aaanalysis import config as _cfg + prev = os.environ.get("LOKY_MAX_CPU_COUNT") + try: + aa.options["allow_multiprocessing"] = True + _cfg.check_n_jobs(n_jobs=1) # clean flag state + aa.options["allow_multiprocessing"] = False + _cfg.check_n_jobs(n_jobs=1) # our cap -> "1" + os.environ["LOKY_MAX_CPU_COUNT"] = "32" # user sets their own value while disabled + aa.options["allow_multiprocessing"] = True + _cfg.check_n_jobs(n_jobs=1) + assert os.environ.get("LOKY_MAX_CPU_COUNT") == "32" # user's value not clobbered + finally: + aa.options["allow_multiprocessing"] = True + _cfg.check_n_jobs(n_jobs=1) + if prev is None: + os.environ.pop("LOKY_MAX_CPU_COUNT", None) + else: + os.environ["LOKY_MAX_CPU_COUNT"] = prev From dc4289c985648fa846738c0478aff55a797345f5 Mon Sep 17 00:00:00 2001 From: Stephan Breimann Date: Sat, 4 Jul 2026 22:40:08 +0200 Subject: [PATCH 5/5] chore(test): drop issue/finding-id references from correctness-batch test comments Per PR review: the test comments no longer cite the issue number or internal audit finding IDs; each test's comment now plainly states what it pins. Co-Authored-By: Claude Opus 4.8 --- tests/unit/test_correctness_batch_342.py | 36 +++++++++++------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/tests/unit/test_correctness_batch_342.py b/tests/unit/test_correctness_batch_342.py index e5f631924..1c6cda97c 100644 --- a/tests/unit/test_correctness_batch_342.py +++ b/tests/unit/test_correctness_batch_342.py @@ -1,9 +1,5 @@ -"""Regression tests for the correctness audit batch (issue #342). - -Each test pins one fix from the batch so the defect cannot silently return. -The config.options fixes (verbose/random_state/name_tmd/LOKY) are intentionally -excluded here — config.py is a CONFIRM-FIRST surface handled separately. -""" +"""Regression tests pinning a batch of low-risk correctness fixes so each defect +cannot silently return.""" import numpy as np import pandas as pd import pytest @@ -12,7 +8,7 @@ import aaanalysis.utils as ut -# --- A2: load_dataset(non_canonical_aa="gap") must not corrupt the shared cache --- +# load_dataset(non_canonical_aa="gap") must not corrupt the shared cache def test_load_dataset_gap_does_not_corrupt_cache(): name = "SEQ_CAPSID" # contains non-canonical amino acids (B, U, X) df_gap = aa.load_dataset(name=name, non_canonical_aa="gap") @@ -23,7 +19,7 @@ def test_load_dataset_gap_does_not_corrupt_cache(): "'keep' returned gapped sequences -> the earlier 'gap' call corrupted the cache" -# --- A9: load_features must return a fresh copy, not the shared cached object --- +# load_features must return a fresh copy, not the shared cached object def test_load_features_returns_independent_copy(): d1 = aa.load_features(name="DOM_GSEC") d2 = aa.load_features(name="DOM_GSEC") @@ -33,7 +29,7 @@ def test_load_features_returns_independent_copy(): assert d3.iloc[0, 0] != "__SENTINEL__" -# --- A10: read_fasta -> clear ValueError on pre-header text; leading blank is skipped --- +# read_fasta -> clear ValueError on pre-header text; leading blank is skipped def test_read_fasta_preheader_text_raises_valueerror(tmp_path): bad = tmp_path / "bad.fasta" bad.write_text("junk before header\n>A\nMKV\n") @@ -48,7 +44,7 @@ def test_read_fasta_leading_blank_line_ok(tmp_path): assert len(df) == 2 -# --- A14: comp_seq_sim self-similarity diagonal on the [0, 100] scale --- +# comp_seq_sim self-similarity diagonal on the [0, 100] scale def test_comp_seq_sim_diagonal_is_100(): pytest.importorskip("Bio") df_seq = pd.DataFrame({ut.COL_ENTRY: ["P1", "P2"], @@ -58,14 +54,14 @@ def test_comp_seq_sim_diagonal_is_100(): assert np.allclose(diag, 100.0) -# --- A16: get_best_n_clusters must not return 0 for a single-feature set (KMeans(0)) --- +# get_best_n_clusters must not return 0 for a single-feature set (KMeans(0)) def test_get_best_n_clusters_single_feature(): from aaanalysis.feature_engineering._backend.cpp.cpp_eval import get_best_n_clusters X = np.array([[0.1, 0.2, 0.3, 0.4]]) # one feature (row) assert get_best_n_clusters(X=X, min_th=0.3, random_state=0) >= 1 -# --- A17: wrong-length marker_size list -> ValueError, not a later IndexError --- +# wrong-length marker_size list -> ValueError, not a later IndexError def test_check_marker_size_wrong_length_raises_valueerror(): from aaanalysis._utils.plotting import _check_marker_size with pytest.raises(ValueError): @@ -73,14 +69,14 @@ def test_check_marker_size_wrong_length_raises_valueerror(): assert _check_marker_size(marker_size=[10, 12, 14], list_cat=["a", "b", "c"]) == [10, 12, 14] -# --- A18: display_df row/col selector is 0..n-1 (off-by-one) --- +# display_df row/col selector is 0..n-1 (off-by-one) def test_display_df_out_of_bounds_selector_raises(): df = pd.DataFrame({"x": [1, 2, 3]}) with pytest.raises(ValueError): aa.display_df(df=df, row_to_show=3) # valid rows are 0..2 -# --- A25: check_match_X_n_clusters states the correct inequality --- +# check_match_X_n_clusters states the correct inequality def test_aaclust_n_clusters_message_uses_leq(): from aaanalysis.feature_engineering._aaclust import check_match_X_n_clusters X = np.array([[1.0, 2.0], [1.0, 2.0], [1.0, 2.0]]) # 3 samples, 1 unique @@ -89,7 +85,7 @@ def test_aaclust_n_clusters_message_uses_leq(): assert "<=" in str(e.value) -# --- A26: check_metric message no longer advertises None --- +# check_metric message no longer advertises None def test_check_metric_message_no_none(): from aaanalysis.feature_engineering._backend.check_aaclust import check_metric with pytest.raises(ValueError) as e: @@ -97,7 +93,7 @@ def test_check_metric_message_no_none(): assert "None" not in str(e.value) -# --- A5: options validation must check the incoming candidate, not the current global --- +# options validation must check the incoming candidate, not the current global def test_options_validation_not_bypassed_after_value_set(): try: aa.options["verbose"] = True @@ -113,7 +109,7 @@ def test_options_validation_not_bypassed_after_value_set(): aa.options["random_state"] = "off" -# --- A20: name_tmd must be validated like the other name_* options --- +# name_tmd must be validated like the other name_* options def test_options_name_tmd_is_validated(): try: with pytest.raises(ValueError): @@ -123,8 +119,8 @@ def test_options_name_tmd_is_validated(): aa.options["name_tmd"] = "TMD" -# --- A12: disabling then re-enabling allow_multiprocessing must restore (not lose) the -# user's own LOKY_MAX_CPU_COUNT, and never leave it stuck at "1" --- +# disabling then re-enabling allow_multiprocessing must restore (not lose) the +# user's own LOKY_MAX_CPU_COUNT, and never leave it stuck at "1" def test_allow_multiprocessing_restores_user_loky_value(): import os from aaanalysis import config as _cfg @@ -149,7 +145,7 @@ def test_allow_multiprocessing_restores_user_loky_value(): os.environ["LOKY_MAX_CPU_COUNT"] = prev -# --- A12 (cont.): a value the user sets DURING the disabled window must not be clobbered --- +# a value the user sets DURING the disabled window must not be clobbered def test_allow_multiprocessing_reenable_preserves_user_change_during_disable(): import os from aaanalysis import config as _cfg