Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions aaanalysis/_utils/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,10 @@ def _check_marker_size(marker_size: Union[int, float, List[Union[int, float]]] =
if isinstance(marker_size, (int, float)):
check_number_range(name='marker_size', val=marker_size, min_val=0, accept_none=True, just_int=False)
elif isinstance(marker_size, list):
if len(marker_size) != len(list_cat):
raise ValueError(f"Length must match of 'marker_size' (marker_size) and categories ({list_cat}).")
for i in marker_size:
check_number_range(name='marker_size', val=i, min_val=0, accept_none=True, just_int=False)
elif isinstance(marker_size, list) and len(marker_size) != len(list_cat):
raise ValueError(f"Length must match of 'marker_size' (marker_size) and categories ({list_cat}).")
else:
raise ValueError(f"'marker_size' has wrong data type: {type(marker_size)}")
# Create marker_size list
Expand Down
43 changes: 34 additions & 9 deletions aaanalysis/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@
}


# Module-level bookkeeping for the LOKY_MAX_CPU_COUNT cap that check_n_jobs applies while
# options["allow_multiprocessing"] is False, so that re-enabling multiprocessing restores
# the user's prior value instead of losing it.
# True while the cap ("1") written by check_n_jobs is considered to be in place.
_loky_capped_by_options = False
# Value of LOKY_MAX_CPU_COUNT read from os.environ right before the cap was applied
# (None if the variable was unset; in that case it is removed again on restore).
_loky_prev_value = None


# Check system level (option) parameters or depending on parameters
def check_verbose(verbose=None):
"""Check if general verbosity is on or off. Adjusted based on options setting and value provided to object"""
Expand Down Expand Up @@ -67,12 +73,28 @@ def check_n_jobs(n_jobs=None):
global_n_jobs = options["n_jobs"]
if global_n_jobs != "off":
n_jobs = global_n_jobs
global _loky_capped_by_options, _loky_prev_value
allow_multiprocessing = options["allow_multiprocessing"]
check_bool(name="allow_multiprocessing (options)", val=allow_multiprocessing)
# Disable multiprocessing
# Cap loky when multiprocessing is disabled, remembering the user's prior value so it is
# restored (not lost) once multiprocessing is re-enabled on the next parallel-capable call.
if not allow_multiprocessing:
n_jobs = 1
if not _loky_capped_by_options:
_loky_prev_value = os.environ.get('LOKY_MAX_CPU_COUNT')
_loky_capped_by_options = True
os.environ['LOKY_MAX_CPU_COUNT'] = "1"
elif _loky_capped_by_options:
# Only undo our own cap if it is still in place. If the user set their own
# LOKY_MAX_CPU_COUNT (e.g. for another loky/joblib library) while multiprocessing
# was disabled, the value is no longer "1" -> leave it untouched.
if os.environ.get('LOKY_MAX_CPU_COUNT') == "1":
if _loky_prev_value is None:
os.environ.pop('LOKY_MAX_CPU_COUNT', None)
else:
os.environ['LOKY_MAX_CPU_COUNT'] = _loky_prev_value
_loky_capped_by_options = False
_loky_prev_value = None
# Set n_jobs to maximum number of CPUs
if n_jobs == -1:
n_jobs = os.cpu_count()
Expand Down Expand Up @@ -130,10 +152,13 @@ def _check_option(name_option="", option=None):
"""Check if option is valid"""
if name_option == "verbose":
if option != "off":
check_verbose(verbose=option)
# Validate the incoming candidate directly (check_verbose resolves against
# the current global and would skip validating a new value once one is set).
check_bool(name=name_option, val=option)
if name_option == "random_state":
if option != "off":
check_random_state(random_state=option)
check_number_range(name=name_option, val=option, min_val=0,
accept_none=True, just_int=True)
if name_option == "n_jobs":
if option != "off":
# Concrete override: -1 (all cores) or a positive int. None is not a
Expand All @@ -144,12 +169,12 @@ def _check_option(name_option="", option=None):
accept_none=False, just_int=True)
if name_option == "allow_multiprocessing":
check_bool(name=name_option, val=option)
if "jmd" in name_option:
if "len" in name_option:
check_number_range(name=name_option, val=option,
min_val=0, accept_none=True, just_int=True)
if "name" in name_option:
check_str(name=name_option, val=option, accept_none=False)
if "jmd" in name_option and "len" in name_option:
check_number_range(name=name_option, val=option,
min_val=0, accept_none=True, just_int=True)
if "name" in name_option:
# Covers name_tmd, name_jmd_n, name_jmd_c
check_str(name=name_option, val=option, accept_none=False)
if name_option == "ext_len":
check_number_range(name=name_option, val=option, min_val=0, accept_none=False, just_int=True)
if "df" in name_option:
Expand Down
5 changes: 4 additions & 1 deletion aaanalysis/data_handling/_backend/parse_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@ def get_entries_from_fasta(file_path=None, col_id="entry", col_seq="sequence", c
if len(list_info) > 1:
for i in range(1, len(list_info[1:])+1):
dict_current_entry[f'info{i}'] = list_info[i]
else:
elif line:
if not dict_current_entry:
raise ValueError(f"'file_path' ('{file_path}') is not a valid FASTA file: "
f"sequence data appears before the first '>' header.")
dict_current_entry[col_seq] += line
if dict_current_entry:
list_entries.append(dict_current_entry)
Expand Down
6 changes: 4 additions & 2 deletions aaanalysis/data_handling/_load_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def post_check_df_seq(df_seq=None, n=None, name=None) -> None:
f"\nThis maximum value depends on the filtering settings used."
# Validation of sequence and domain datasets
if n is not None and len(df_seq) != n*2:
warnings.warn(warning_message)
warnings.warn(warning_message, UserWarning)


# Helper functions
Expand Down Expand Up @@ -256,7 +256,9 @@ def load_dataset(name: str = "Overview",
# Load overview table
if name == "Overview":
return ut.read_csv_cached(FOLDER_BENCHMARKS + f"Overview.{ut.STR_FILE_TYPE}").copy()
df = ut.read_csv_cached(FOLDER_BENCHMARKS + name + f".{ut.STR_FILE_TYPE}")
# Copy the cached frame (like the Overview branch) so downstream filtering /
# non-canonical-AA substitution can't mutate the shared cache in place
df = ut.read_csv_cached(FOLDER_BENCHMARKS + name + f".{ut.STR_FILE_TYPE}").copy()
# Filter data
if min_len is not None:
n_before = len(df)
Expand Down
3 changes: 2 additions & 1 deletion aaanalysis/data_handling/_load_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,5 @@ def load_features(name: Literal["DOM_GSEC"] = "DOM_GSEC") -> pd.DataFrame:
check_name(name=name)
# Load features
df_feat = ut.read_csv_cached(FOLDER_FEATURES + f"FEATURES_{name}.{ut.STR_FILE_TYPE}")
return df_feat
# Copy the cached frame so a caller's in-place edit can't corrupt the shared cache
return df_feat.copy()
2 changes: 1 addition & 1 deletion aaanalysis/data_handling/_load_scales.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ def load_scales(name: Literal["scales", "scales_raw", "scales_cat", "scales_pc",
# Check input
check_name_of_scale(name=name)
ut.check_bool(name="just_aaindex", val=just_aaindex)
ut.check_bool(name="unclassified_in", val=unclassified_out)
ut.check_bool(name="unclassified_out", val=unclassified_out)
top60_n = check_top60_n(name=name, top60_n=top60_n)
check_top_explain(name=name, top_explain_n=top_explain_n,
top_explain_min_th=top_explain_min_th, top60_n=top60_n)
Expand Down
4 changes: 2 additions & 2 deletions aaanalysis/data_handling/_read_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@ def post_check_unique_entries(list_entries=None, col_id=None) -> None:
if len(list_duplicates) > 0:
str_warning = (f"Entries from '{col_id}' should be unique. "
f"\nFollowing entries are duplicated: {list_duplicates}")
warnings.warn(str_warning)
warnings.warn(str_warning, UserWarning)


def post_check_col_db(df_seq=None, col_db=None, sep="|") -> None:
    """Warn if the database column is missing from 'df_seq'.

    Parameters
    ----------
    df_seq
        Object whose iteration yields the column names (``list(df_seq)``).
    col_db
        Name of the expected database column. ``None`` disables the check.
    sep
        Separator; only echoed in the warning text as a hint for the user.

    Warns
    -----
    UserWarning
        If ``col_db`` is given but not among the columns of ``df_seq``.
    """
    columns = list(df_seq)
    if col_db is None or col_db in columns:
        return
    str_warning = f"'col_db' ('{col_db}') not in 'df_seq'. Check if 'sep' ('{sep}') is matching."
    # Emit the warning exactly once, with an explicit category
    warnings.warn(str_warning, UserWarning)


def _adjust_columns(df_seq=None, col_seq=None, col_id=None, cols_info=None, col_db=None):
Expand Down
2 changes: 1 addition & 1 deletion aaanalysis/feature_engineering/_aaclust.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def check_match_X_n_clusters(X=None, n_clusters=None, accept_none=True) -> None:
if n_samples < n_clusters:
raise ValueError(f"n_samples={n_samples} (in 'X') should be >= 'n_clusters' ({n_clusters})")
if n_unique_samples < n_clusters:
raise ValueError(f"'n_clusters' ({n_clusters}) should be >= n_unique_samples={n_unique_samples} (in 'X').")
raise ValueError(f"'n_clusters' ({n_clusters}) should be <= n_unique_samples={n_unique_samples} (in 'X').")


def check_match_df_seq_X(df_seq=None, X=None) -> None:
Expand Down
2 changes: 1 addition & 1 deletion aaanalysis/feature_engineering/_backend/check_aaclust.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
def check_metric(metric=None):
    """Check that 'metric' is one of the entries of ``ut.LIST_METRICS``.

    Raises
    ------
    ValueError
        If ``metric`` is not contained in ``ut.LIST_METRICS``.
    """
    if metric not in ut.LIST_METRICS:
        # Single message assignment (the previously overwritten duplicate was a dead store)
        error = f"'metric' should be one of following: {ut.LIST_METRICS}"
        raise ValueError(error)


3 changes: 2 additions & 1 deletion aaanalysis/feature_engineering/_backend/check_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,8 @@ def check_match_df_parts_df_scales(df_parts=None, df_scales=None, accept_gaps=Fa
char_scales.append(ut.STR_AA_GAP)
missing_char = [x for x in char_parts if x not in char_scales]
# Replace gaps by default amino acid gap
if accept_gaps:
if accept_gaps and missing_char:
df_parts = df_parts.copy() # copy so we don't mutate the caller's df_parts in place
for col in list(df_parts):
for mc in missing_char:
df_parts[col] = df_parts[col].str.replace(mc, ut.STR_AA_GAP)
Expand Down
3 changes: 2 additions & 1 deletion aaanalysis/feature_engineering/_backend/cpp/cpp_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ def get_min_cor(X, labels=None):
def get_best_n_clusters(X=None, min_th=0.3, random_state=None):
"""Obtain the best number of clusters based on internal cluster minimum correlation."""
n_features, _ = X.shape
max_n = min([100, n_features-1])
# Clamp to >= 1 so a single-feature set does not fall through to KMeans(n_clusters=0)
max_n = max(1, min(100, n_features - 1))
best_n_clusters = max_n
for n_clusters in range(2, max_n):
kmeans = KMeans(n_clusters=n_clusters, n_init="auto", random_state=random_state)
Expand Down
4 changes: 2 additions & 2 deletions aaanalysis/plotting/_plot_legend.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,10 +153,10 @@ def plot_legend(ax: Optional[Axes] = None,
ut.check_number_val(name="lw", val=lw, accept_none=True, just_int=False)
args_non_neg = {"labelspacing": labelspacing, "columnspacing": columnspacing,
"handletextpad": handletextpad, "handlelength": handlelength,
"fontsize": fontsize, "fontsize_legend": fontsize_title}
"fontsize": fontsize, "fontsize_title": fontsize_title}
for key in args_non_neg:
ut.check_number_range(name=key, val=args_non_neg[key], min_val=0, accept_none=True, just_int=False)
ut.check_bool(name="add_legend", val=keep_legend, accept_none=False)
ut.check_bool(name="keep_legend", val=keep_legend, accept_none=False)
# Create new legend
ax = ut.plot_legend_(ax=ax, dict_color=dict_color, list_cat=list_cat, labels=labels,
loc=loc, loc_out=loc_out, y=y, x=x, n_cols=n_cols,
Expand Down
8 changes: 6 additions & 2 deletions aaanalysis/protein_engineering/_backend/seqmut/seqmut.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def comp_scan_scores(dX=None, mean_dif=None, weight_vec=None):


def build_scan_output(df_plan=None, delta_cpp=None, shift_score=None,
delta_pred=None, wt_pred=None, wt_pred_std=None):
delta_pred=None, wt_pred=None, wt_pred_std=None, sort=True):
"""Assemble the tidy scan output DataFrame, sorted by descending |ΔCPP|.

When ``delta_pred`` is given (a model is bound), the model prediction-shift columns
Expand All @@ -209,7 +209,11 @@ def build_scan_output(df_plan=None, delta_cpp=None, shift_score=None,
df_out[ut.COL_WT_PRED_STD] = wt_pred_std
cols = cols + [ut.COL_DELTA_PRED, ut.COL_WT_PRED, ut.COL_WT_PRED_STD]
df_out = df_out[cols]
df_out = df_out.sort_values(ut.COL_DELTA_CPP, ascending=False).reset_index(drop=True)
if sort:
df_out = df_out.sort_values(ut.COL_DELTA_CPP, ascending=False).reset_index(drop=True)
else:
# Keep df_plan row order so callers can align results positionally (no re-join).
df_out = df_out.reset_index(drop=True)
return df_out


Expand Down
19 changes: 10 additions & 9 deletions aaanalysis/protein_engineering/_seqmut.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def __init__(self,

# Helper methods
def _delta_table(self, df_plan=None, df_seq=None, df_feat=None, jmd_n_len=10, jmd_c_len=10,
weight=None):
weight=None, sort=True):
"""Run the ΔCPP (+ model ΔP) engine for a mutation plan and return the scored output."""
features = list(df_feat[ut.COL_FEATURE])
mean_dif = df_feat[ut.COL_MEAN_DIF].to_numpy(dtype=float)
Expand All @@ -201,8 +201,9 @@ def _delta_table(self, df_plan=None, df_seq=None, df_feat=None, jmd_n_len=10, jm
X_wt=X_wt, X_mut=X_mut, wt_rows=wt_rows, model=self._model,
target_class=self._target_class)
return build_scan_output(df_plan=df_plan, delta_cpp=delta_cpp, shift_score=shift_score,
delta_pred=delta_pred, wt_pred=wt_pred, wt_pred_std=wt_pred_std)
return build_scan_output(df_plan=df_plan, delta_cpp=delta_cpp, shift_score=shift_score)
delta_pred=delta_pred, wt_pred=wt_pred, wt_pred_std=wt_pred_std,
sort=sort)
return build_scan_output(df_plan=df_plan, delta_cpp=delta_cpp, shift_score=shift_score, sort=sort)

# Main methods
def mutate(self,
Expand Down Expand Up @@ -279,14 +280,14 @@ def mutate(self,
for p, ts, te in zip(df_plan[ut.COL_POS],
df_plan[ut.COL_TMD_START],
df_plan[ut.COL_TMD_STOP])]
# Score in mutation order (sort=False) so results align row-for-row with df_mut;
# no label re-join, so duplicate mutation rows can never desync or crash.
df_scored = self._delta_table(df_plan=df_plan, df_seq=df_seq, df_feat=df_feat,
jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len)
df_scored = df_scored.set_index(ut.COL_MUTATION)
df_mut[ut.COL_DELTA_CPP] = df_scored.loc[df_mut[ut.COL_MUTATION], ut.COL_DELTA_CPP].to_numpy()
df_mut[ut.COL_SHIFT_SCORE] = df_scored.loc[df_mut[ut.COL_MUTATION], ut.COL_SHIFT_SCORE].to_numpy()
jmd_n_len=jmd_n_len, jmd_c_len=jmd_c_len, sort=False)
df_mut[ut.COL_DELTA_CPP] = df_scored[ut.COL_DELTA_CPP].to_numpy()
df_mut[ut.COL_SHIFT_SCORE] = df_scored[ut.COL_SHIFT_SCORE].to_numpy()
if self._model is not None:
df_mut[ut.COL_DELTA_PRED] = df_scored.loc[df_mut[ut.COL_MUTATION],
ut.COL_DELTA_PRED].to_numpy()
df_mut[ut.COL_DELTA_PRED] = df_scored[ut.COL_DELTA_PRED].to_numpy()
return df_mut

def scan(self,
Expand Down
4 changes: 2 additions & 2 deletions aaanalysis/seq_analysis_pro/_backend/comp_seq_sim.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ def comp_pw_seq_sim_(df_seq=None):
sim_score = comp_seq_sim_(seq1=seq1, seq2=seq2)
df_pw_sim.at[id1, id2] = sim_score
df_pw_sim.at[id2, id1] = sim_score
# Fill diagonal with 1s for self-similarity
# Fill diagonal with 100 for self-similarity (matches the [0, 100] scale of off-diagonal cells)
arr = df_pw_sim.to_numpy(copy=True)
np.fill_diagonal(arr, 1)
np.fill_diagonal(arr, 100)
df_pw_sim.iloc[:, :] = arr
df_pw_sim = df_pw_sim.round(4)
return df_pw_sim
Expand Down
2 changes: 1 addition & 1 deletion aaanalysis/seq_analysis_pro/_comp_seq_sim.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def comp_seq_sim(seq1: Optional[str] = None,
ut.check_df(name="df_seq", df=df_seq, accept_none=False, accept_nan=False,
cols_required=[ut.COL_SEQ, ut.COL_ENTRY])
for entry, seq in zip(df_seq[ut.COL_ENTRY], df_seq[ut.COL_SEQ]):
ut.check_str(name=f"sequence ({entry}", val=seq, accept_none=False)
ut.check_str(name=f"sequence ({entry})", val=seq, accept_none=False)
if df_seq is None:
# Compute similarity
seq_sim = comp_seq_sim_(seq1=seq1, seq2=seq2)
Expand Down
2 changes: 1 addition & 1 deletion aaanalysis/show_html/_display_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def _check_show(name="row_to_show", val=None, df=None):
if val not in rows_or_columns:
raise ValueError(f"'{name}' ('{val}') should be one of: {rows_or_columns}")
elif isinstance(val, int):
ut.check_number_range(name=name, val=val, accept_none=True, min_val=0, max_val=n, just_int=True)
ut.check_number_range(name=name, val=val, accept_none=True, min_val=0, max_val=n - 1, just_int=True)
else:
raise ValueError(f"'{name}' ('{val}') should be int (<{n}) or one of following {str_row_or_column} names: {rows_or_columns}")

Expand Down
2 changes: 1 addition & 1 deletion tests/unit/aaclust_tests/test_aac_branch.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def test_n_clusters_exceeds_n_unique_samples(self):
X = np.array([[1.0, 2.0], [1.0, 2.0], [3.0, 4.0],
[5.0, 6.0], [7.0, 8.0]])
aac = aa.AAclust(verbose=False, random_state=42)
with pytest.raises(ValueError, match="should be >= n_unique_samples"):
with pytest.raises(ValueError, match="should be <= n_unique_samples"):
aac.fit(X, n_clusters=5)

@given(n_clusters=some.integers(min_value=2, max_value=6))
Expand Down
8 changes: 8 additions & 0 deletions tests/unit/protein_engineering_tests/test_seqmut_mutate.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,14 @@ def test_without_df_feat_no_delta(self, df_seq_pos):
df = aa.SeqMut().mutate(df_seq=df_seq_pos, mutations=_muts())
assert ut.COL_DELTA_CPP not in df.columns

def test_with_df_feat_duplicate_rows_no_crash(self, df_seq_pos, df_feat):
    # The same (entry, pos, to_aa) triple is requested twice, so both rows carry an
    # identical mutation label; scoring must stay aligned row-for-row and must not
    # fail on the non-unique label.
    duplicated = _muts(("P1", "P1"), (12, 12), ("K", "K"))
    seqmut = aa.SeqMut()
    df_result = seqmut.mutate(df_seq=df_seq_pos, mutations=duplicated, df_feat=df_feat)
    assert len(df_result) == 2
    # Identical mutations must yield (numerically) identical ΔCPP values
    delta_cpp = df_result[ut.COL_DELTA_CPP]
    first_value = delta_cpp.iloc[0]
    second_value = delta_cpp.iloc[1]
    assert first_value == pytest.approx(second_value, abs=1e-12)

def test_jmd_n_len(self, df_seq_pos, df_feat):
df = aa.SeqMut().mutate(df_seq=df_seq_pos, mutations=_muts(("P1",), (12,), ("K",)),
df_feat=df_feat, jmd_n_len=8)
Expand Down
Loading
Loading